1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
 101    ADDRESS_SYMBOLIC
 102      A constant symbolic address, in the pc-relative literal pool.  */
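/* Illustrative examples of the classes above (a sketch added here, not
   part of the original documentation; the concrete registers, offsets
   and symbols are arbitrary):

     ADDRESS_REG_IMM    ldr  x0, [x1, #16]
     ADDRESS_REG_WB     ldr  x0, [x1, #16]!    or    ldr  x0, [x1], #16
     ADDRESS_REG_REG    ldr  x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr  x0, [x1, w2, uxtw #2]
     ADDRESS_REG_SXTW   ldr  x0, [x1, w2, sxtw #2]
     ADDRESS_LO_SUM     ldr  x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr  x0, .Lliteral_pool_entry  */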
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
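/* A worked example (illustrative only, not from the original source):
   the Advanced SIMD constant { 0x00FF0000, 0x00FF0000, 0x00FF0000,
   0x00FF0000 } in V4SImode could be described as

     simd_immediate_info (SImode, 0xff, simd_immediate_info::MOV,
                          simd_immediate_info::LSL, 16);

   i.e. a "movi v0.4s, #0xff, lsl #16", while an SVE series constant
   such as { 0, 1, 2, ... } would use the (mode, value, step)
   constructor with value 0 and step 1.  */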
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Global flag for whether frame pointer is enabled. */
224 bool aarch64_use_frame_pointer;
226 /* Support for command line parsing of boolean flags in the tuning
227 structures. */
228 struct aarch64_flag_desc
230 const char* name;
231 unsigned int flag;
234 #define AARCH64_FUSION_PAIR(name, internal_name) \
235 { name, AARCH64_FUSE_##internal_name },
236 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 { "none", AARCH64_FUSE_NOTHING },
239 #include "aarch64-fusion-pairs.def"
240 { "all", AARCH64_FUSE_ALL },
241 { NULL, AARCH64_FUSE_NOTHING }
244 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
245 { name, AARCH64_EXTRA_TUNE_##internal_name },
246 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 { "none", AARCH64_EXTRA_TUNE_NONE },
249 #include "aarch64-tuning-flags.def"
250 { "all", AARCH64_EXTRA_TUNE_ALL },
251 { NULL, AARCH64_EXTRA_TUNE_NONE }
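/* These tables back the -moverride string parsing (see
   aarch64_tuning_override_functions further below).  As an
   illustration only (exact spellings of the individual entries come
   from the .def files included above), a command line along the lines
   of

     gcc -mcpu=cortex-a57 -moverride=fuse=all

   or -moverride=tune=none is resolved against the "name" strings in
   these tables.  */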
254 /* Tuning parameters. */
256 static const struct cpu_addrcost_table generic_addrcost_table =
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
264 0, /* pre_modify */
265 0, /* post_modify */
266 0, /* register_offset */
267 0, /* register_sextend */
268 0, /* register_zextend */
269 0 /* imm_offset */
272 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 0, /* hi */
276 0, /* si */
277 0, /* di */
278 2, /* ti */
280 0, /* pre_modify */
281 0, /* post_modify */
282 1, /* register_offset */
283 1, /* register_sextend */
284 2, /* register_zextend */
285 0, /* imm_offset */
288 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
296 1, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
304 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
312 0, /* pre_modify */
313 0, /* post_modify */
314 2, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 0, /* imm_offset */
320 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
323 1, /* hi */
324 1, /* si */
325 1, /* di */
326 2, /* ti */
328 1, /* pre_modify */
329 1, /* post_modify */
330 3, /* register_offset */
331 4, /* register_sextend */
332 3, /* register_zextend */
333 2, /* imm_offset */
336 static const struct cpu_regmove_cost generic_regmove_cost =
338 1, /* GP2GP */
339 /* Avoid the use of slow int<->fp moves for spilling by setting
340 their cost higher than memmov_cost. */
341 5, /* GP2FP */
342 5, /* FP2GP */
343 2 /* FP2FP */
346 static const struct cpu_regmove_cost cortexa57_regmove_cost =
348 1, /* GP2GP */
349 /* Avoid the use of slow int<->fp moves for spilling by setting
350 their cost higher than memmov_cost. */
351 5, /* GP2FP */
352 5, /* FP2GP */
353 2 /* FP2FP */
356 static const struct cpu_regmove_cost cortexa53_regmove_cost =
358 1, /* GP2GP */
359 /* Avoid the use of slow int<->fp moves for spilling by setting
360 their cost higher than memmov_cost. */
361 5, /* GP2FP */
362 5, /* FP2GP */
363 2 /* FP2FP */
366 static const struct cpu_regmove_cost exynosm1_regmove_cost =
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
 370      their cost higher than memmov_cost (actually 4 and 9).  */
371 9, /* GP2FP */
372 9, /* FP2GP */
373 1 /* FP2FP */
376 static const struct cpu_regmove_cost thunderx_regmove_cost =
378 2, /* GP2GP */
379 2, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost xgene1_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 2 /* FP2FP */
394 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
396 2, /* GP2GP */
397 /* Avoid the use of int<->fp moves for spilling. */
398 6, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
405 1, /* GP2GP */
406 /* Avoid the use of int<->fp moves for spilling. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 4 /* FP2FP */
412 /* Generic costs for vector insn classes. */
413 static const struct cpu_vector_cost generic_vector_cost =
415 1, /* scalar_int_stmt_cost */
416 1, /* scalar_fp_stmt_cost */
417 1, /* scalar_load_cost */
418 1, /* scalar_store_cost */
419 1, /* vec_int_stmt_cost */
420 1, /* vec_fp_stmt_cost */
421 2, /* vec_permute_cost */
422 1, /* vec_to_scalar_cost */
423 1, /* scalar_to_vec_cost */
424 1, /* vec_align_load_cost */
425 1, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 3, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
432 /* ThunderX costs for vector insn classes. */
433 static const struct cpu_vector_cost thunderx_vector_cost =
435 1, /* scalar_int_stmt_cost */
436 1, /* scalar_fp_stmt_cost */
437 3, /* scalar_load_cost */
438 1, /* scalar_store_cost */
439 4, /* vec_int_stmt_cost */
440 1, /* vec_fp_stmt_cost */
441 4, /* vec_permute_cost */
442 2, /* vec_to_scalar_cost */
443 2, /* scalar_to_vec_cost */
444 3, /* vec_align_load_cost */
445 5, /* vec_unalign_load_cost */
446 5, /* vec_unalign_store_cost */
447 1, /* vec_store_cost */
448 3, /* cond_taken_branch_cost */
449 3 /* cond_not_taken_branch_cost */
 452 /* Cortex-A57 costs for vector insn classes.  */
453 static const struct cpu_vector_cost cortexa57_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 4, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 2, /* vec_int_stmt_cost */
460 2, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 8, /* vec_to_scalar_cost */
463 8, /* scalar_to_vec_cost */
464 4, /* vec_align_load_cost */
465 4, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 static const struct cpu_vector_cost exynosm1_vector_cost =
474 1, /* scalar_int_stmt_cost */
475 1, /* scalar_fp_stmt_cost */
476 5, /* scalar_load_cost */
477 1, /* scalar_store_cost */
478 3, /* vec_int_stmt_cost */
479 3, /* vec_fp_stmt_cost */
480 3, /* vec_permute_cost */
481 3, /* vec_to_scalar_cost */
482 3, /* scalar_to_vec_cost */
483 5, /* vec_align_load_cost */
484 5, /* vec_unalign_load_cost */
485 1, /* vec_unalign_store_cost */
486 1, /* vec_store_cost */
487 1, /* cond_taken_branch_cost */
488 1 /* cond_not_taken_branch_cost */
 491 /* X-Gene 1 costs for vector insn classes.  */
492 static const struct cpu_vector_cost xgene1_vector_cost =
494 1, /* scalar_int_stmt_cost */
495 1, /* scalar_fp_stmt_cost */
496 5, /* scalar_load_cost */
497 1, /* scalar_store_cost */
498 2, /* vec_int_stmt_cost */
499 2, /* vec_fp_stmt_cost */
500 2, /* vec_permute_cost */
501 4, /* vec_to_scalar_cost */
502 4, /* scalar_to_vec_cost */
503 10, /* vec_align_load_cost */
504 10, /* vec_unalign_load_cost */
505 2, /* vec_unalign_store_cost */
506 2, /* vec_store_cost */
507 2, /* cond_taken_branch_cost */
508 1 /* cond_not_taken_branch_cost */
511 /* Costs for vector insn classes for Vulcan. */
512 static const struct cpu_vector_cost thunderx2t99_vector_cost =
514 1, /* scalar_int_stmt_cost */
515 6, /* scalar_fp_stmt_cost */
516 4, /* scalar_load_cost */
517 1, /* scalar_store_cost */
518 5, /* vec_int_stmt_cost */
519 6, /* vec_fp_stmt_cost */
520 3, /* vec_permute_cost */
521 6, /* vec_to_scalar_cost */
522 5, /* scalar_to_vec_cost */
523 8, /* vec_align_load_cost */
524 8, /* vec_unalign_load_cost */
525 4, /* vec_unalign_store_cost */
526 4, /* vec_store_cost */
527 2, /* cond_taken_branch_cost */
528 1 /* cond_not_taken_branch_cost */
531 /* Generic costs for branch instructions. */
532 static const struct cpu_branch_cost generic_branch_cost =
534 1, /* Predictable. */
535 3 /* Unpredictable. */
538 /* Generic approximation modes. */
539 static const cpu_approx_modes generic_approx_modes =
541 AARCH64_APPROX_NONE, /* division */
542 AARCH64_APPROX_NONE, /* sqrt */
543 AARCH64_APPROX_NONE /* recip_sqrt */
546 /* Approximation modes for Exynos M1. */
547 static const cpu_approx_modes exynosm1_approx_modes =
549 AARCH64_APPROX_NONE, /* division */
550 AARCH64_APPROX_ALL, /* sqrt */
551 AARCH64_APPROX_ALL /* recip_sqrt */
554 /* Approximation modes for X-Gene 1. */
555 static const cpu_approx_modes xgene1_approx_modes =
557 AARCH64_APPROX_NONE, /* division */
558 AARCH64_APPROX_NONE, /* sqrt */
559 AARCH64_APPROX_ALL /* recip_sqrt */
562 /* Generic prefetch settings (which disable prefetch). */
563 static const cpu_prefetch_tune generic_prefetch_tune =
565 0, /* num_slots */
566 -1, /* l1_cache_size */
567 -1, /* l1_cache_line_size */
568 -1, /* l2_cache_size */
569 true, /* prefetch_dynamic_strides */
570 -1, /* minimum_stride */
571 -1 /* default_opt_level */
574 static const cpu_prefetch_tune exynosm1_prefetch_tune =
576 0, /* num_slots */
577 -1, /* l1_cache_size */
578 64, /* l1_cache_line_size */
579 -1, /* l2_cache_size */
580 true, /* prefetch_dynamic_strides */
581 -1, /* minimum_stride */
582 -1 /* default_opt_level */
585 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
587 4, /* num_slots */
588 32, /* l1_cache_size */
589 64, /* l1_cache_line_size */
590 512, /* l2_cache_size */
591 false, /* prefetch_dynamic_strides */
592 2048, /* minimum_stride */
593 3 /* default_opt_level */
596 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
598 8, /* num_slots */
599 32, /* l1_cache_size */
600 128, /* l1_cache_line_size */
601 16*1024, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 3 /* default_opt_level */
607 static const cpu_prefetch_tune thunderx_prefetch_tune =
609 8, /* num_slots */
610 32, /* l1_cache_size */
611 128, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
618 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
620 8, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 256, /* l2_cache_size */
624 true, /* prefetch_dynamic_strides */
625 -1, /* minimum_stride */
626 -1 /* default_opt_level */
629 static const struct tune_params generic_tunings =
631 &cortexa57_extra_costs,
632 &generic_addrcost_table,
633 &generic_regmove_cost,
634 &generic_vector_cost,
635 &generic_branch_cost,
636 &generic_approx_modes,
637 4, /* memmov_cost */
638 2, /* issue_rate */
639 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
640 "8", /* function_align. */
641 "4", /* jump_align. */
642 "8", /* loop_align. */
643 2, /* int_reassoc_width. */
644 4, /* fp_reassoc_width. */
645 1, /* vec_reassoc_width. */
646 2, /* min_div_recip_mul_sf. */
647 2, /* min_div_recip_mul_df. */
648 0, /* max_case_values. */
649 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
650 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
651 &generic_prefetch_tune
654 static const struct tune_params cortexa35_tunings =
656 &cortexa53_extra_costs,
657 &generic_addrcost_table,
658 &cortexa53_regmove_cost,
659 &generic_vector_cost,
660 &generic_branch_cost,
661 &generic_approx_modes,
662 4, /* memmov_cost */
663 1, /* issue_rate */
664 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
665 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
666 "16", /* function_align. */
667 "4", /* jump_align. */
668 "8", /* loop_align. */
669 2, /* int_reassoc_width. */
670 4, /* fp_reassoc_width. */
671 1, /* vec_reassoc_width. */
672 2, /* min_div_recip_mul_sf. */
673 2, /* min_div_recip_mul_df. */
674 0, /* max_case_values. */
675 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
676 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
677 &generic_prefetch_tune
680 static const struct tune_params cortexa53_tunings =
682 &cortexa53_extra_costs,
683 &generic_addrcost_table,
684 &cortexa53_regmove_cost,
685 &generic_vector_cost,
686 &generic_branch_cost,
687 &generic_approx_modes,
688 4, /* memmov_cost */
689 2, /* issue_rate */
690 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
691 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
692 "16", /* function_align. */
693 "4", /* jump_align. */
694 "8", /* loop_align. */
695 2, /* int_reassoc_width. */
696 4, /* fp_reassoc_width. */
697 1, /* vec_reassoc_width. */
698 2, /* min_div_recip_mul_sf. */
699 2, /* min_div_recip_mul_df. */
700 0, /* max_case_values. */
701 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
702 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
703 &generic_prefetch_tune
706 static const struct tune_params cortexa57_tunings =
708 &cortexa57_extra_costs,
709 &generic_addrcost_table,
710 &cortexa57_regmove_cost,
711 &cortexa57_vector_cost,
712 &generic_branch_cost,
713 &generic_approx_modes,
714 4, /* memmov_cost */
715 3, /* issue_rate */
716 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
717 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
718 "16", /* function_align. */
719 "4", /* jump_align. */
720 "8", /* loop_align. */
721 2, /* int_reassoc_width. */
722 4, /* fp_reassoc_width. */
723 1, /* vec_reassoc_width. */
724 2, /* min_div_recip_mul_sf. */
725 2, /* min_div_recip_mul_df. */
726 0, /* max_case_values. */
727 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
728 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
729 &generic_prefetch_tune
732 static const struct tune_params cortexa72_tunings =
734 &cortexa57_extra_costs,
735 &generic_addrcost_table,
736 &cortexa57_regmove_cost,
737 &cortexa57_vector_cost,
738 &generic_branch_cost,
739 &generic_approx_modes,
740 4, /* memmov_cost */
741 3, /* issue_rate */
742 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
743 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
744 "16", /* function_align. */
745 "4", /* jump_align. */
746 "8", /* loop_align. */
747 2, /* int_reassoc_width. */
748 4, /* fp_reassoc_width. */
749 1, /* vec_reassoc_width. */
750 2, /* min_div_recip_mul_sf. */
751 2, /* min_div_recip_mul_df. */
752 0, /* max_case_values. */
753 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
754 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
755 &generic_prefetch_tune
758 static const struct tune_params cortexa73_tunings =
760 &cortexa57_extra_costs,
761 &generic_addrcost_table,
762 &cortexa57_regmove_cost,
763 &cortexa57_vector_cost,
764 &generic_branch_cost,
765 &generic_approx_modes,
766 4, /* memmov_cost. */
767 2, /* issue_rate. */
768 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
769 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
770 "16", /* function_align. */
771 "4", /* jump_align. */
772 "8", /* loop_align. */
773 2, /* int_reassoc_width. */
774 4, /* fp_reassoc_width. */
775 1, /* vec_reassoc_width. */
776 2, /* min_div_recip_mul_sf. */
777 2, /* min_div_recip_mul_df. */
778 0, /* max_case_values. */
779 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
780 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
781 &generic_prefetch_tune
786 static const struct tune_params exynosm1_tunings =
788 &exynosm1_extra_costs,
789 &exynosm1_addrcost_table,
790 &exynosm1_regmove_cost,
791 &exynosm1_vector_cost,
792 &generic_branch_cost,
793 &exynosm1_approx_modes,
794 4, /* memmov_cost */
795 3, /* issue_rate */
796 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
797 "4", /* function_align. */
798 "4", /* jump_align. */
799 "4", /* loop_align. */
800 2, /* int_reassoc_width. */
801 4, /* fp_reassoc_width. */
802 1, /* vec_reassoc_width. */
803 2, /* min_div_recip_mul_sf. */
804 2, /* min_div_recip_mul_df. */
805 48, /* max_case_values. */
806 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
808 &exynosm1_prefetch_tune
811 static const struct tune_params thunderxt88_tunings =
813 &thunderx_extra_costs,
814 &generic_addrcost_table,
815 &thunderx_regmove_cost,
816 &thunderx_vector_cost,
817 &generic_branch_cost,
818 &generic_approx_modes,
819 6, /* memmov_cost */
820 2, /* issue_rate */
821 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
822 "8", /* function_align. */
823 "8", /* jump_align. */
824 "8", /* loop_align. */
825 2, /* int_reassoc_width. */
826 4, /* fp_reassoc_width. */
827 1, /* vec_reassoc_width. */
828 2, /* min_div_recip_mul_sf. */
829 2, /* min_div_recip_mul_df. */
830 0, /* max_case_values. */
831 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
832 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
833 &thunderxt88_prefetch_tune
836 static const struct tune_params thunderx_tunings =
838 &thunderx_extra_costs,
839 &generic_addrcost_table,
840 &thunderx_regmove_cost,
841 &thunderx_vector_cost,
842 &generic_branch_cost,
843 &generic_approx_modes,
844 6, /* memmov_cost */
845 2, /* issue_rate */
846 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
847 "8", /* function_align. */
848 "8", /* jump_align. */
849 "8", /* loop_align. */
850 2, /* int_reassoc_width. */
851 4, /* fp_reassoc_width. */
852 1, /* vec_reassoc_width. */
853 2, /* min_div_recip_mul_sf. */
854 2, /* min_div_recip_mul_df. */
855 0, /* max_case_values. */
856 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
857 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
858 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
859 &thunderx_prefetch_tune
862 static const struct tune_params xgene1_tunings =
864 &xgene1_extra_costs,
865 &xgene1_addrcost_table,
866 &xgene1_regmove_cost,
867 &xgene1_vector_cost,
868 &generic_branch_cost,
869 &xgene1_approx_modes,
870 6, /* memmov_cost */
871 4, /* issue_rate */
872 AARCH64_FUSE_NOTHING, /* fusible_ops */
873 "16", /* function_align. */
874 "8", /* jump_align. */
875 "16", /* loop_align. */
876 2, /* int_reassoc_width. */
877 4, /* fp_reassoc_width. */
878 1, /* vec_reassoc_width. */
879 2, /* min_div_recip_mul_sf. */
880 2, /* min_div_recip_mul_df. */
881 0, /* max_case_values. */
882 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
883 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
884 &generic_prefetch_tune
887 static const struct tune_params qdf24xx_tunings =
889 &qdf24xx_extra_costs,
890 &qdf24xx_addrcost_table,
891 &qdf24xx_regmove_cost,
892 &generic_vector_cost,
893 &generic_branch_cost,
894 &generic_approx_modes,
895 4, /* memmov_cost */
896 4, /* issue_rate */
897 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 898    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
899 "16", /* function_align. */
900 "8", /* jump_align. */
901 "16", /* loop_align. */
902 2, /* int_reassoc_width. */
903 4, /* fp_reassoc_width. */
904 1, /* vec_reassoc_width. */
905 2, /* min_div_recip_mul_sf. */
906 2, /* min_div_recip_mul_df. */
907 0, /* max_case_values. */
908 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
909 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
910 &qdf24xx_prefetch_tune
913 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
914 for now. */
915 static const struct tune_params saphira_tunings =
917 &generic_extra_costs,
918 &generic_addrcost_table,
919 &generic_regmove_cost,
920 &generic_vector_cost,
921 &generic_branch_cost,
922 &generic_approx_modes,
923 4, /* memmov_cost */
924 4, /* issue_rate */
925 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 926    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
927 "16", /* function_align. */
928 "8", /* jump_align. */
929 "16", /* loop_align. */
930 2, /* int_reassoc_width. */
931 4, /* fp_reassoc_width. */
932 1, /* vec_reassoc_width. */
933 2, /* min_div_recip_mul_sf. */
934 2, /* min_div_recip_mul_df. */
935 0, /* max_case_values. */
936 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
937 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
938 &generic_prefetch_tune
941 static const struct tune_params thunderx2t99_tunings =
943 &thunderx2t99_extra_costs,
944 &thunderx2t99_addrcost_table,
945 &thunderx2t99_regmove_cost,
946 &thunderx2t99_vector_cost,
947 &generic_branch_cost,
948 &generic_approx_modes,
949 4, /* memmov_cost. */
950 4, /* issue_rate. */
951 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
952 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
953 "16", /* function_align. */
954 "8", /* jump_align. */
955 "16", /* loop_align. */
956 3, /* int_reassoc_width. */
957 2, /* fp_reassoc_width. */
958 2, /* vec_reassoc_width. */
959 2, /* min_div_recip_mul_sf. */
960 2, /* min_div_recip_mul_df. */
961 0, /* max_case_values. */
962 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
963 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
964 &thunderx2t99_prefetch_tune
967 /* Support for fine-grained override of the tuning structures. */
968 struct aarch64_tuning_override_function
970 const char* name;
971 void (*parse_override)(const char*, struct tune_params*);
974 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
975 static void aarch64_parse_tune_string (const char*, struct tune_params*);
977 static const struct aarch64_tuning_override_function
978 aarch64_tuning_override_functions[] =
980 { "fuse", aarch64_parse_fuse_string },
981 { "tune", aarch64_parse_tune_string },
982 { NULL, NULL }
985 /* A processor implementing AArch64. */
986 struct processor
988 const char *const name;
989 enum aarch64_processor ident;
990 enum aarch64_processor sched_core;
991 enum aarch64_arch arch;
992 unsigned architecture_version;
993 const unsigned long flags;
994 const struct tune_params *const tune;
997 /* Architectures implementing AArch64. */
998 static const struct processor all_architectures[] =
1000 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1001 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1002 #include "aarch64-arches.def"
1003 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1006 /* Processor cores implementing AArch64. */
1007 static const struct processor all_cores[] =
1009 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1010 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1011 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1012 FLAGS, &COSTS##_tunings},
1013 #include "aarch64-cores.def"
1014 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1015 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1016 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1020 /* Target specification. These are populated by the -march, -mtune, -mcpu
1021 handling code or by target attributes. */
1022 static const struct processor *selected_arch;
1023 static const struct processor *selected_cpu;
1024 static const struct processor *selected_tune;
1026 /* The current tuning set. */
1027 struct tune_params aarch64_tune_params = generic_tunings;
1029 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1031 /* An ISA extension in the co-processor and main instruction set space. */
1032 struct aarch64_option_extension
1034 const char *const name;
1035 const unsigned long flags_on;
1036 const unsigned long flags_off;
1039 typedef enum aarch64_cond_code
1041 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1042 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1043 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1045 aarch64_cc;
1047 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1049 /* The condition codes of the processor, and the inverse function. */
1050 static const char * const aarch64_condition_codes[] =
1052 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1053 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1056 /* Generate code to enable conditional branches in functions over 1 MiB. */
1057 const char *
1058 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1059 const char * branch_format)
1061 rtx_code_label * tmp_label = gen_label_rtx ();
1062 char label_buf[256];
1063 char buffer[128];
1064 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1065 CODE_LABEL_NUMBER (tmp_label));
1066 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1067 rtx dest_label = operands[pos_label];
1068 operands[pos_label] = tmp_label;
1070 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1071 output_asm_insn (buffer, operands);
1073 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1074 operands[pos_label] = dest_label;
1075 output_asm_insn (buffer, operands);
1076 return "";
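/* For example (an illustrative sketch, not from the original source):
   if a "cbz x0, <target>" would overflow its +/-1 MiB range, the
   caller passes the inverted test as BRANCH_FORMAT and this routine
   emits something like

     cbnz    x0, .Ltmp
     b       <target>
   .Ltmp:

   where .Ltmp is the freshly generated local label.  */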
1079 void
1080 aarch64_err_no_fpadvsimd (machine_mode mode)
1082 if (TARGET_GENERAL_REGS_ONLY)
1083 if (FLOAT_MODE_P (mode))
1084 error ("%qs is incompatible with the use of floating-point types",
1085 "-mgeneral-regs-only");
1086 else
1087 error ("%qs is incompatible with the use of vector types",
1088 "-mgeneral-regs-only");
1089 else
1090 if (FLOAT_MODE_P (mode))
1091 error ("%qs feature modifier is incompatible with the use of"
1092 " floating-point types", "+nofp");
1093 else
1094 error ("%qs feature modifier is incompatible with the use of"
1095 " vector types", "+nofp");
1098 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1099 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1100 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1101 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1102 and GENERAL_REGS is lower than the memory cost (in this case the best class
1104    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1104 cost results in bad allocations with many redundant int<->FP moves which
1105 are expensive on various cores.
1106 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1107 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1108 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1109 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1110 The result of this is that it is no longer inefficient to have a higher
1111 memory move cost than the register move cost.
1114 static reg_class_t
1115 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1116 reg_class_t best_class)
1118 machine_mode mode;
1120 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1121 || !reg_class_subset_p (FP_REGS, allocno_class))
1122 return allocno_class;
1124 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1125 || !reg_class_subset_p (FP_REGS, best_class))
1126 return best_class;
1128 mode = PSEUDO_REGNO_MODE (regno);
1129 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
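/* For example (illustrative, not from the original source): a pseudo
   of DFmode or V4SImode whose allocno class would otherwise be
   POINTER_AND_FP_REGS is narrowed to FP_REGS, while an SImode or
   DImode pseudo in the same situation is narrowed to GENERAL_REGS.  */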
1132 static unsigned int
1133 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1135 if (GET_MODE_UNIT_SIZE (mode) == 4)
1136 return aarch64_tune_params.min_div_recip_mul_sf;
1137 return aarch64_tune_params.min_div_recip_mul_df;
1140 /* Return the reassociation width of treeop OPC with mode MODE. */
1141 static int
1142 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1144 if (VECTOR_MODE_P (mode))
1145 return aarch64_tune_params.vec_reassoc_width;
1146 if (INTEGRAL_MODE_P (mode))
1147 return aarch64_tune_params.int_reassoc_width;
1148 /* Avoid reassociating floating point addition so we emit more FMAs. */
1149 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1150 return aarch64_tune_params.fp_reassoc_width;
1151 return 1;
1154 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1155 unsigned
1156 aarch64_dbx_register_number (unsigned regno)
1158 if (GP_REGNUM_P (regno))
1159 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1160 else if (regno == SP_REGNUM)
1161 return AARCH64_DWARF_SP;
1162 else if (FP_REGNUM_P (regno))
1163 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1164 else if (PR_REGNUM_P (regno))
1165 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1166 else if (regno == VG_REGNUM)
1167 return AARCH64_DWARF_VG;
1169 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1170 equivalent DWARF register. */
1171 return DWARF_FRAME_REGISTERS;
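/* For example (values as defined by the AArch64 DWARF register
   numbering; illustration added here): x0-x30 map to 0-30, sp to 31,
   v0-v31 to 64-95, the SVE predicate registers p0-p15 to 48-63, and
   the vector-granule register VG to 46.  Anything else (e.g. the
   condition flags) gets DWARF_FRAME_REGISTERS, meaning "no DWARF
   equivalent".  */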
1174 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1175 static bool
1176 aarch64_advsimd_struct_mode_p (machine_mode mode)
1178 return (TARGET_SIMD
1179 && (mode == OImode || mode == CImode || mode == XImode));
1182 /* Return true if MODE is an SVE predicate mode. */
1183 static bool
1184 aarch64_sve_pred_mode_p (machine_mode mode)
1186 return (TARGET_SVE
1187 && (mode == VNx16BImode
1188 || mode == VNx8BImode
1189 || mode == VNx4BImode
1190 || mode == VNx2BImode));
1193 /* Three mutually-exclusive flags describing a vector or predicate type. */
1194 const unsigned int VEC_ADVSIMD = 1;
1195 const unsigned int VEC_SVE_DATA = 2;
1196 const unsigned int VEC_SVE_PRED = 4;
1197 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1198 a structure of 2, 3 or 4 vectors. */
1199 const unsigned int VEC_STRUCT = 8;
1200 /* Useful combinations of the above. */
1201 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1202 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1204 /* Return a set of flags describing the vector properties of mode MODE.
1205 Ignore modes that are not supported by the current target. */
1206 static unsigned int
1207 aarch64_classify_vector_mode (machine_mode mode)
1209 if (aarch64_advsimd_struct_mode_p (mode))
1210 return VEC_ADVSIMD | VEC_STRUCT;
1212 if (aarch64_sve_pred_mode_p (mode))
1213 return VEC_SVE_PRED;
1215 scalar_mode inner = GET_MODE_INNER (mode);
1216 if (VECTOR_MODE_P (mode)
1217 && (inner == QImode
1218 || inner == HImode
1219 || inner == HFmode
1220 || inner == SImode
1221 || inner == SFmode
1222 || inner == DImode
1223 || inner == DFmode))
1225 if (TARGET_SVE)
1227 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1228 return VEC_SVE_DATA;
1229 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1230 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1231 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1232 return VEC_SVE_DATA | VEC_STRUCT;
1235 /* This includes V1DF but not V1DI (which doesn't exist). */
1236 if (TARGET_SIMD
1237 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1238 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1239 return VEC_ADVSIMD;
1242 return 0;
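/* Some illustrative classifications (examples added here, assuming the
   corresponding target features are enabled):

     V16QImode, V2DFmode  -> VEC_ADVSIMD                (128-bit Advanced SIMD)
     V8QImode,  V2SImode  -> VEC_ADVSIMD                (64-bit Advanced SIMD)
     VNx4SImode           -> VEC_SVE_DATA               (one SVE vector)
     VNx8SImode           -> VEC_SVE_DATA | VEC_STRUCT  (an SVE 2-vector tuple)
     VNx4BImode           -> VEC_SVE_PRED
     OImode               -> VEC_ADVSIMD | VEC_STRUCT
     SImode               -> 0 (not a vector mode)  */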
1245 /* Return true if MODE is any of the data vector modes, including
1246 structure modes. */
1247 static bool
1248 aarch64_vector_data_mode_p (machine_mode mode)
1250 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1253 /* Return true if MODE is an SVE data vector mode; either a single vector
1254 or a structure of vectors. */
1255 static bool
1256 aarch64_sve_data_mode_p (machine_mode mode)
1258 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1261 /* Implement target hook TARGET_ARRAY_MODE. */
1262 static opt_machine_mode
1263 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1265 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1266 && IN_RANGE (nelems, 2, 4))
1267 return mode_for_vector (GET_MODE_INNER (mode),
1268 GET_MODE_NUNITS (mode) * nelems);
1270 return opt_machine_mode ();
1273 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1274 static bool
1275 aarch64_array_mode_supported_p (machine_mode mode,
1276 unsigned HOST_WIDE_INT nelems)
1278 if (TARGET_SIMD
1279 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1280 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1281 && (nelems >= 2 && nelems <= 4))
1282 return true;
1284 return false;
1287 /* Return the SVE predicate mode to use for elements that have
1288 ELEM_NBYTES bytes, if such a mode exists. */
1290 opt_machine_mode
1291 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1293 if (TARGET_SVE)
1295 if (elem_nbytes == 1)
1296 return VNx16BImode;
1297 if (elem_nbytes == 2)
1298 return VNx8BImode;
1299 if (elem_nbytes == 4)
1300 return VNx4BImode;
1301 if (elem_nbytes == 8)
1302 return VNx2BImode;
1304 return opt_machine_mode ();
1307 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1309 static opt_machine_mode
1310 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1312 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1314 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1315 machine_mode pred_mode;
1316 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1317 return pred_mode;
1320 return default_get_mask_mode (nunits, nbytes);
1323 /* Implement TARGET_HARD_REGNO_NREGS. */
1325 static unsigned int
1326 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1328 /* ??? Logically we should only need to provide a value when
1329 HARD_REGNO_MODE_OK says that the combination is valid,
1330 but at the moment we need to handle all modes. Just ignore
1331 any runtime parts for registers that can't store them. */
1332 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1333 switch (aarch64_regno_regclass (regno))
1335 case FP_REGS:
1336 case FP_LO_REGS:
1337 if (aarch64_sve_data_mode_p (mode))
1338 return exact_div (GET_MODE_SIZE (mode),
1339 BYTES_PER_SVE_VECTOR).to_constant ();
1340 return CEIL (lowest_size, UNITS_PER_VREG);
1341 case PR_REGS:
1342 case PR_LO_REGS:
1343 case PR_HI_REGS:
1344 return 1;
1345 default:
1346 return CEIL (lowest_size, UNITS_PER_WORD);
1348 gcc_unreachable ();
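/* For example (illustrative, added here): a TImode value (16 bytes)
   occupies two GP registers but a single 128-bit FP/SIMD register; an
   SVE data mode such as VNx4SImode occupies one Z register regardless
   of the runtime vector length; and every predicate mode fits in a
   single P register.  */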
1351 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1353 static bool
1354 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1356 if (GET_MODE_CLASS (mode) == MODE_CC)
1357 return regno == CC_REGNUM;
1359 if (regno == VG_REGNUM)
1360 /* This must have the same size as _Unwind_Word. */
1361 return mode == DImode;
1363 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1364 if (vec_flags & VEC_SVE_PRED)
1365 return PR_REGNUM_P (regno);
1367 if (PR_REGNUM_P (regno))
1368 return 0;
1370 if (regno == SP_REGNUM)
1371 /* The purpose of comparing with ptr_mode is to support the
1372 global register variable associated with the stack pointer
1373 register via the syntax of asm ("wsp") in ILP32. */
1374 return mode == Pmode || mode == ptr_mode;
1376 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1377 return mode == Pmode;
1379 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1380 return true;
1382 if (FP_REGNUM_P (regno))
1384 if (vec_flags & VEC_STRUCT)
1385 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1386 else
1387 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1390 return false;
1393 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1394 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1395 clobbers the top 64 bits when restoring the bottom 64 bits. */
1397 static bool
1398 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1400 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1403 /* Implement REGMODE_NATURAL_SIZE. */
1404 poly_uint64
1405 aarch64_regmode_natural_size (machine_mode mode)
1407 /* The natural size for SVE data modes is one SVE data vector,
1408 and similarly for predicates. We can't independently modify
1409 anything smaller than that. */
1410 /* ??? For now, only do this for variable-width SVE registers.
1411 Doing it for constant-sized registers breaks lower-subreg.c. */
1412 /* ??? And once that's fixed, we should probably have similar
1413 code for Advanced SIMD. */
1414 if (!aarch64_sve_vg.is_constant ())
1416 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1417 if (vec_flags & VEC_SVE_PRED)
1418 return BYTES_PER_SVE_PRED;
1419 if (vec_flags & VEC_SVE_DATA)
1420 return BYTES_PER_SVE_VECTOR;
1422 return UNITS_PER_WORD;
1425 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1426 machine_mode
1427 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1428 machine_mode mode)
1430 /* The predicate mode determines which bits are significant and
1431 which are "don't care". Decreasing the number of lanes would
1432 lose data while increasing the number of lanes would make bits
1433 unnecessarily significant. */
1434 if (PR_REGNUM_P (regno))
1435 return mode;
1436 if (known_ge (GET_MODE_SIZE (mode), 4))
1437 return mode;
1438 else
1439 return SImode;
1442 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1443 that strcpy from constants will be faster. */
1445 static HOST_WIDE_INT
1446 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1448 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1449 return MAX (align, BITS_PER_WORD);
1450 return align;
1453 /* Return true if calls to DECL should be treated as
1454    long-calls (i.e. called via a register).  */
1455 static bool
1456 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1458 return false;
1461 /* Return true if calls to symbol-ref SYM should be treated as
1462    long-calls (i.e. called via a register).  */
1463 bool
1464 aarch64_is_long_call_p (rtx sym)
1466 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1469 /* Return true if calls to symbol-ref SYM should not go through
1470 plt stubs. */
1472 bool
1473 aarch64_is_noplt_call_p (rtx sym)
1475 const_tree decl = SYMBOL_REF_DECL (sym);
1477 if (flag_pic
1478 && decl
1479 && (!flag_plt
1480 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1481 && !targetm.binds_local_p (decl))
1482 return true;
1484 return false;
1487 /* Return true if the offsets to a zero/sign-extract operation
1488 represent an expression that matches an extend operation. The
1489    operands represent the parameters from
1491 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1492 bool
1493 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1494 rtx extract_imm)
1496 HOST_WIDE_INT mult_val, extract_val;
1498 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1499 return false;
1501 mult_val = INTVAL (mult_imm);
1502 extract_val = INTVAL (extract_imm);
1504 if (extract_val > 8
1505 && extract_val < GET_MODE_BITSIZE (mode)
1506 && exact_log2 (extract_val & ~7) > 0
1507 && (extract_val & 7) <= 4
1508 && mult_val == (1 << (extract_val & 7)))
1509 return true;
1511 return false;
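/* A worked example (added for illustration, not from the original
   source): with MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34 we
   have extract_val & ~7 == 32 (a power of two), extract_val & 7 == 2
   and mult_val == 1 << 2, so the function returns true: the rtx

     (zero_extract:DI (mult:DI (reg:DI r) (const_int 4))
                      (const_int 34) (const_int 0))

   is equivalent to zero-extending the low 32 bits of r and shifting
   the result left by 2, i.e. a "uxtw ... lsl #2" style operand.  */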
1514 /* Emit an insn that's a simple single-set. Both the operands must be
1515 known to be valid. */
1516 inline static rtx_insn *
1517 emit_set_insn (rtx x, rtx y)
1519 return emit_insn (gen_rtx_SET (x, y));
1522 /* X and Y are two things to compare using CODE. Emit the compare insn and
1523 return the rtx for register 0 in the proper mode. */
1525 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1527 machine_mode mode = SELECT_CC_MODE (code, x, y);
1528 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1530 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1531 return cc_reg;
1534 /* Build the SYMBOL_REF for __tls_get_addr. */
1536 static GTY(()) rtx tls_get_addr_libfunc;
1539 aarch64_tls_get_addr (void)
1541 if (!tls_get_addr_libfunc)
1542 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1543 return tls_get_addr_libfunc;
1546 /* Return the TLS model to use for ADDR. */
1548 static enum tls_model
1549 tls_symbolic_operand_type (rtx addr)
1551 enum tls_model tls_kind = TLS_MODEL_NONE;
1552 if (GET_CODE (addr) == CONST)
1554 poly_int64 addend;
1555 rtx sym = strip_offset (addr, &addend);
1556 if (GET_CODE (sym) == SYMBOL_REF)
1557 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1559 else if (GET_CODE (addr) == SYMBOL_REF)
1560 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1562 return tls_kind;
1565 /* We'll allow lo_sums in our legitimate addresses
1566    so that combine can take care of combining addresses where
1567    necessary, but for generation purposes, we'll generate the address
1568    as:
1569 RTL Absolute
1570 tmp = hi (symbol_ref); adrp x1, foo
1571 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1574 PIC TLS
1575 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1576 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1577 bl __tls_get_addr
1580 Load TLS symbol, depending on TLS mechanism and TLS access model.
1582 Global Dynamic - Traditional TLS:
1583 adrp tmp, :tlsgd:imm
1584 add dest, tmp, #:tlsgd_lo12:imm
1585 bl __tls_get_addr
1587 Global Dynamic - TLS Descriptors:
1588 adrp dest, :tlsdesc:imm
1589 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1590 add dest, dest, #:tlsdesc_lo12:imm
1591 blr tmp
1592 mrs tp, tpidr_el0
1593 add dest, dest, tp
1595 Initial Exec:
1596 mrs tp, tpidr_el0
1597 adrp tmp, :gottprel:imm
1598 ldr dest, [tmp, #:gottprel_lo12:imm]
1599 add dest, dest, tp
1601 Local Exec:
1602 mrs tp, tpidr_el0
1603 add t0, tp, #:tprel_hi12:imm, lsl #12
1604 add t0, t0, #:tprel_lo12_nc:imm
1607 static void
1608 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1609 enum aarch64_symbol_type type)
1611 switch (type)
1613 case SYMBOL_SMALL_ABSOLUTE:
1615 /* In ILP32, the mode of dest can be either SImode or DImode. */
1616 rtx tmp_reg = dest;
1617 machine_mode mode = GET_MODE (dest);
1619 gcc_assert (mode == Pmode || mode == ptr_mode);
1621 if (can_create_pseudo_p ())
1622 tmp_reg = gen_reg_rtx (mode);
1624 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1625 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1626 return;
1629 case SYMBOL_TINY_ABSOLUTE:
1630 emit_insn (gen_rtx_SET (dest, imm));
1631 return;
1633 case SYMBOL_SMALL_GOT_28K:
1635 machine_mode mode = GET_MODE (dest);
1636 rtx gp_rtx = pic_offset_table_rtx;
1637 rtx insn;
1638 rtx mem;
1640 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1641            here before rtl expansion.  Tree IVOPTS will generate an rtl
1642            pattern to decide rtx costs, in which case pic_offset_table_rtx
1643            is not initialized.  In that case there is no need to generate
1644            the first adrp instruction, as the final cost for global
1645            variable access is one instruction.  */
1646 if (gp_rtx != NULL)
1648            /* -fpic for -mcmodel=small allows a 32K GOT table size (but
1649               since we are using the page base as the GOT base, the first
1650               page may be wasted; in the worst scenario there is only 28K
1652               of space for the GOT).  The generated instruction sequence
                   for accessing a global variable is:
1655 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1657               Only one instruction is needed.  But we must initialize
1658               pic_offset_table_rtx properly.  We generate an initialization
1659               insn for every global access, and allow CSE to remove all
                   redundant copies.
1661               The final instruction sequence will look like the following
1662               for multiple global variable accesses.
1664 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1666 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1667 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1668 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1669 ... */
1671 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1672 crtl->uses_pic_offset_table = 1;
1673 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1675 if (mode != GET_MODE (gp_rtx))
1676 gp_rtx = gen_lowpart (mode, gp_rtx);
1680 if (mode == ptr_mode)
1682 if (mode == DImode)
1683 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1684 else
1685 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1687 mem = XVECEXP (SET_SRC (insn), 0, 0);
1689 else
1691 gcc_assert (mode == Pmode);
1693 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1694 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1697        /* The operand is expected to be a MEM.  Whenever the related insn
1698           pattern changes, the code above which calculates MEM should be
1699           updated.  */
1700 gcc_assert (GET_CODE (mem) == MEM);
1701 MEM_READONLY_P (mem) = 1;
1702 MEM_NOTRAP_P (mem) = 1;
1703 emit_insn (insn);
1704 return;
1707 case SYMBOL_SMALL_GOT_4G:
1709 /* In ILP32, the mode of dest can be either SImode or DImode,
1710 while the got entry is always of SImode size. The mode of
1711 dest depends on how dest is used: if dest is assigned to a
1712 pointer (e.g. in the memory), it has SImode; it may have
1713       DImode if dest is dereferenced to access the memory.
1714 This is why we have to handle three different ldr_got_small
1715 patterns here (two patterns for ILP32). */
1717 rtx insn;
1718 rtx mem;
1719 rtx tmp_reg = dest;
1720 machine_mode mode = GET_MODE (dest);
1722 if (can_create_pseudo_p ())
1723 tmp_reg = gen_reg_rtx (mode);
1725 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1726 if (mode == ptr_mode)
1728 if (mode == DImode)
1729 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1730 else
1731 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1733 mem = XVECEXP (SET_SRC (insn), 0, 0);
1735 else
1737 gcc_assert (mode == Pmode);
1739 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1740 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1743 gcc_assert (GET_CODE (mem) == MEM);
1744 MEM_READONLY_P (mem) = 1;
1745 MEM_NOTRAP_P (mem) = 1;
1746 emit_insn (insn);
1747 return;
1750 case SYMBOL_SMALL_TLSGD:
1752 rtx_insn *insns;
1753 machine_mode mode = GET_MODE (dest);
1754 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1756 start_sequence ();
1757 if (TARGET_ILP32)
1758 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1759 else
1760 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1761 insns = get_insns ();
1762 end_sequence ();
1764 RTL_CONST_CALL_P (insns) = 1;
1765 emit_libcall_block (insns, dest, result, imm);
1766 return;
1769 case SYMBOL_SMALL_TLSDESC:
1771 machine_mode mode = GET_MODE (dest);
1772 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1773 rtx tp;
1775 gcc_assert (mode == Pmode || mode == ptr_mode);
1777 /* In ILP32, the got entry is always of SImode size. Unlike
1778 small GOT, the dest is fixed at reg 0. */
1779 if (TARGET_ILP32)
1780 emit_insn (gen_tlsdesc_small_si (imm));
1781 else
1782 emit_insn (gen_tlsdesc_small_di (imm));
1783 tp = aarch64_load_tp (NULL);
1785 if (mode != Pmode)
1786 tp = gen_lowpart (mode, tp);
1788 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1789 if (REG_P (dest))
1790 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1791 return;
1794 case SYMBOL_SMALL_TLSIE:
1796 /* In ILP32, the mode of dest can be either SImode or DImode,
1797 while the got entry is always of SImode size. The mode of
1798 dest depends on how dest is used: if dest is assigned to a
1799 pointer (e.g. in the memory), it has SImode; it may have
1800       DImode if dest is dereferenced to access the memory.
1801 This is why we have to handle three different tlsie_small
1802 patterns here (two patterns for ILP32). */
1803 machine_mode mode = GET_MODE (dest);
1804 rtx tmp_reg = gen_reg_rtx (mode);
1805 rtx tp = aarch64_load_tp (NULL);
1807 if (mode == ptr_mode)
1809 if (mode == DImode)
1810 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1811 else
1813 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1814 tp = gen_lowpart (mode, tp);
1817 else
1819 gcc_assert (mode == Pmode);
1820 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1823 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1824 if (REG_P (dest))
1825 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1826 return;
1829 case SYMBOL_TLSLE12:
1830 case SYMBOL_TLSLE24:
1831 case SYMBOL_TLSLE32:
1832 case SYMBOL_TLSLE48:
1834 machine_mode mode = GET_MODE (dest);
1835 rtx tp = aarch64_load_tp (NULL);
1837 if (mode != Pmode)
1838 tp = gen_lowpart (mode, tp);
1840 switch (type)
1842 case SYMBOL_TLSLE12:
1843 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1844 (dest, tp, imm));
1845 break;
1846 case SYMBOL_TLSLE24:
1847 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1848 (dest, tp, imm));
1849 break;
1850 case SYMBOL_TLSLE32:
1851 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1852 (dest, imm));
1853 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1854 (dest, dest, tp));
1855 break;
1856 case SYMBOL_TLSLE48:
1857 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1858 (dest, imm));
1859 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1860 (dest, dest, tp));
1861 break;
1862 default:
1863 gcc_unreachable ();
1866 if (REG_P (dest))
1867 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1868 return;
1871 case SYMBOL_TINY_GOT:
1872 emit_insn (gen_ldr_got_tiny (dest, imm));
1873 return;
1875 case SYMBOL_TINY_TLSIE:
1877 machine_mode mode = GET_MODE (dest);
1878 rtx tp = aarch64_load_tp (NULL);
1880 if (mode == ptr_mode)
1882 if (mode == DImode)
1883 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1884 else
1886 tp = gen_lowpart (mode, tp);
1887 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1890 else
1892 gcc_assert (mode == Pmode);
1893 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1896 if (REG_P (dest))
1897 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1898 return;
1901 default:
1902 gcc_unreachable ();
1906 /* Emit a move from SRC to DEST. Assume that the move expanders can
1907 handle all moves if !can_create_pseudo_p (). The distinction is
1908 important because, unlike emit_move_insn, the move expanders know
1909 how to force Pmode objects into the constant pool even when the
1910 constant pool address is not itself legitimate. */
1911 static rtx
1912 aarch64_emit_move (rtx dest, rtx src)
1914 return (can_create_pseudo_p ()
1915 ? emit_move_insn (dest, src)
1916 : emit_move_insn_1 (dest, src));
1919 /* Apply UNOPTAB to OP and store the result in DEST. */
1921 static void
1922 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1924 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1925 if (dest != tmp)
1926 emit_move_insn (dest, tmp);
1929 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1931 static void
1932 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1934 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1935 OPTAB_DIRECT);
1936 if (dest != tmp)
1937 emit_move_insn (dest, tmp);
1940 /* Split a 128-bit move operation into two 64-bit move operations,
1941 taking care to handle partial overlap of register to register
1942 copies. Special cases are needed when moving between GP regs and
1943 FP regs. SRC can be a register, constant or memory; DST a register
1944 or memory. If either operand is memory it must not have any side
1945 effects. */
1946 void
1947 aarch64_split_128bit_move (rtx dst, rtx src)
1949 rtx dst_lo, dst_hi;
1950 rtx src_lo, src_hi;
1952 machine_mode mode = GET_MODE (dst);
1954 gcc_assert (mode == TImode || mode == TFmode);
1955 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1956 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1958 if (REG_P (dst) && REG_P (src))
1960 int src_regno = REGNO (src);
1961 int dst_regno = REGNO (dst);
1963 /* Handle FP <-> GP regs. */
1964 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1966 src_lo = gen_lowpart (word_mode, src);
1967 src_hi = gen_highpart (word_mode, src);
1969 if (mode == TImode)
1971 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1972 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1974 else
1976 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1977 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1979 return;
1981 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1983 dst_lo = gen_lowpart (word_mode, dst);
1984 dst_hi = gen_highpart (word_mode, dst);
1986 if (mode == TImode)
1988 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1989 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1991 else
1993 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1994 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1996 return;
2000 dst_lo = gen_lowpart (word_mode, dst);
2001 dst_hi = gen_highpart (word_mode, dst);
2002 src_lo = gen_lowpart (word_mode, src);
2003 src_hi = gen_highpart_mode (word_mode, mode, src);
2005 /* At most one pairing may overlap. */
2006 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2008 aarch64_emit_move (dst_hi, src_hi);
2009 aarch64_emit_move (dst_lo, src_lo);
2011 else
2013 aarch64_emit_move (dst_lo, src_lo);
2014 aarch64_emit_move (dst_hi, src_hi);
2018 bool
2019 aarch64_split_128bit_move_p (rtx dst, rtx src)
2021 return (! REG_P (src)
2022 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2025 /* Split a complex SIMD combine. */
2027 void
2028 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2030 machine_mode src_mode = GET_MODE (src1);
2031 machine_mode dst_mode = GET_MODE (dst);
2033 gcc_assert (VECTOR_MODE_P (dst_mode));
2034 gcc_assert (register_operand (dst, dst_mode)
2035 && register_operand (src1, src_mode)
2036 && register_operand (src2, src_mode));
2038 rtx (*gen) (rtx, rtx, rtx);
2040 switch (src_mode)
2042 case E_V8QImode:
2043 gen = gen_aarch64_simd_combinev8qi;
2044 break;
2045 case E_V4HImode:
2046 gen = gen_aarch64_simd_combinev4hi;
2047 break;
2048 case E_V2SImode:
2049 gen = gen_aarch64_simd_combinev2si;
2050 break;
2051 case E_V4HFmode:
2052 gen = gen_aarch64_simd_combinev4hf;
2053 break;
2054 case E_V2SFmode:
2055 gen = gen_aarch64_simd_combinev2sf;
2056 break;
2057 case E_DImode:
2058 gen = gen_aarch64_simd_combinedi;
2059 break;
2060 case E_DFmode:
2061 gen = gen_aarch64_simd_combinedf;
2062 break;
2063 default:
2064 gcc_unreachable ();
2067 emit_insn (gen (dst, src1, src2));
2068 return;
2071 /* Split a complex SIMD move. */
2073 void
2074 aarch64_split_simd_move (rtx dst, rtx src)
2076 machine_mode src_mode = GET_MODE (src);
2077 machine_mode dst_mode = GET_MODE (dst);
2079 gcc_assert (VECTOR_MODE_P (dst_mode));
2081 if (REG_P (dst) && REG_P (src))
2083 rtx (*gen) (rtx, rtx);
2085 gcc_assert (VECTOR_MODE_P (src_mode));
2087 switch (src_mode)
2089 case E_V16QImode:
2090 gen = gen_aarch64_split_simd_movv16qi;
2091 break;
2092 case E_V8HImode:
2093 gen = gen_aarch64_split_simd_movv8hi;
2094 break;
2095 case E_V4SImode:
2096 gen = gen_aarch64_split_simd_movv4si;
2097 break;
2098 case E_V2DImode:
2099 gen = gen_aarch64_split_simd_movv2di;
2100 break;
2101 case E_V8HFmode:
2102 gen = gen_aarch64_split_simd_movv8hf;
2103 break;
2104 case E_V4SFmode:
2105 gen = gen_aarch64_split_simd_movv4sf;
2106 break;
2107 case E_V2DFmode:
2108 gen = gen_aarch64_split_simd_movv2df;
2109 break;
2110 default:
2111 gcc_unreachable ();
2114 emit_insn (gen (dst, src));
2115 return;
2119 bool
2120 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2121 machine_mode ymode, rtx y)
2123 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2124 gcc_assert (r != NULL);
2125 return rtx_equal_p (x, r);
2129 static rtx
2130 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2132 if (can_create_pseudo_p ())
2133 return force_reg (mode, value);
2134 else
2136 gcc_assert (x);
2137 aarch64_emit_move (x, value);
2138 return x;
2142 /* Return true if we can move VALUE into a register using a single
2143 CNT[BHWD] instruction. */
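/* For example (an illustrative sketch of the encoding used here): with the
   (constant, per-quadword) poly_int64 representation, a VALUE of (16, 16)
   is the number of bytes in a vector and can be loaded with a single CNTB,
   while (32, 32) needs "cntb ..., all, mul #2".  A VALUE such as (3, 3) is
   rejected, since every CNT[BHWD] count per 128-bit quadword is even.  */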
2145 static bool
2146 aarch64_sve_cnt_immediate_p (poly_int64 value)
2148 HOST_WIDE_INT factor = value.coeffs[0];
2149 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2150 return (value.coeffs[1] == factor
2151 && IN_RANGE (factor, 2, 16 * 16)
2152 && (factor & 1) == 0
2153 && factor <= 16 * (factor & -factor));
2156 /* Likewise for rtx X. */
2158 bool
2159 aarch64_sve_cnt_immediate_p (rtx x)
2161 poly_int64 value;
2162 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2165 /* Return the asm string for an instruction with a CNT-like vector size
2166 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2167 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2168 first part of the operands template (the part that comes before the
2169 vector size itself). FACTOR is the number of quadwords.
2170 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2171 If it is zero, we can use any element size. */
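/* For instance (illustrative only): called with PREFIX "cnt" and OPERANDS
   "%x0", a FACTOR of 2 with NELTS_PER_VQ 0 prints "cntd\t%x0", while a
   FACTOR of 32 prints "cntb\t%x0, all, mul #2".  */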
2173 static char *
2174 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2175 unsigned int factor,
2176 unsigned int nelts_per_vq)
2178 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2180 if (nelts_per_vq == 0)
2181 /* There is some overlap in the ranges of the four CNT instructions.
2182 Here we always use the smallest possible element size, so that the
2183 multiplier is 1 wherever possible. */
2184 nelts_per_vq = factor & -factor;
2185 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2186 gcc_assert (IN_RANGE (shift, 1, 4));
2187 char suffix = "dwhb"[shift - 1];
2189 factor >>= shift;
2190 unsigned int written;
2191 if (factor == 1)
2192 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2193 prefix, suffix, operands);
2194 else
2195 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2196 prefix, suffix, operands, factor);
2197 gcc_assert (written < sizeof (buffer));
2198 return buffer;
2201 /* Return the asm string for an instruction with a CNT-like vector size
2202 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2203 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2204 first part of the operands template (the part that comes before the
2205 vector size itself). X is the value of the vector size operand,
2206 as a polynomial integer rtx. */
2208 char *
2209 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2210 rtx x)
2212 poly_int64 value = rtx_to_poly_int64 (x);
2213 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2214 return aarch64_output_sve_cnt_immediate (prefix, operands,
2215 value.coeffs[1], 0);
2218 /* Return true if we can add VALUE to a register using a single ADDVL
2219 or ADDPL instruction. */
2221 static bool
2222 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2224 HOST_WIDE_INT factor = value.coeffs[0];
2225 if (factor == 0 || value.coeffs[1] != factor)
2226 return false;
2227 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2228 and a value of 16 is one vector width. */
2229 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2230 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2233 /* Likewise for rtx X. */
2235 bool
2236 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2238 poly_int64 value;
2239 return (poly_int_rtx_p (x, &value)
2240 && aarch64_sve_addvl_addpl_immediate_p (value));
2243 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2244 and storing the result in operand 0. */
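/* A hedged sketch of the expected output: an OFFSET of (16, 16), i.e. one
   vector length in bytes, prints "addvl\t%x0, %x1, #1", or "incb\t%x0"
   when DEST and BASE are the same GP register; an OFFSET of (2, 2), i.e.
   one predicate length, prints "addpl\t%x0, %x1, #1" or "incd\t%x0".  */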
2246 char *
2247 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2249 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2250 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2251 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2253 /* Use INC or DEC if possible. */
2254 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2256 if (aarch64_sve_cnt_immediate_p (offset_value))
2257 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2258 offset_value.coeffs[1], 0);
2259 if (aarch64_sve_cnt_immediate_p (-offset_value))
2260 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2261 -offset_value.coeffs[1], 0);
2264 int factor = offset_value.coeffs[1];
2265 if ((factor & 15) == 0)
2266 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2267 else
2268 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2269 return buffer;
2272 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2273 instruction. If it is, store the number of elements in each vector
2274 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2275 factor in *FACTOR_OUT (if nonnull). */
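/* For example (an illustrative sketch): a VNx4SI constant in which every
   element is the poly_int64 (4, 4), i.e. the number of 32-bit elements in
   a vector, is a valid INCW/DECW immediate, with *FACTOR_OUT set to 4 and
   *NELTS_PER_VQ_OUT set to 4.  */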
2277 bool
2278 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2279 unsigned int *nelts_per_vq_out)
2281 rtx elt;
2282 poly_int64 value;
2284 if (!const_vec_duplicate_p (x, &elt)
2285 || !poly_int_rtx_p (elt, &value))
2286 return false;
2288 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2289 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2290 /* There's no vector INCB. */
2291 return false;
2293 HOST_WIDE_INT factor = value.coeffs[0];
2294 if (value.coeffs[1] != factor)
2295 return false;
2297 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2298 if ((factor % nelts_per_vq) != 0
2299 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2300 return false;
2302 if (factor_out)
2303 *factor_out = factor;
2304 if (nelts_per_vq_out)
2305 *nelts_per_vq_out = nelts_per_vq;
2306 return true;
2309 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2310 instruction. */
2312 bool
2313 aarch64_sve_inc_dec_immediate_p (rtx x)
2315 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2318 /* Return the asm template for an SVE vector INC or DEC instruction.
2319 OPERANDS gives the operands before the vector count and X is the
2320 value of the vector count operand itself. */
2322 char *
2323 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2325 int factor;
2326 unsigned int nelts_per_vq;
2327 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2328 gcc_unreachable ();
2329 if (factor < 0)
2330 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2331 nelts_per_vq);
2332 else
2333 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2334 nelts_per_vq);
2337 static int
2338 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2339 scalar_int_mode mode)
2341 int i;
2342 unsigned HOST_WIDE_INT val, val2, mask;
2343 int one_match, zero_match;
2344 int num_insns;
2346 val = INTVAL (imm);
2348 if (aarch64_move_imm (val, mode))
2350 if (generate)
2351 emit_insn (gen_rtx_SET (dest, imm));
2352 return 1;
2355 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2356 (with XXXX non-zero). In that case check to see if the move can be done in
2357 a smaller mode. */
2358 val2 = val & 0xffffffff;
2359 if (mode == DImode
2360 && aarch64_move_imm (val2, SImode)
2361 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2363 if (generate)
2364 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2366 /* Check if we have to emit a second instruction by checking to see
2367 if any of the upper 32 bits of the original DImode value are set. */
2368 if (val == val2)
2369 return 1;
2371 i = (val >> 48) ? 48 : 32;
2373 if (generate)
2374 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2375 GEN_INT ((val >> i) & 0xffff)));
2377 return 2;
2380 if ((val >> 32) == 0 || mode == SImode)
2382 if (generate)
2384 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2385 if (mode == SImode)
2386 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2387 GEN_INT ((val >> 16) & 0xffff)));
2388 else
2389 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2390 GEN_INT ((val >> 16) & 0xffff)));
2392 return 2;
2395 /* Remaining cases are all for DImode. */
2397 mask = 0xffff;
2398 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2399 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2400 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2401 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2403 if (zero_match != 2 && one_match != 2)
2405 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2406 For a 64-bit bitmask try whether changing 16 bits to all ones or
2407 zeroes creates a valid bitmask. To check any repeated bitmask,
2408 try using 16 bits from the other 32-bit half of val. */
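/* Illustrative sketch: 0x1234aaaaaaaaaaaa would typically be built here as
   the bitmask immediate 0xaaaaaaaaaaaaaaaa followed by
   "movk\tdest, #0x1234, lsl #48".  */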
2410 for (i = 0; i < 64; i += 16, mask <<= 16)
2412 val2 = val & ~mask;
2413 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2414 break;
2415 val2 = val | mask;
2416 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2417 break;
2418 val2 = val2 & ~mask;
2419 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2420 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2421 break;
2423 if (i != 64)
2425 if (generate)
2427 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2428 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2429 GEN_INT ((val >> i) & 0xffff)));
2431 return 2;
2435 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2436 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2437 otherwise skip zero bits. */
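/* Illustrative sketch: 0x1234567800000000 has two all-zero halfwords, so it
   is built as "movz\tdest, #0x5678, lsl #32" followed by
   "movk\tdest, #0x1234, lsl #48", i.e. two instructions.  */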
2439 num_insns = 1;
2440 mask = 0xffff;
2441 val2 = one_match > zero_match ? ~val : val;
2442 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2444 if (generate)
2445 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2446 ? (val | ~(mask << i))
2447 : (val & (mask << i)))));
2448 for (i += 16; i < 64; i += 16)
2450 if ((val2 & (mask << i)) == 0)
2451 continue;
2452 if (generate)
2453 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2454 GEN_INT ((val >> i) & 0xffff)));
2455 num_insns++;
2458 return num_insns;
2461 /* Return whether imm is a 128-bit immediate which is simple enough to
2462 expand inline. */
2463 bool
2464 aarch64_mov128_immediate (rtx imm)
2466 if (GET_CODE (imm) == CONST_INT)
2467 return true;
2469 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2471 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2472 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2474 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2475 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2479 /* Return the number of temporary registers that aarch64_add_offset_1
2480 would need to add OFFSET to a register. */
2482 static unsigned int
2483 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2485 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2488 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2489 a non-polynomial OFFSET. MODE is the mode of the addition.
2490 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2491 be set and CFA adjustments added to the generated instructions.
2493 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2494 temporary if register allocation is already complete. This temporary
2495 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2496 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2497 the immediate again.
2499 Since this function may be used to adjust the stack pointer, we must
2500 ensure that it cannot cause transient stack deallocation (for example
2501 by first incrementing SP and then decrementing when adjusting by a
2502 large immediate). */
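/* A hedged example: an OFFSET of 0x101234 is not a (possibly shifted)
   12-bit immediate, so it would typically be split into
   "add\tdest, src, #0x234" followed by "add\tdest, dest, #0x101000"
   (a shifted 12-bit immediate); both steps adjust in the same direction,
   so the register never transiently passes its final value.  */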
2504 static void
2505 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2506 rtx src, HOST_WIDE_INT offset, rtx temp1,
2507 bool frame_related_p, bool emit_move_imm)
2509 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2510 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2512 HOST_WIDE_INT moffset = abs_hwi (offset);
2513 rtx_insn *insn;
2515 if (!moffset)
2517 if (!rtx_equal_p (dest, src))
2519 insn = emit_insn (gen_rtx_SET (dest, src));
2520 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2522 return;
2525 /* Single instruction adjustment. */
2526 if (aarch64_uimm12_shift (moffset))
2528 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2529 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2530 return;
2533 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2534 and either:
2536 a) the offset cannot be loaded by a 16-bit move or
2537 b) there is no spare register into which we can move it. */
2538 if (moffset < 0x1000000
2539 && ((!temp1 && !can_create_pseudo_p ())
2540 || !aarch64_move_imm (moffset, mode)))
2542 HOST_WIDE_INT low_off = moffset & 0xfff;
2544 low_off = offset < 0 ? -low_off : low_off;
2545 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2546 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2547 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2548 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2549 return;
2552 /* Emit a move immediate if required and an addition/subtraction. */
2553 if (emit_move_imm)
2555 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2556 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2558 insn = emit_insn (offset < 0
2559 ? gen_sub3_insn (dest, src, temp1)
2560 : gen_add3_insn (dest, src, temp1));
2561 if (frame_related_p)
2563 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2564 rtx adj = plus_constant (mode, src, offset);
2565 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2569 /* Return the number of temporary registers that aarch64_add_offset
2570 would need to move OFFSET into a register or add OFFSET to a register;
2571 ADD_P is true if we want the latter rather than the former. */
2573 static unsigned int
2574 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2576 /* This follows the same structure as aarch64_add_offset. */
2577 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2578 return 0;
2580 unsigned int count = 0;
2581 HOST_WIDE_INT factor = offset.coeffs[1];
2582 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2583 poly_int64 poly_offset (factor, factor);
2584 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2585 /* Need one register for the ADDVL/ADDPL result. */
2586 count += 1;
2587 else if (factor != 0)
2589 factor = abs (factor);
2590 if (factor > 16 * (factor & -factor))
2591 /* Need one register for the CNT result and one for the multiplication
2592 factor. If necessary, the second temporary can be reused for the
2593 constant part of the offset. */
2594 return 2;
2595 /* Need one register for the CNT result (which might then
2596 be shifted). */
2597 count += 1;
2599 return count + aarch64_add_offset_1_temporaries (constant);
2602 /* If X can be represented as a poly_int64, return the number
2603 of temporaries that are required to add it to a register.
2604 Return -1 otherwise. */
2607 aarch64_add_offset_temporaries (rtx x)
2609 poly_int64 offset;
2610 if (!poly_int_rtx_p (x, &offset))
2611 return -1;
2612 return aarch64_offset_temporaries (true, offset);
2615 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2616 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2617 be set and CFA adjustments added to the generated instructions.
2619 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2620 temporary if register allocation is already complete. This temporary
2621 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2622 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2623 false to avoid emitting the immediate again.
2625 TEMP2, if nonnull, is a second temporary register that doesn't
2626 overlap either DEST or SRC.
2628 Since this function may be used to adjust the stack pointer, we must
2629 ensure that it cannot cause transient stack deallocation (for example
2630 by first incrementing SP and then decrementing when adjusting by a
2631 large immediate). */
2633 static void
2634 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2635 poly_int64 offset, rtx temp1, rtx temp2,
2636 bool frame_related_p, bool emit_move_imm = true)
2638 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2639 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2640 gcc_assert (temp1 == NULL_RTX
2641 || !frame_related_p
2642 || !reg_overlap_mentioned_p (temp1, dest));
2643 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2645 /* Try using ADDVL or ADDPL to add the whole value. */
2646 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2648 rtx offset_rtx = gen_int_mode (offset, mode);
2649 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2650 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2651 return;
2654 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2655 SVE vector register, over and above the minimum size of 128 bits.
2656 This is equivalent to half the value returned by CNTD with a
2657 vector shape of ALL. */
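/* For example (illustrative): an OFFSET of (48, 16) is one vector length
   plus 32 bytes; factor is then 16 and constant is 32, so the VG-based
   part can be added with "addvl ..., #1" and the remainder with an
   ordinary immediate add.  */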
2658 HOST_WIDE_INT factor = offset.coeffs[1];
2659 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2661 /* Try using ADDVL or ADDPL to add the VG-based part. */
2662 poly_int64 poly_offset (factor, factor);
2663 if (src != const0_rtx
2664 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2666 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2667 if (frame_related_p)
2669 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2670 RTX_FRAME_RELATED_P (insn) = true;
2671 src = dest;
2673 else
2675 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2676 src = aarch64_force_temporary (mode, temp1, addr);
2677 temp1 = temp2;
2678 temp2 = NULL_RTX;
2681 /* Otherwise use a CNT-based sequence. */
2682 else if (factor != 0)
2684 /* Use a subtraction if we have a negative factor. */
2685 rtx_code code = PLUS;
2686 if (factor < 0)
2688 factor = -factor;
2689 code = MINUS;
2692 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2693 into the multiplication. */
2694 rtx val;
2695 int shift = 0;
2696 if (factor & 1)
2697 /* Use a right shift by 1. */
2698 shift = -1;
2699 else
2700 factor /= 2;
2701 HOST_WIDE_INT low_bit = factor & -factor;
2702 if (factor <= 16 * low_bit)
2704 if (factor > 16 * 8)
2706 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2707 the value with the minimum multiplier and shift it into
2708 position. */
2709 int extra_shift = exact_log2 (low_bit);
2710 shift += extra_shift;
2711 factor >>= extra_shift;
2713 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2715 else
2717 /* Use CNTD, then multiply it by FACTOR. */
2718 val = gen_int_mode (poly_int64 (2, 2), mode);
2719 val = aarch64_force_temporary (mode, temp1, val);
2721 /* Go back to using a negative multiplication factor if we have
2722 no register from which to subtract. */
2723 if (code == MINUS && src == const0_rtx)
2725 factor = -factor;
2726 code = PLUS;
2728 rtx coeff1 = gen_int_mode (factor, mode);
2729 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2730 val = gen_rtx_MULT (mode, val, coeff1);
2733 if (shift > 0)
2735 /* Multiply by 1 << SHIFT. */
2736 val = aarch64_force_temporary (mode, temp1, val);
2737 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2739 else if (shift == -1)
2741 /* Divide by 2. */
2742 val = aarch64_force_temporary (mode, temp1, val);
2743 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2746 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2747 if (src != const0_rtx)
2749 val = aarch64_force_temporary (mode, temp1, val);
2750 val = gen_rtx_fmt_ee (code, mode, src, val);
2752 else if (code == MINUS)
2754 val = aarch64_force_temporary (mode, temp1, val);
2755 val = gen_rtx_NEG (mode, val);
2758 if (constant == 0 || frame_related_p)
2760 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2761 if (frame_related_p)
2763 RTX_FRAME_RELATED_P (insn) = true;
2764 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2765 gen_rtx_SET (dest, plus_constant (Pmode, src,
2766 poly_offset)));
2768 src = dest;
2769 if (constant == 0)
2770 return;
2772 else
2774 src = aarch64_force_temporary (mode, temp1, val);
2775 temp1 = temp2;
2776 temp2 = NULL_RTX;
2779 emit_move_imm = true;
2782 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2783 frame_related_p, emit_move_imm);
2786 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2787 than a poly_int64. */
2789 void
2790 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2791 rtx offset_rtx, rtx temp1, rtx temp2)
2793 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2794 temp1, temp2, false);
2797 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2798 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2799 if TEMP1 already contains abs (DELTA). */
2801 static inline void
2802 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2804 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2805 temp1, temp2, true, emit_move_imm);
2808 /* Subtract DELTA from the stack pointer, marking the instructions
2809 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2810 if nonnull. */
2812 static inline void
2813 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2815 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2816 temp1, temp2, frame_related_p);
2819 /* Set DEST to (vec_series BASE STEP). */
2821 static void
2822 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2824 machine_mode mode = GET_MODE (dest);
2825 scalar_mode inner = GET_MODE_INNER (mode);
2827 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2828 if (!aarch64_sve_index_immediate_p (base))
2829 base = force_reg (inner, base);
2830 if (!aarch64_sve_index_immediate_p (step))
2831 step = force_reg (inner, step);
2833 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2836 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2837 integer of mode SRC_MODE. Return true on success. */
2839 static bool
2840 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2841 rtx src)
2843 /* If the constant is smaller than 128 bits, we can do the move
2844 using a vector of SRC_MODEs. */
2845 if (src_mode != TImode)
2847 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2848 GET_MODE_SIZE (src_mode));
2849 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2850 emit_move_insn (gen_lowpart (dup_mode, dest),
2851 gen_const_vec_duplicate (dup_mode, src));
2852 return true;
2855 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2856 src = force_const_mem (src_mode, src);
2857 if (!src)
2858 return false;
2860 /* Make sure that the address is legitimate. */
2861 if (!aarch64_sve_ld1r_operand_p (src))
2863 rtx addr = force_reg (Pmode, XEXP (src, 0));
2864 src = replace_equiv_address (src, addr);
2867 machine_mode mode = GET_MODE (dest);
2868 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2869 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2870 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2871 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2872 emit_insn (gen_rtx_SET (dest, src));
2873 return true;
2876 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2877 isn't a simple duplicate or series. */
2879 static void
2880 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2882 machine_mode mode = GET_MODE (src);
2883 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2884 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2885 gcc_assert (npatterns > 1);
2887 if (nelts_per_pattern == 1)
2889 /* The constant is a repeating sequence of at least two elements,
2890 where the repeating elements occupy no more than 128 bits.
2891 Get an integer representation of the replicated value. */
2892 scalar_int_mode int_mode;
2893 if (BYTES_BIG_ENDIAN)
2894 /* For now, always use LD1RQ to load the value on big-endian
2895 targets, since the handling of smaller integers includes a
2896 subreg that is semantically an element reverse. */
2897 int_mode = TImode;
2898 else
2900 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2901 gcc_assert (int_bits <= 128);
2902 int_mode = int_mode_for_size (int_bits, 0).require ();
2904 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2905 if (int_value
2906 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2907 return;
2910 /* Expand each pattern individually. */
2911 rtx_vector_builder builder;
2912 auto_vec<rtx, 16> vectors (npatterns);
2913 for (unsigned int i = 0; i < npatterns; ++i)
2915 builder.new_vector (mode, 1, nelts_per_pattern);
2916 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2917 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2918 vectors.quick_push (force_reg (mode, builder.build ()));
2921 /* Use permutes to interleave the separate vectors. */
2922 while (npatterns > 1)
2924 npatterns /= 2;
2925 for (unsigned int i = 0; i < npatterns; ++i)
2927 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2928 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2929 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2930 vectors[i] = tmp;
2933 gcc_assert (vectors[0] == dest);
2936 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2937 is a pattern that can be used to set DEST to a replicated scalar
2938 element. */
2940 void
2941 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2942 rtx (*gen_vec_duplicate) (rtx, rtx))
2944 machine_mode mode = GET_MODE (dest);
2946 /* Check on what type of symbol it is. */
2947 scalar_int_mode int_mode;
2948 if ((GET_CODE (imm) == SYMBOL_REF
2949 || GET_CODE (imm) == LABEL_REF
2950 || GET_CODE (imm) == CONST
2951 || GET_CODE (imm) == CONST_POLY_INT)
2952 && is_a <scalar_int_mode> (mode, &int_mode))
2954 rtx mem;
2955 poly_int64 offset;
2956 HOST_WIDE_INT const_offset;
2957 enum aarch64_symbol_type sty;
2959 /* If we have (const (plus symbol offset)), separate out the offset
2960 before we start classifying the symbol. */
2961 rtx base = strip_offset (imm, &offset);
2963 /* We must always add an offset involving VL separately, rather than
2964 folding it into the relocation. */
2965 if (!offset.is_constant (&const_offset))
2967 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2968 emit_insn (gen_rtx_SET (dest, imm));
2969 else
2971 /* Do arithmetic on 32-bit values if the result is smaller
2972 than that. */
2973 if (partial_subreg_p (int_mode, SImode))
2975 /* It is invalid to do symbol calculations in modes
2976 narrower than SImode. */
2977 gcc_assert (base == const0_rtx);
2978 dest = gen_lowpart (SImode, dest);
2979 int_mode = SImode;
2981 if (base != const0_rtx)
2983 base = aarch64_force_temporary (int_mode, dest, base);
2984 aarch64_add_offset (int_mode, dest, base, offset,
2985 NULL_RTX, NULL_RTX, false);
2987 else
2988 aarch64_add_offset (int_mode, dest, base, offset,
2989 dest, NULL_RTX, false);
2991 return;
2994 sty = aarch64_classify_symbol (base, const_offset);
2995 switch (sty)
2997 case SYMBOL_FORCE_TO_MEM:
2998 if (const_offset != 0
2999 && targetm.cannot_force_const_mem (int_mode, imm))
3001 gcc_assert (can_create_pseudo_p ());
3002 base = aarch64_force_temporary (int_mode, dest, base);
3003 aarch64_add_offset (int_mode, dest, base, const_offset,
3004 NULL_RTX, NULL_RTX, false);
3005 return;
3008 mem = force_const_mem (ptr_mode, imm);
3009 gcc_assert (mem);
3011 /* If we aren't generating PC relative literals, then
3012 we need to expand the literal pool access carefully.
3013 This is something that needs to be done in a number
3014 of places, so could well live as a separate function. */
3015 if (!aarch64_pcrelative_literal_loads)
3017 gcc_assert (can_create_pseudo_p ());
3018 base = gen_reg_rtx (ptr_mode);
3019 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3020 if (ptr_mode != Pmode)
3021 base = convert_memory_address (Pmode, base);
3022 mem = gen_rtx_MEM (ptr_mode, base);
3025 if (int_mode != ptr_mode)
3026 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3028 emit_insn (gen_rtx_SET (dest, mem));
3030 return;
3032 case SYMBOL_SMALL_TLSGD:
3033 case SYMBOL_SMALL_TLSDESC:
3034 case SYMBOL_SMALL_TLSIE:
3035 case SYMBOL_SMALL_GOT_28K:
3036 case SYMBOL_SMALL_GOT_4G:
3037 case SYMBOL_TINY_GOT:
3038 case SYMBOL_TINY_TLSIE:
3039 if (const_offset != 0)
3041 gcc_assert (can_create_pseudo_p ());
3042 base = aarch64_force_temporary (int_mode, dest, base);
3043 aarch64_add_offset (int_mode, dest, base, const_offset,
3044 NULL_RTX, NULL_RTX, false);
3045 return;
3047 /* FALLTHRU */
3049 case SYMBOL_SMALL_ABSOLUTE:
3050 case SYMBOL_TINY_ABSOLUTE:
3051 case SYMBOL_TLSLE12:
3052 case SYMBOL_TLSLE24:
3053 case SYMBOL_TLSLE32:
3054 case SYMBOL_TLSLE48:
3055 aarch64_load_symref_appropriately (dest, imm, sty);
3056 return;
3058 default:
3059 gcc_unreachable ();
3063 if (!CONST_INT_P (imm))
3065 rtx base, step, value;
3066 if (GET_CODE (imm) == HIGH
3067 || aarch64_simd_valid_immediate (imm, NULL))
3068 emit_insn (gen_rtx_SET (dest, imm));
3069 else if (const_vec_series_p (imm, &base, &step))
3070 aarch64_expand_vec_series (dest, base, step);
3071 else if (const_vec_duplicate_p (imm, &value))
3073 /* If the constant is out of range of an SVE vector move,
3074 load it from memory if we can, otherwise move it into
3075 a register and use a DUP. */
3076 scalar_mode inner_mode = GET_MODE_INNER (mode);
3077 rtx op = force_const_mem (inner_mode, value);
3078 if (!op)
3079 op = force_reg (inner_mode, value);
3080 else if (!aarch64_sve_ld1r_operand_p (op))
3082 rtx addr = force_reg (Pmode, XEXP (op, 0));
3083 op = replace_equiv_address (op, addr);
3085 emit_insn (gen_vec_duplicate (dest, op));
3087 else if (GET_CODE (imm) == CONST_VECTOR
3088 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3089 aarch64_expand_sve_const_vector (dest, imm);
3090 else
3092 rtx mem = force_const_mem (mode, imm);
3093 gcc_assert (mem);
3094 emit_move_insn (dest, mem);
3097 return;
3100 aarch64_internal_mov_immediate (dest, imm, true,
3101 as_a <scalar_int_mode> (mode));
3104 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3105 that is known to contain PTRUE. */
3107 void
3108 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3110 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3111 gen_rtvec (2, pred, src),
3112 UNSPEC_MERGE_PTRUE)));
3115 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3116 operand is in memory. In this case we need to use the predicated LD1
3117 and ST1 instead of LDR and STR, both for correctness on big-endian
3118 targets and because LD1 and ST1 support a wider range of addressing modes.
3119 PRED_MODE is the mode of the predicate.
3121 See the comment at the head of aarch64-sve.md for details about the
3122 big-endian handling. */
3124 void
3125 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3127 machine_mode mode = GET_MODE (dest);
3128 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3129 if (!register_operand (src, mode)
3130 && !register_operand (dest, mode))
3132 rtx tmp = gen_reg_rtx (mode);
3133 if (MEM_P (src))
3134 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3135 else
3136 emit_move_insn (tmp, src);
3137 src = tmp;
3139 aarch64_emit_sve_pred_move (dest, ptrue, src);
3142 /* Called only on big-endian targets. See whether an SVE vector move
3143 from SRC to DEST is effectively a REV[BHW] instruction, because at
3144 least one operand is a subreg of an SVE vector that has wider or
3145 narrower elements. Return true and emit the instruction if so.
3147 For example:
3149 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3151 represents a VIEW_CONVERT between the following vectors, viewed
3152 in memory order:
3154 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3155 R1: { [0], [1], [2], [3], ... }
3157 The high part of lane X in R2 should therefore correspond to lane X*2
3158 of R1, but the register representations are:
3160 msb lsb
3161 R2: ...... [1].high [1].low [0].high [0].low
3162 R1: ...... [3] [2] [1] [0]
3164 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3165 We therefore need a reverse operation to swap the high and low values
3166 around.
3168 This is purely an optimization. Without it we would spill the
3169 subreg operand to the stack in one mode and reload it in the
3170 other mode, which has the same effect as the REV. */
3172 bool
3173 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3175 gcc_assert (BYTES_BIG_ENDIAN);
3176 if (GET_CODE (dest) == SUBREG)
3177 dest = SUBREG_REG (dest);
3178 if (GET_CODE (src) == SUBREG)
3179 src = SUBREG_REG (src);
3181 /* The optimization handles two single SVE REGs with different element
3182 sizes. */
3183 if (!REG_P (dest)
3184 || !REG_P (src)
3185 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3186 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3187 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3188 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3189 return false;
3191 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3192 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3193 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3194 UNSPEC_REV_SUBREG);
3195 emit_insn (gen_rtx_SET (dest, unspec));
3196 return true;
3199 /* Return a copy of X with mode MODE, without changing its other
3200 attributes. Unlike gen_lowpart, this doesn't care whether the
3201 mode change is valid. */
3203 static rtx
3204 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3206 if (GET_MODE (x) == mode)
3207 return x;
3209 x = shallow_copy_rtx (x);
3210 set_mode_and_regno (x, mode, REGNO (x));
3211 return x;
3214 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3215 operands. */
3217 void
3218 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3220 /* Decide which REV operation we need. The mode with narrower elements
3221 determines the mode of the operands and the mode with the wider
3222 elements determines the reverse width. */
3223 machine_mode mode_with_wider_elts = GET_MODE (dest);
3224 machine_mode mode_with_narrower_elts = GET_MODE (src);
3225 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3226 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3227 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3229 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3230 unsigned int unspec;
3231 if (wider_bytes == 8)
3232 unspec = UNSPEC_REV64;
3233 else if (wider_bytes == 4)
3234 unspec = UNSPEC_REV32;
3235 else if (wider_bytes == 2)
3236 unspec = UNSPEC_REV16;
3237 else
3238 gcc_unreachable ();
3239 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3241 /* Emit:
3243 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3244 UNSPEC_MERGE_PTRUE))
3246 with the appropriate modes. */
3247 ptrue = gen_lowpart (pred_mode, ptrue);
3248 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3249 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3250 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3251 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3252 UNSPEC_MERGE_PTRUE);
3253 emit_insn (gen_rtx_SET (dest, src));
3256 static bool
3257 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3258 tree exp ATTRIBUTE_UNUSED)
3260 /* Currently, always true. */
3261 return true;
3264 /* Implement TARGET_PASS_BY_REFERENCE. */
3266 static bool
3267 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3268 machine_mode mode,
3269 const_tree type,
3270 bool named ATTRIBUTE_UNUSED)
3272 HOST_WIDE_INT size;
3273 machine_mode dummymode;
3274 int nregs;
3276 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3277 if (mode == BLKmode && type)
3278 size = int_size_in_bytes (type);
3279 else
3280 /* No frontends can create types with variable-sized modes, so we
3281 shouldn't be asked to pass or return them. */
3282 size = GET_MODE_SIZE (mode).to_constant ();
3284 /* Aggregates are passed by reference based on their size. */
3285 if (type && AGGREGATE_TYPE_P (type))
3287 size = int_size_in_bytes (type);
3290 /* Variable-sized arguments are always passed by reference. */
3291 if (size < 0)
3292 return true;
3294 /* Can this be a candidate to be passed in fp/simd register(s)? */
3295 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3296 &dummymode, &nregs,
3297 NULL))
3298 return false;
3300 /* Arguments which are variable sized or larger than 2 registers are
3301 passed by reference unless they are a homogeneous floating-point
3302 aggregate. */
3303 return size > 2 * UNITS_PER_WORD;
3306 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3307 static bool
3308 aarch64_return_in_msb (const_tree valtype)
3310 machine_mode dummy_mode;
3311 int dummy_int;
3313 /* Never happens in little-endian mode. */
3314 if (!BYTES_BIG_ENDIAN)
3315 return false;
3317 /* Only composite types smaller than or equal to 16 bytes can
3318 be potentially returned in registers. */
3319 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3320 || int_size_in_bytes (valtype) <= 0
3321 || int_size_in_bytes (valtype) > 16)
3322 return false;
3324 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3325 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3326 is always passed/returned in the least significant bits of fp/simd
3327 register(s). */
3328 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3329 &dummy_mode, &dummy_int, NULL))
3330 return false;
3332 return true;
3335 /* Implement TARGET_FUNCTION_VALUE.
3336 Define how to find the value returned by a function. */
3338 static rtx
3339 aarch64_function_value (const_tree type, const_tree func,
3340 bool outgoing ATTRIBUTE_UNUSED)
3342 machine_mode mode;
3343 int unsignedp;
3344 int count;
3345 machine_mode ag_mode;
3347 mode = TYPE_MODE (type);
3348 if (INTEGRAL_TYPE_P (type))
3349 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3351 if (aarch64_return_in_msb (type))
3353 HOST_WIDE_INT size = int_size_in_bytes (type);
3355 if (size % UNITS_PER_WORD != 0)
3357 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3358 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3362 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3363 &ag_mode, &count, NULL))
3365 if (!aarch64_composite_type_p (type, mode))
3367 gcc_assert (count == 1 && mode == ag_mode);
3368 return gen_rtx_REG (mode, V0_REGNUM);
3370 else
3372 int i;
3373 rtx par;
3375 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3376 for (i = 0; i < count; i++)
3378 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3379 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3380 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3381 XVECEXP (par, 0, i) = tmp;
3383 return par;
3386 else
3387 return gen_rtx_REG (mode, R0_REGNUM);
3390 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3391 Return true if REGNO is the number of a hard register in which the values
3392 of a called function may come back. */
3394 static bool
3395 aarch64_function_value_regno_p (const unsigned int regno)
3397 /* Maximum of 16 bytes can be returned in the general registers. Examples
3398 of 16-byte return values are: 128-bit integers and 16-byte small
3399 structures (excluding homogeneous floating-point aggregates). */
3400 if (regno == R0_REGNUM || regno == R1_REGNUM)
3401 return true;
3403 /* Up to four fp/simd registers can return a function value, e.g. a
3404 homogeneous floating-point aggregate having four members. */
3405 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3406 return TARGET_FLOAT;
3408 return false;
3411 /* Implement TARGET_RETURN_IN_MEMORY.
3413 If the type T of the result of a function is such that
3414 void func (T arg)
3415 would require that arg be passed as a value in a register (or set of
3416 registers) according to the parameter passing rules, then the result
3417 is returned in the same registers as would be used for such an
3418 argument. */
3420 static bool
3421 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3423 HOST_WIDE_INT size;
3424 machine_mode ag_mode;
3425 int count;
3427 if (!AGGREGATE_TYPE_P (type)
3428 && TREE_CODE (type) != COMPLEX_TYPE
3429 && TREE_CODE (type) != VECTOR_TYPE)
3430 /* Simple scalar types are always returned in registers. */
3431 return false;
3433 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3434 type,
3435 &ag_mode,
3436 &count,
3437 NULL))
3438 return false;
3440 /* Types larger than 2 registers are returned in memory. */
3441 size = int_size_in_bytes (type);
3442 return (size < 0 || size > 2 * UNITS_PER_WORD);
3445 static bool
3446 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3447 const_tree type, int *nregs)
3449 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3450 return aarch64_vfp_is_call_or_return_candidate (mode,
3451 type,
3452 &pcum->aapcs_vfp_rmode,
3453 nregs,
3454 NULL);
3457 /* Given MODE and TYPE of a function argument, return the alignment in
3458 bits. The idea is to suppress any stronger alignment requested by
3459 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3460 This is a helper function for local use only. */
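/* For instance (an illustrative sketch): given
   struct __attribute__ ((aligned (16))) s { int x; };
   the struct-level attribute is ignored here and the result is the 32-bit
   natural alignment of the int field.  */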
3462 static unsigned int
3463 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3465 if (!type)
3466 return GET_MODE_ALIGNMENT (mode);
3468 if (integer_zerop (TYPE_SIZE (type)))
3469 return 0;
3471 gcc_assert (TYPE_MODE (type) == mode);
3473 if (!AGGREGATE_TYPE_P (type))
3474 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3476 if (TREE_CODE (type) == ARRAY_TYPE)
3477 return TYPE_ALIGN (TREE_TYPE (type));
3479 unsigned int alignment = 0;
3480 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3481 if (TREE_CODE (field) == FIELD_DECL)
3482 alignment = std::max (alignment, DECL_ALIGN (field));
3484 return alignment;
3487 /* Layout a function argument according to the AAPCS64 rules. The rule
3488 numbers refer to the rule numbers in the AAPCS64. */
3490 static void
3491 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3492 const_tree type,
3493 bool named ATTRIBUTE_UNUSED)
3495 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3496 int ncrn, nvrn, nregs;
3497 bool allocate_ncrn, allocate_nvrn;
3498 HOST_WIDE_INT size;
3500 /* We need to do this once per argument. */
3501 if (pcum->aapcs_arg_processed)
3502 return;
3504 pcum->aapcs_arg_processed = true;
3506 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3507 if (type)
3508 size = int_size_in_bytes (type);
3509 else
3510 /* No frontends can create types with variable-sized modes, so we
3511 shouldn't be asked to pass or return them. */
3512 size = GET_MODE_SIZE (mode).to_constant ();
3513 size = ROUND_UP (size, UNITS_PER_WORD);
3515 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3516 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3517 mode,
3518 type,
3519 &nregs);
3521 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3522 The following code thus handles passing by SIMD/FP registers first. */
3524 nvrn = pcum->aapcs_nvrn;
3526 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3527 and homogeneous short-vector aggregates (HVA). */
3528 if (allocate_nvrn)
3530 if (!TARGET_FLOAT)
3531 aarch64_err_no_fpadvsimd (mode);
3533 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3535 pcum->aapcs_nextnvrn = nvrn + nregs;
3536 if (!aarch64_composite_type_p (type, mode))
3538 gcc_assert (nregs == 1);
3539 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3541 else
3543 rtx par;
3544 int i;
3545 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3546 for (i = 0; i < nregs; i++)
3548 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3549 V0_REGNUM + nvrn + i);
3550 rtx offset = gen_int_mode
3551 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3552 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3553 XVECEXP (par, 0, i) = tmp;
3555 pcum->aapcs_reg = par;
3557 return;
3559 else
3561 /* C.3 NSRN is set to 8. */
3562 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3563 goto on_stack;
3567 ncrn = pcum->aapcs_ncrn;
3568 nregs = size / UNITS_PER_WORD;
3570 /* C6 - C9, though the sign and zero extension semantics are
3571 handled elsewhere. This is the case where the argument fits
3572 entirely in general registers. */
3573 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3576 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3578 /* C.8 if the argument has an alignment of 16 then the NGRN is
3579 rounded up to the next even number. */
3580 if (nregs == 2
3581 && ncrn % 2
3582 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3583 comparison is there because for > 16 * BITS_PER_UNIT
3584 alignment nregs should be > 2 and therefore it should be
3585 passed by reference rather than value. */
3586 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3588 ++ncrn;
3589 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
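/* A hedged example of rule C.8: for "void f (int a, __int128 b)", A
   occupies the first GP argument register, so the NGRN is 1 when B is laid
   out; B requires 16-byte alignment, so the NGRN is rounded up to 2 and B
   is passed in x2 and x3.  */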
3592 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3593 A reg is still generated for it, but the caller should be smart
3594 enough not to use it. */
3595 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3596 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3597 else
3599 rtx par;
3600 int i;
3602 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3603 for (i = 0; i < nregs; i++)
3605 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3606 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3607 GEN_INT (i * UNITS_PER_WORD));
3608 XVECEXP (par, 0, i) = tmp;
3610 pcum->aapcs_reg = par;
3613 pcum->aapcs_nextncrn = ncrn + nregs;
3614 return;
3617 /* C.11 */
3618 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3620 /* The argument is passed on stack; record the needed number of words for
3621 this argument and align the total size if necessary. */
3622 on_stack:
3623 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3625 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3626 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3627 16 / UNITS_PER_WORD);
3628 return;
3631 /* Implement TARGET_FUNCTION_ARG. */
3633 static rtx
3634 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3635 const_tree type, bool named)
3637 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3638 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3640 if (mode == VOIDmode)
3641 return NULL_RTX;
3643 aarch64_layout_arg (pcum_v, mode, type, named);
3644 return pcum->aapcs_reg;
3647 void
3648 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3649 const_tree fntype ATTRIBUTE_UNUSED,
3650 rtx libname ATTRIBUTE_UNUSED,
3651 const_tree fndecl ATTRIBUTE_UNUSED,
3652 unsigned n_named ATTRIBUTE_UNUSED)
3654 pcum->aapcs_ncrn = 0;
3655 pcum->aapcs_nvrn = 0;
3656 pcum->aapcs_nextncrn = 0;
3657 pcum->aapcs_nextnvrn = 0;
3658 pcum->pcs_variant = ARM_PCS_AAPCS64;
3659 pcum->aapcs_reg = NULL_RTX;
3660 pcum->aapcs_arg_processed = false;
3661 pcum->aapcs_stack_words = 0;
3662 pcum->aapcs_stack_size = 0;
3664 if (!TARGET_FLOAT
3665 && fndecl && TREE_PUBLIC (fndecl)
3666 && fntype && fntype != error_mark_node)
3668 const_tree type = TREE_TYPE (fntype);
3669 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3670 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3671 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3672 &mode, &nregs, NULL))
3673 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3675 return;
3678 static void
3679 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3680 machine_mode mode,
3681 const_tree type,
3682 bool named)
3684 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3685 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3687 aarch64_layout_arg (pcum_v, mode, type, named);
3688 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3689 != (pcum->aapcs_stack_words != 0));
3690 pcum->aapcs_arg_processed = false;
3691 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3692 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3693 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3694 pcum->aapcs_stack_words = 0;
3695 pcum->aapcs_reg = NULL_RTX;
3699 bool
3700 aarch64_function_arg_regno_p (unsigned regno)
3702 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3703 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3706 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3707 PARM_BOUNDARY bits of alignment, but will be given anything up
3708 to STACK_BOUNDARY bits if the type requires it. This makes sure
3709 that both before and after the layout of each argument, the Next
3710 Stacked Argument Address (NSAA) will have a minimum alignment of
3711 8 bytes. */
3713 static unsigned int
3714 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3716 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3717 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3720 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3722 static fixed_size_mode
3723 aarch64_get_reg_raw_mode (int regno)
3725 if (TARGET_SVE && FP_REGNUM_P (regno))
3726 /* Don't use the SVE part of the register for __builtin_apply and
3727 __builtin_return. The SVE registers aren't used by the normal PCS,
3728 so using them there would be a waste of time. The PCS extensions
3729 for SVE types are fundamentally incompatible with the
3730 __builtin_return/__builtin_apply interface. */
3731 return as_a <fixed_size_mode> (V16QImode);
3732 return default_get_reg_raw_mode (regno);
3735 /* Implement TARGET_FUNCTION_ARG_PADDING.
3737 Small aggregate types are placed in the lowest memory address.
3739 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3741 static pad_direction
3742 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3744 /* On little-endian targets, the least significant byte of every stack
3745 argument is passed at the lowest byte address of the stack slot. */
3746 if (!BYTES_BIG_ENDIAN)
3747 return PAD_UPWARD;
3749 /* Otherwise, integral, floating-point and pointer types are padded downward:
3750 the least significant byte of a stack argument is passed at the highest
3751 byte address of the stack slot. */
3752 if (type
3753 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3754 || POINTER_TYPE_P (type))
3755 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3756 return PAD_DOWNWARD;
3758 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3759 return PAD_UPWARD;
3762 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3764 It specifies padding for the last (may also be the only)
3765 element of a block move between registers and memory.
3766 Assuming the block is in memory, padding upward means that
3767 the last element is padded after its most significant byte,
3768 while with downward padding the last element is padded on
3769 its least significant byte side.
3771 Small aggregates and small complex types are always padded
3772 upwards.
3774 We don't need to worry about homogeneous floating-point or
3775 short-vector aggregates; their move is not affected by the
3776 padding direction determined here. Regardless of endianness,
3777 each element of such an aggregate is put in the least
3778 significant bits of a fp/simd register.
3780 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3781 register has useful data, and return the opposite if the most
3782 significant byte does. */
3784 bool
3785 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3786 bool first ATTRIBUTE_UNUSED)
3789 /* Small composite types are always padded upward. */
3790 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3792 HOST_WIDE_INT size;
3793 if (type)
3794 size = int_size_in_bytes (type);
3795 else
3796 /* No frontends can create types with variable-sized modes, so we
3797 shouldn't be asked to pass or return them. */
3798 size = GET_MODE_SIZE (mode).to_constant ();
3799 if (size < 2 * UNITS_PER_WORD)
3800 return true;
3803 /* Otherwise, use the default padding. */
3804 return !BYTES_BIG_ENDIAN;
3807 static scalar_int_mode
3808 aarch64_libgcc_cmp_return_mode (void)
3810 return SImode;
3813 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3815 /* We use the 12-bit shifted immediate arithmetic instructions so values
3816 must be multiple of (1 << 12), i.e. 4096. */
3817 #define ARITH_FACTOR 4096
3819 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3820 #error Cannot use simple address calculation for stack probing
3821 #endif
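/* For example, with the common STACK_CHECK_PROBE_INTERVAL_EXP of 12,
   PROBE_INTERVAL is 4096 and trivially a multiple of ARITH_FACTOR; any
   larger power-of-two interval also passes the check above, so probe
   addresses can always be formed with 12-bit shifted immediates.  */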
3823 /* The pair of scratch registers used for stack probing. */
3824 #define PROBE_STACK_FIRST_REG 9
3825 #define PROBE_STACK_SECOND_REG 10
3827 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3828 inclusive. These are offsets from the current stack pointer. */
3830 static void
3831 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3833 HOST_WIDE_INT size;
3834 if (!poly_size.is_constant (&size))
3836 sorry ("stack probes for SVE frames");
3837 return;
3840 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3842 /* See the same assertion on PROBE_INTERVAL above. */
3843 gcc_assert ((first % ARITH_FACTOR) == 0);
3845 /* See if we have a constant small number of probes to generate. If so,
3846 that's the easy case. */
3847 if (size <= PROBE_INTERVAL)
3849 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3851 emit_set_insn (reg1,
3852 plus_constant (Pmode,
3853 stack_pointer_rtx, -(first + base)));
3854 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3857 /* The run-time loop is made up of 8 insns in the generic case while the
3858 compile-time loop is made up of 4+2*(n-2) insns, where n is the number of intervals. */
3859 else if (size <= 4 * PROBE_INTERVAL)
3861 HOST_WIDE_INT i, rem;
3863 emit_set_insn (reg1,
3864 plus_constant (Pmode,
3865 stack_pointer_rtx,
3866 -(first + PROBE_INTERVAL)));
3867 emit_stack_probe (reg1);
3869 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3870 it exceeds SIZE. If only two probes are needed, this will not
3871 generate any code. Then probe at FIRST + SIZE. */
3872 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3874 emit_set_insn (reg1,
3875 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3876 emit_stack_probe (reg1);
3879 rem = size - (i - PROBE_INTERVAL);
3880 if (rem > 256)
3882 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3884 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3885 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3887 else
3888 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3891 /* Otherwise, do the same as above, but in a loop. Note that we must be
3892 extra careful with variables wrapping around because we might be at
3893 the very top (or the very bottom) of the address space and we have
3894 to be able to handle this case properly; in particular, we use an
3895 equality test for the loop condition. */
3896 else
3898 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3900 /* Step 1: round SIZE to the previous multiple of the interval. */
3902 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3905 /* Step 2: compute initial and final value of the loop counter. */
3907 /* TEST_ADDR = SP + FIRST. */
3908 emit_set_insn (reg1,
3909 plus_constant (Pmode, stack_pointer_rtx, -first));
3911 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3912 HOST_WIDE_INT adjustment = - (first + rounded_size);
3913 if (! aarch64_uimm12_shift (adjustment))
3915 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3916 true, Pmode);
3917 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3919 else
3920 emit_set_insn (reg2,
3921 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3923 /* Step 3: the loop
3927 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3928 probe at TEST_ADDR
3930 while (TEST_ADDR != LAST_ADDR)
3932 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3933 until it is equal to ROUNDED_SIZE. */
3935 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3938 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3939 that SIZE is equal to ROUNDED_SIZE. */
3941 if (size != rounded_size)
3943 HOST_WIDE_INT rem = size - rounded_size;
3945 if (rem > 256)
3947 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3949 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3950 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3952 else
3953 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3957 /* Make sure nothing is scheduled before we are done. */
3958 emit_insn (gen_blockage ());
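/* Rough sketch of the small-frame case above (editorial example): with
   FIRST == 4096 and SIZE == 4096, BASE rounds up to 4096, reg1 is set
   to SP - 8192 with a single shifted-immediate SUB, and one probe (a
   store of xzr) is emitted at that address.  The larger cases repeat
   this per PROBE_INTERVAL or fall back to the run-time loop below.  */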
3961 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3962 absolute addresses. */
3964 const char *
3965 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3967 static int labelno = 0;
3968 char loop_lab[32];
3969 rtx xops[2];
3971 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3973 /* Loop. */
3974 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3976 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3977 xops[0] = reg1;
3978 xops[1] = GEN_INT (PROBE_INTERVAL);
3979 output_asm_insn ("sub\t%0, %0, %1", xops);
3981 /* Probe at TEST_ADDR. */
3982 output_asm_insn ("str\txzr, [%0]", xops);
3984 /* Test if TEST_ADDR == LAST_ADDR. */
3985 xops[1] = reg2;
3986 output_asm_insn ("cmp\t%0, %1", xops);
3988 /* Branch. */
3989 fputs ("\tb.ne\t", asm_out_file);
3990 assemble_name_raw (asm_out_file, loop_lab);
3991 fputc ('\n', asm_out_file);
3993 return "";
3996 /* Determine whether a frame chain needs to be generated. */
3997 static bool
3998 aarch64_needs_frame_chain (void)
4000 /* Force a frame chain for EH returns so the return address is at FP+8. */
4001 if (frame_pointer_needed || crtl->calls_eh_return)
4002 return true;
4004 /* A leaf function cannot have calls or write LR. */
4005 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4007 /* Don't use a frame chain in leaf functions if leaf frame pointers
4008 are disabled. */
4009 if (flag_omit_leaf_frame_pointer && is_leaf)
4010 return false;
4012 return aarch64_use_frame_pointer;
4015 /* Mark the registers that need to be saved by the callee and calculate
4016 the size of the callee-saved registers area and frame record (both FP
4017 and LR may be omitted). */
4018 static void
4019 aarch64_layout_frame (void)
4021 HOST_WIDE_INT offset = 0;
4022 int regno, last_fp_reg = INVALID_REGNUM;
4024 if (reload_completed && cfun->machine->frame.laid_out)
4025 return;
4027 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4029 #define SLOT_NOT_REQUIRED (-2)
4030 #define SLOT_REQUIRED (-1)
4032 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4033 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4035 /* First mark all the registers that really need to be saved... */
4036 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4037 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4039 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4040 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4042 /* ... that includes the eh data registers (if needed)... */
4043 if (crtl->calls_eh_return)
4044 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4045 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4046 = SLOT_REQUIRED;
4048 /* ... and any callee saved register that dataflow says is live. */
4049 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4050 if (df_regs_ever_live_p (regno)
4051 && (regno == R30_REGNUM
4052 || !call_used_regs[regno]))
4053 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4055 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4056 if (df_regs_ever_live_p (regno)
4057 && !call_used_regs[regno])
4059 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4060 last_fp_reg = regno;
4063 if (cfun->machine->frame.emit_frame_chain)
4065 /* FP and LR are placed in the linkage record. */
4066 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4067 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4068 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4069 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4070 offset = 2 * UNITS_PER_WORD;
4073 /* Now assign stack slots for them. */
4074 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4075 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4077 cfun->machine->frame.reg_offset[regno] = offset;
4078 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4079 cfun->machine->frame.wb_candidate1 = regno;
4080 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4081 cfun->machine->frame.wb_candidate2 = regno;
4082 offset += UNITS_PER_WORD;
4085 HOST_WIDE_INT max_int_offset = offset;
4086 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4087 bool has_align_gap = offset != max_int_offset;
4089 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4090 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4092 /* If there is an alignment gap between integer and fp callee-saves,
4093 allocate the last fp register to it if possible. */
4094 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4096 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4097 break;
4100 cfun->machine->frame.reg_offset[regno] = offset;
4101 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4102 cfun->machine->frame.wb_candidate1 = regno;
4103 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4104 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4105 cfun->machine->frame.wb_candidate2 = regno;
4106 offset += UNITS_PER_WORD;
4109 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4111 cfun->machine->frame.saved_regs_size = offset;
4113 HOST_WIDE_INT varargs_and_saved_regs_size
4114 = offset + cfun->machine->frame.saved_varargs_size;
4116 cfun->machine->frame.hard_fp_offset
4117 = aligned_upper_bound (varargs_and_saved_regs_size
4118 + get_frame_size (),
4119 STACK_BOUNDARY / BITS_PER_UNIT);
4121 /* Both these values are already aligned. */
4122 gcc_assert (multiple_p (crtl->outgoing_args_size,
4123 STACK_BOUNDARY / BITS_PER_UNIT));
4124 cfun->machine->frame.frame_size
4125 = (cfun->machine->frame.hard_fp_offset
4126 + crtl->outgoing_args_size);
4128 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4130 cfun->machine->frame.initial_adjust = 0;
4131 cfun->machine->frame.final_adjust = 0;
4132 cfun->machine->frame.callee_adjust = 0;
4133 cfun->machine->frame.callee_offset = 0;
4135 HOST_WIDE_INT max_push_offset = 0;
4136 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4137 max_push_offset = 512;
4138 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4139 max_push_offset = 256;
4141 HOST_WIDE_INT const_size, const_fp_offset;
4142 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4143 && const_size < max_push_offset
4144 && known_eq (crtl->outgoing_args_size, 0))
4146 /* Simple, small frame with no outgoing arguments:
4147 stp reg1, reg2, [sp, -frame_size]!
4148 stp reg3, reg4, [sp, 16] */
4149 cfun->machine->frame.callee_adjust = const_size;
4151 else if (known_lt (crtl->outgoing_args_size
4152 + cfun->machine->frame.saved_regs_size, 512)
4153 && !(cfun->calls_alloca
4154 && known_lt (cfun->machine->frame.hard_fp_offset,
4155 max_push_offset)))
4157 /* Frame with small outgoing arguments:
4158 sub sp, sp, frame_size
4159 stp reg1, reg2, [sp, outgoing_args_size]
4160 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4161 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4162 cfun->machine->frame.callee_offset
4163 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4165 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4166 && const_fp_offset < max_push_offset)
4168 /* Frame with large outgoing arguments but a small local area:
4169 stp reg1, reg2, [sp, -hard_fp_offset]!
4170 stp reg3, reg4, [sp, 16]
4171 sub sp, sp, outgoing_args_size */
4172 cfun->machine->frame.callee_adjust = const_fp_offset;
4173 cfun->machine->frame.final_adjust
4174 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4176 else
4178 /* Frame with large local area and outgoing arguments using frame pointer:
4179 sub sp, sp, hard_fp_offset
4180 stp x29, x30, [sp, 0]
4181 add x29, sp, 0
4182 stp reg3, reg4, [sp, 16]
4183 sub sp, sp, outgoing_args_size */
4184 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4185 cfun->machine->frame.final_adjust
4186 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4189 cfun->machine->frame.laid_out = true;
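/* Editorial example of the selection above: a function with 16 bytes of
   locals that saves only x29/x30 and has no outgoing arguments gets
   frame_size == 32 < 512, so it takes the first case and the whole
   frame is allocated by a single "stp x29, x30, [sp, -32]!".  If the
   same function also needed 1024 bytes of outgoing arguments, it would
   fall into the third case instead: push x29/x30 with writeback, then
   drop SP by the outgoing-argument area in a separate final adjust.  */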
4192 /* Return true if the register REGNO is saved on entry to
4193 the current function. */
4195 static bool
4196 aarch64_register_saved_on_entry (int regno)
4198 return cfun->machine->frame.reg_offset[regno] >= 0;
4201 /* Return the next register, at or after REGNO and up to LIMIT, that needs
4202 to be saved by the callee. */
4204 static unsigned
4205 aarch64_next_callee_save (unsigned regno, unsigned limit)
4207 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4208 regno ++;
4209 return regno;
4212 /* Push the register number REGNO of mode MODE to the stack with write-back
4213 adjusting the stack by ADJUSTMENT. */
4215 static void
4216 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4217 HOST_WIDE_INT adjustment)
4219 rtx base_rtx = stack_pointer_rtx;
4220 rtx insn, reg, mem;
4222 reg = gen_rtx_REG (mode, regno);
4223 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4224 plus_constant (Pmode, base_rtx, -adjustment));
4225 mem = gen_frame_mem (mode, mem);
4227 insn = emit_move_insn (mem, reg);
4228 RTX_FRAME_RELATED_P (insn) = 1;
4231 /* Generate and return an instruction to store the pair of registers
4232 REG and REG2 of mode MODE to location BASE with write-back adjusting
4233 the stack location BASE by ADJUSTMENT. */
4235 static rtx
4236 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4237 HOST_WIDE_INT adjustment)
4239 switch (mode)
4241 case E_DImode:
4242 return gen_storewb_pairdi_di (base, base, reg, reg2,
4243 GEN_INT (-adjustment),
4244 GEN_INT (UNITS_PER_WORD - adjustment));
4245 case E_DFmode:
4246 return gen_storewb_pairdf_di (base, base, reg, reg2,
4247 GEN_INT (-adjustment),
4248 GEN_INT (UNITS_PER_WORD - adjustment));
4249 default:
4250 gcc_unreachable ();
4254 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4255 stack pointer by ADJUSTMENT. */
4257 static void
4258 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4260 rtx_insn *insn;
4261 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4263 if (regno2 == INVALID_REGNUM)
4264 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4266 rtx reg1 = gen_rtx_REG (mode, regno1);
4267 rtx reg2 = gen_rtx_REG (mode, regno2);
4269 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4270 reg2, adjustment));
4271 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4272 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4273 RTX_FRAME_RELATED_P (insn) = 1;
4276 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
4277 adjusting it by ADJUSTMENT afterwards. */
4279 static rtx
4280 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4281 HOST_WIDE_INT adjustment)
4283 switch (mode)
4285 case E_DImode:
4286 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4287 GEN_INT (UNITS_PER_WORD));
4288 case E_DFmode:
4289 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4290 GEN_INT (UNITS_PER_WORD));
4291 default:
4292 gcc_unreachable ();
4296 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4297 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4298 into CFI_OPS. */
4300 static void
4301 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4302 rtx *cfi_ops)
4304 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4305 rtx reg1 = gen_rtx_REG (mode, regno1);
4307 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4309 if (regno2 == INVALID_REGNUM)
4311 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4312 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4313 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4315 else
4317 rtx reg2 = gen_rtx_REG (mode, regno2);
4318 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4319 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4320 reg2, adjustment));
4324 /* Generate and return a store pair instruction of mode MODE to store
4325 register REG1 to MEM1 and register REG2 to MEM2. */
4327 static rtx
4328 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4329 rtx reg2)
4331 switch (mode)
4333 case E_DImode:
4334 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4336 case E_DFmode:
4337 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4339 default:
4340 gcc_unreachable ();
4344 /* Generate and return a load pair instruction of mode MODE to load register
4345 REG1 from MEM1 and register REG2 from MEM2. */
4347 static rtx
4348 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4349 rtx mem2)
4351 switch (mode)
4353 case E_DImode:
4354 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4356 case E_DFmode:
4357 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4359 default:
4360 gcc_unreachable ();
4364 /* Return TRUE if return address signing should be enabled for the current
4365 function, otherwise return FALSE. */
4367 bool
4368 aarch64_return_address_signing_enabled (void)
4370 /* This function should only be called after the frame is laid out. */
4371 gcc_assert (cfun->machine->frame.laid_out);
4373 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4374 if its LR is pushed onto the stack. */
4375 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4376 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4377 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4380 /* Emit code to save the callee-saved registers from register number START
4381 to LIMIT to the stack at the location starting at offset START_OFFSET,
4382 skipping any write-back candidates if SKIP_WB is true. */
4384 static void
4385 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4386 unsigned start, unsigned limit, bool skip_wb)
4388 rtx_insn *insn;
4389 unsigned regno;
4390 unsigned regno2;
4392 for (regno = aarch64_next_callee_save (start, limit);
4393 regno <= limit;
4394 regno = aarch64_next_callee_save (regno + 1, limit))
4396 rtx reg, mem;
4397 poly_int64 offset;
4399 if (skip_wb
4400 && (regno == cfun->machine->frame.wb_candidate1
4401 || regno == cfun->machine->frame.wb_candidate2))
4402 continue;
4404 if (cfun->machine->reg_is_wrapped_separately[regno])
4405 continue;
4407 reg = gen_rtx_REG (mode, regno);
4408 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4409 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4410 offset));
4412 regno2 = aarch64_next_callee_save (regno + 1, limit);
4414 if (regno2 <= limit
4415 && !cfun->machine->reg_is_wrapped_separately[regno2]
4416 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4417 == cfun->machine->frame.reg_offset[regno2]))
4420 rtx reg2 = gen_rtx_REG (mode, regno2);
4421 rtx mem2;
4423 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4424 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4425 offset));
4426 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4427 reg2));
4429 /* The first part of a frame-related parallel insn is
4430 always assumed to be relevant to the frame
4431 calculations; subsequent parts are only
4432 frame-related if explicitly marked. */
4433 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4434 regno = regno2;
4436 else
4437 insn = emit_move_insn (mem, reg);
4439 RTX_FRAME_RELATED_P (insn) = 1;
4443 /* Emit code to restore the callee registers of mode MODE from register
4444 number START up to and including LIMIT. Restore from the stack offset
4445 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4446 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4448 static void
4449 aarch64_restore_callee_saves (machine_mode mode,
4450 poly_int64 start_offset, unsigned start,
4451 unsigned limit, bool skip_wb, rtx *cfi_ops)
4453 rtx base_rtx = stack_pointer_rtx;
4454 unsigned regno;
4455 unsigned regno2;
4456 poly_int64 offset;
4458 for (regno = aarch64_next_callee_save (start, limit);
4459 regno <= limit;
4460 regno = aarch64_next_callee_save (regno + 1, limit))
4462 if (cfun->machine->reg_is_wrapped_separately[regno])
4463 continue;
4465 rtx reg, mem;
4467 if (skip_wb
4468 && (regno == cfun->machine->frame.wb_candidate1
4469 || regno == cfun->machine->frame.wb_candidate2))
4470 continue;
4472 reg = gen_rtx_REG (mode, regno);
4473 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4474 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4476 regno2 = aarch64_next_callee_save (regno + 1, limit);
4478 if (regno2 <= limit
4479 && !cfun->machine->reg_is_wrapped_separately[regno2]
4480 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4481 == cfun->machine->frame.reg_offset[regno2]))
4483 rtx reg2 = gen_rtx_REG (mode, regno2);
4484 rtx mem2;
4486 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4487 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4488 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4490 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4491 regno = regno2;
4493 else
4494 emit_move_insn (reg, mem);
4495 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4499 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4500 of MODE. */
4502 static inline bool
4503 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4505 HOST_WIDE_INT multiple;
4506 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4507 && IN_RANGE (multiple, -8, 7));
4510 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4511 of MODE. */
4513 static inline bool
4514 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4516 HOST_WIDE_INT multiple;
4517 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4518 && IN_RANGE (multiple, 0, 63));
4521 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4522 of MODE. */
4524 bool
4525 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4527 HOST_WIDE_INT multiple;
4528 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4529 && IN_RANGE (multiple, -64, 63));
4532 /* Return true if OFFSET is a signed 9-bit value. */
4534 static inline bool
4535 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4536 poly_int64 offset)
4538 HOST_WIDE_INT const_offset;
4539 return (offset.is_constant (&const_offset)
4540 && IN_RANGE (const_offset, -256, 255));
4543 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4544 of MODE. */
4546 static inline bool
4547 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4549 HOST_WIDE_INT multiple;
4550 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4551 && IN_RANGE (multiple, -256, 255));
4554 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4555 of MODE. */
4557 static inline bool
4558 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4560 HOST_WIDE_INT multiple;
4561 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4562 && IN_RANGE (multiple, 0, 4095));
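/* Putting these predicates in concrete terms for DImode (an 8-byte
   access): the 7-bit signed scaled form covers byte offsets -512..504
   in steps of 8, the 9-bit signed unscaled form covers -256..255 in
   steps of 1, and the 12-bit unsigned scaled form covers 0..32760 in
   steps of 8, matching the LDP/STP, LDUR/STUR and LDR/STR immediate
   ranges respectively.  */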
4565 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4567 static sbitmap
4568 aarch64_get_separate_components (void)
4570 aarch64_layout_frame ();
4572 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4573 bitmap_clear (components);
4575 /* The registers we need saved to the frame. */
4576 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4577 if (aarch64_register_saved_on_entry (regno))
4579 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4580 if (!frame_pointer_needed)
4581 offset += cfun->machine->frame.frame_size
4582 - cfun->machine->frame.hard_fp_offset;
4583 /* Check that we can access the stack slot of the register with one
4584 direct load with no adjustments needed. */
4585 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4586 bitmap_set_bit (components, regno);
4589 /* Don't mess with the hard frame pointer. */
4590 if (frame_pointer_needed)
4591 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4593 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4594 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4595 /* If aarch64_layout_frame has chosen registers to store/restore with
4596 writeback don't interfere with them to avoid having to output explicit
4597 stack adjustment instructions. */
4598 if (reg2 != INVALID_REGNUM)
4599 bitmap_clear_bit (components, reg2);
4600 if (reg1 != INVALID_REGNUM)
4601 bitmap_clear_bit (components, reg1);
4603 bitmap_clear_bit (components, LR_REGNUM);
4604 bitmap_clear_bit (components, SP_REGNUM);
4606 return components;
4609 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4611 static sbitmap
4612 aarch64_components_for_bb (basic_block bb)
4614 bitmap in = DF_LIVE_IN (bb);
4615 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4616 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4618 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4619 bitmap_clear (components);
4621 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4622 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4623 if ((!call_used_regs[regno])
4624 && (bitmap_bit_p (in, regno)
4625 || bitmap_bit_p (gen, regno)
4626 || bitmap_bit_p (kill, regno)))
4628 unsigned regno2, offset, offset2;
4629 bitmap_set_bit (components, regno);
4631 /* If there is a callee-save at an adjacent offset, add it too
4632 to increase the use of LDP/STP. */
4633 offset = cfun->machine->frame.reg_offset[regno];
4634 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4636 if (regno2 <= LAST_SAVED_REGNUM)
4638 offset2 = cfun->machine->frame.reg_offset[regno2];
4639 if ((offset & ~8) == (offset2 & ~8))
4640 bitmap_set_bit (components, regno2);
4644 return components;
4647 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4648 Nothing to do for aarch64. */
4650 static void
4651 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4655 /* Return the next set bit in BMP from START onwards. Return the total number
4656 of bits in BMP if no set bit is found at or after START. */
4658 static unsigned int
4659 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4661 unsigned int nbits = SBITMAP_SIZE (bmp);
4662 if (start == nbits)
4663 return start;
4665 gcc_assert (start < nbits);
4666 for (unsigned int i = start; i < nbits; i++)
4667 if (bitmap_bit_p (bmp, i))
4668 return i;
4670 return nbits;
4673 /* Do the work for aarch64_emit_prologue_components and
4674 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4675 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4676 for these components or the epilogue sequence. That is, it determines
4677 whether we should emit stores or loads and what kind of CFA notes to attach
4678 to the insns. Otherwise the logic for the two sequences is very
4679 similar. */
4681 static void
4682 aarch64_process_components (sbitmap components, bool prologue_p)
4684 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4685 ? HARD_FRAME_POINTER_REGNUM
4686 : STACK_POINTER_REGNUM);
4688 unsigned last_regno = SBITMAP_SIZE (components);
4689 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4690 rtx_insn *insn = NULL;
4692 while (regno != last_regno)
4694 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4695 so DFmode for the vector registers is enough. */
4696 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4697 rtx reg = gen_rtx_REG (mode, regno);
4698 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4699 if (!frame_pointer_needed)
4700 offset += cfun->machine->frame.frame_size
4701 - cfun->machine->frame.hard_fp_offset;
4702 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4703 rtx mem = gen_frame_mem (mode, addr);
4705 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4706 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4707 /* No more registers to handle after REGNO.
4708 Emit a single save/restore and exit. */
4709 if (regno2 == last_regno)
4711 insn = emit_insn (set);
4712 RTX_FRAME_RELATED_P (insn) = 1;
4713 if (prologue_p)
4714 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4715 else
4716 add_reg_note (insn, REG_CFA_RESTORE, reg);
4717 break;
4720 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4721 /* The next register is not of the same class or its offset is not
4722 mergeable with the current one into a pair. */
4723 if (!satisfies_constraint_Ump (mem)
4724 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4725 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4726 GET_MODE_SIZE (mode)))
4728 insn = emit_insn (set);
4729 RTX_FRAME_RELATED_P (insn) = 1;
4730 if (prologue_p)
4731 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4732 else
4733 add_reg_note (insn, REG_CFA_RESTORE, reg);
4735 regno = regno2;
4736 continue;
4739 /* REGNO2 can be saved/restored in a pair with REGNO. */
4740 rtx reg2 = gen_rtx_REG (mode, regno2);
4741 if (!frame_pointer_needed)
4742 offset2 += cfun->machine->frame.frame_size
4743 - cfun->machine->frame.hard_fp_offset;
4744 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4745 rtx mem2 = gen_frame_mem (mode, addr2);
4746 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4747 : gen_rtx_SET (reg2, mem2);
4749 if (prologue_p)
4750 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4751 else
4752 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4754 RTX_FRAME_RELATED_P (insn) = 1;
4755 if (prologue_p)
4757 add_reg_note (insn, REG_CFA_OFFSET, set);
4758 add_reg_note (insn, REG_CFA_OFFSET, set2);
4760 else
4762 add_reg_note (insn, REG_CFA_RESTORE, reg);
4763 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4766 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4770 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4772 static void
4773 aarch64_emit_prologue_components (sbitmap components)
4775 aarch64_process_components (components, true);
4778 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4780 static void
4781 aarch64_emit_epilogue_components (sbitmap components)
4783 aarch64_process_components (components, false);
4786 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4788 static void
4789 aarch64_set_handled_components (sbitmap components)
4791 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4792 if (bitmap_bit_p (components, regno))
4793 cfun->machine->reg_is_wrapped_separately[regno] = true;
4796 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4797 is saved at BASE + OFFSET. */
4799 static void
4800 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4801 rtx base, poly_int64 offset)
4803 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4804 add_reg_note (insn, REG_CFA_EXPRESSION,
4805 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4808 /* AArch64 stack frames generated by this compiler look like:
4810 +-------------------------------+
4812 | incoming stack arguments |
4814 +-------------------------------+
4815 | | <-- incoming stack pointer (aligned)
4816 | callee-allocated save area |
4817 | for register varargs |
4819 +-------------------------------+
4820 | local variables | <-- frame_pointer_rtx
4822 +-------------------------------+
4823 | padding0 | \
4824 +-------------------------------+ |
4825 | callee-saved registers | | frame.saved_regs_size
4826 +-------------------------------+ |
4827 | LR' | |
4828 +-------------------------------+ |
4829 | FP' | / <- hard_frame_pointer_rtx (aligned)
4830 +-------------------------------+
4831 | dynamic allocation |
4832 +-------------------------------+
4833 | padding |
4834 +-------------------------------+
4835 | outgoing stack arguments | <-- arg_pointer
4837 +-------------------------------+
4838 | | <-- stack_pointer_rtx (aligned)
4840 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4841 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4842 unchanged. */
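/* Editorial note tying the picture to aarch64_layout_frame: roughly,
   hard_fp_offset is the distance from the incoming (aligned) stack
   pointer down to FP', frame_size additionally covers the outgoing
   argument area, and initial_adjust/callee_adjust/final_adjust describe
   how that total is split across the SP adjustments the prologue
   emits.  */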
4844 /* Generate the prologue instructions for entry into a function.
4845 Establish the stack frame by decreasing the stack pointer with a
4846 properly calculated size and, if necessary, create a frame record
4847 filled with the values of LR and previous frame pointer. The
4848 current FP is also set up if it is in use. */
4850 void
4851 aarch64_expand_prologue (void)
4853 aarch64_layout_frame ();
4855 poly_int64 frame_size = cfun->machine->frame.frame_size;
4856 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4857 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4858 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4859 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4860 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4861 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4862 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4863 rtx_insn *insn;
4865 /* Sign return address for functions. */
4866 if (aarch64_return_address_signing_enabled ())
4868 insn = emit_insn (gen_pacisp ());
4869 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4870 RTX_FRAME_RELATED_P (insn) = 1;
4873 if (flag_stack_usage_info)
4874 current_function_static_stack_size = constant_lower_bound (frame_size);
4876 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4878 if (crtl->is_leaf && !cfun->calls_alloca)
4880 if (maybe_gt (frame_size, PROBE_INTERVAL)
4881 && maybe_gt (frame_size, get_stack_check_protect ()))
4882 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4883 (frame_size
4884 - get_stack_check_protect ()));
4886 else if (maybe_gt (frame_size, 0))
4887 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4890 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4891 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4893 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4895 if (callee_adjust != 0)
4896 aarch64_push_regs (reg1, reg2, callee_adjust);
4898 if (emit_frame_chain)
4900 poly_int64 reg_offset = callee_adjust;
4901 if (callee_adjust == 0)
4903 reg1 = R29_REGNUM;
4904 reg2 = R30_REGNUM;
4905 reg_offset = callee_offset;
4906 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4908 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4909 stack_pointer_rtx, callee_offset,
4910 ip1_rtx, ip0_rtx, frame_pointer_needed);
4911 if (frame_pointer_needed && !frame_size.is_constant ())
4913 /* Variable-sized frames need to describe the save slot
4914 address using DW_CFA_expression rather than DW_CFA_offset.
4915 This means that, without taking further action, the
4916 locations of the registers that we've already saved would
4917 remain based on the stack pointer even after we redefine
4918 the CFA based on the frame pointer. We therefore need new
4919 DW_CFA_expressions to re-express the save slots with addresses
4920 based on the frame pointer. */
4921 rtx_insn *insn = get_last_insn ();
4922 gcc_assert (RTX_FRAME_RELATED_P (insn));
4924 /* Add an explicit CFA definition if this was previously
4925 implicit. */
4926 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4928 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4929 callee_offset);
4930 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4931 gen_rtx_SET (hard_frame_pointer_rtx, src));
4934 /* Change the save slot expressions for the registers that
4935 we've already saved. */
4936 reg_offset -= callee_offset;
4937 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4938 reg_offset + UNITS_PER_WORD);
4939 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4940 reg_offset);
4942 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4945 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4946 callee_adjust != 0 || emit_frame_chain);
4947 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4948 callee_adjust != 0 || emit_frame_chain);
4949 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4952 /* Return TRUE if we can use a simple_return insn.
4954 This function checks whether the callee-saved stack is empty, which
4955 means that no restore actions are needed. The pro_and_epilogue pass uses
4956 this to check whether the shrink-wrapping optimization is feasible. */
4958 bool
4959 aarch64_use_return_insn_p (void)
4961 if (!reload_completed)
4962 return false;
4964 if (crtl->profile)
4965 return false;
4967 aarch64_layout_frame ();
4969 return known_eq (cfun->machine->frame.frame_size, 0);
4972 /* Generate the epilogue instructions for returning from a function.
4973 This is almost exactly the reverse of the prolog sequence, except
4974 that we need to insert barriers to avoid scheduling loads that read
4975 from a deallocated stack, and we optimize the unwind records by
4976 emitting them all together if possible. */
4977 void
4978 aarch64_expand_epilogue (bool for_sibcall)
4980 aarch64_layout_frame ();
4982 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4983 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4984 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4985 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4986 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4987 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4988 rtx cfi_ops = NULL;
4989 rtx_insn *insn;
4990 /* A stack clash protection prologue may not have left IP0_REGNUM or
4991 IP1_REGNUM in a usable state. The same is true for allocations
4992 with an SVE component, since we then need both temporary registers
4993 for each allocation. */
4994 bool can_inherit_p = (initial_adjust.is_constant ()
4995 && final_adjust.is_constant ()
4996 && !flag_stack_clash_protection);
4998 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4999 bool need_barrier_p
5000 = maybe_ne (get_frame_size ()
5001 + cfun->machine->frame.saved_varargs_size, 0);
5003 /* Emit a barrier to prevent loads from a deallocated stack. */
5004 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5005 || cfun->calls_alloca
5006 || crtl->calls_eh_return)
5008 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5009 need_barrier_p = false;
5012 /* Restore the stack pointer from the frame pointer if it may not
5013 be the same as the stack pointer. */
5014 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5015 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5016 if (frame_pointer_needed
5017 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5018 /* If writeback is used when restoring callee-saves, the CFA
5019 is restored on the instruction doing the writeback. */
5020 aarch64_add_offset (Pmode, stack_pointer_rtx,
5021 hard_frame_pointer_rtx, -callee_offset,
5022 ip1_rtx, ip0_rtx, callee_adjust == 0);
5023 else
5024 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5025 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5027 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5028 callee_adjust != 0, &cfi_ops);
5029 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5030 callee_adjust != 0, &cfi_ops);
5032 if (need_barrier_p)
5033 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5035 if (callee_adjust != 0)
5036 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5038 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5040 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5041 insn = get_last_insn ();
5042 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5043 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5044 RTX_FRAME_RELATED_P (insn) = 1;
5045 cfi_ops = NULL;
5048 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5049 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5051 if (cfi_ops)
5053 /* Emit delayed restores and reset the CFA to be SP. */
5054 insn = get_last_insn ();
5055 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5056 REG_NOTES (insn) = cfi_ops;
5057 RTX_FRAME_RELATED_P (insn) = 1;
5060 /* We prefer to emit the combined return/authenticate instruction RETAA;
5061 however, there are three cases in which we must instead emit an explicit
5062 authentication instruction.
5064 1) Sibcalls don't return in a normal way, so if we're about to call one
5065 we must authenticate.
5067 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5068 generating code for !TARGET_ARMV8_3 we can't use it and must
5069 explicitly authenticate.
5071 3) On an eh_return path we make extra stack adjustments to update the
5072 canonical frame address to be the exception handler's CFA. We want
5073 to authenticate using the CFA of the function which calls eh_return.
5075 if (aarch64_return_address_signing_enabled ()
5076 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5078 insn = emit_insn (gen_autisp ());
5079 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5080 RTX_FRAME_RELATED_P (insn) = 1;
5083 /* Stack adjustment for exception handler. */
5084 if (crtl->calls_eh_return)
5086 /* We need to unwind the stack by the offset computed by
5087 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5088 to be SP; letting the CFA move during this adjustment
5089 is just as correct as retaining the CFA from the body
5090 of the function. Therefore, do nothing special. */
5091 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5094 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5095 if (!for_sibcall)
5096 emit_jump_insn (ret_rtx);
5099 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5100 normally or return to a previous frame after unwinding.
5102 An EH return uses a single shared return sequence. The epilogue is
5103 exactly like a normal epilogue except that it has an extra input
5104 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5105 that must be applied after the frame has been destroyed. An extra label
5106 is inserted before the epilogue which initializes this register to zero,
5107 and this is the entry point for a normal return.
5109 An actual EH return updates the return address, initializes the stack
5110 adjustment and jumps directly into the epilogue (bypassing the zeroing
5111 of the adjustment). Since the return address is typically saved on the
5112 stack when a function makes a call, the saved LR must be updated outside
5113 the epilogue.
5115 This poses problems as the store is generated well before the epilogue,
5116 so the offset of LR is not known yet. Also optimizations will remove the
5117 store as it appears dead, even after the epilogue is generated (as the
5118 base or offset for loading LR is different in many cases).
5120 To avoid these problems this implementation forces the frame pointer
5121 in eh_return functions so that the location of LR is fixed and known early.
5122 It also marks the store volatile, so no optimization is permitted to
5123 remove the store. */
5124 rtx
5125 aarch64_eh_return_handler_rtx (void)
5127 rtx tmp = gen_frame_mem (Pmode,
5128 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5130 /* Mark the store volatile, so no optimization is permitted to remove it. */
5131 MEM_VOLATILE_P (tmp) = true;
5132 return tmp;
5135 /* Output code to add DELTA to the first argument, and then jump
5136 to FUNCTION. Used for C++ multiple inheritance. */
5137 static void
5138 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5139 HOST_WIDE_INT delta,
5140 HOST_WIDE_INT vcall_offset,
5141 tree function)
5143 /* The this pointer is always in x0. Note that this differs from
5144 Arm where the this pointer may be bumped to r1 if r0 is required
5145 to return a pointer to an aggregate. On AArch64 a result value
5146 pointer will be in x8. */
5147 int this_regno = R0_REGNUM;
5148 rtx this_rtx, temp0, temp1, addr, funexp;
5149 rtx_insn *insn;
5151 reload_completed = 1;
5152 emit_note (NOTE_INSN_PROLOGUE_END);
5154 this_rtx = gen_rtx_REG (Pmode, this_regno);
5155 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5156 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5158 if (vcall_offset == 0)
5159 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5160 else
5162 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5164 addr = this_rtx;
5165 if (delta != 0)
5167 if (delta >= -256 && delta < 256)
5168 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5169 plus_constant (Pmode, this_rtx, delta));
5170 else
5171 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5172 temp1, temp0, false);
5175 if (Pmode == ptr_mode)
5176 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5177 else
5178 aarch64_emit_move (temp0,
5179 gen_rtx_ZERO_EXTEND (Pmode,
5180 gen_rtx_MEM (ptr_mode, addr)));
5182 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5183 addr = plus_constant (Pmode, temp0, vcall_offset);
5184 else
5186 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5187 Pmode);
5188 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5191 if (Pmode == ptr_mode)
5192 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5193 else
5194 aarch64_emit_move (temp1,
5195 gen_rtx_SIGN_EXTEND (Pmode,
5196 gen_rtx_MEM (ptr_mode, addr)));
5198 emit_insn (gen_add2_insn (this_rtx, temp1));
5201 /* Generate a tail call to the target function. */
5202 if (!TREE_USED (function))
5204 assemble_external (function);
5205 TREE_USED (function) = 1;
5207 funexp = XEXP (DECL_RTL (function), 0);
5208 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5209 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5210 SIBLING_CALL_P (insn) = 1;
5212 insn = get_insns ();
5213 shorten_branches (insn);
5214 final_start_function (insn, file, 1);
5215 final (insn, file, 1);
5216 final_end_function ();
5218 /* Stop pretending to be a post-reload pass. */
5219 reload_completed = 0;
5222 static bool
5223 aarch64_tls_referenced_p (rtx x)
5225 if (!TARGET_HAVE_TLS)
5226 return false;
5227 subrtx_iterator::array_type array;
5228 FOR_EACH_SUBRTX (iter, array, x, ALL)
5230 const_rtx x = *iter;
5231 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5232 return true;
5233 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5234 TLS offsets, not real symbol references. */
5235 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5236 iter.skip_subrtxes ();
5238 return false;
5242 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5243 a left shift of 0 or 12 bits. */
5244 bool
5245 aarch64_uimm12_shift (HOST_WIDE_INT val)
5247 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5248 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
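/* For instance, 0xabc and 0xabc000 both satisfy this test, whereas
   0xabcd does not: the ADD/SUB immediate field is 12 bits wide with an
   optional left shift by 12, so only values of those two shapes can be
   encoded directly.  */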
5253 /* Return true if val is an immediate that can be loaded into a
5254 register by a MOVZ instruction. */
5255 static bool
5256 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5258 if (GET_MODE_SIZE (mode) > 4)
5260 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5261 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5262 return 1;
5264 else
5266 /* Ignore sign extension. */
5267 val &= (HOST_WIDE_INT) 0xffffffff;
5269 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5270 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
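/* Examples (editorial): in DImode, 0x12340000 is accepted (16 bits at
   shift 16) and 0xffff00000000 is accepted (16 bits at shift 32),
   whereas 0x12345678 spans more than one 16-bit field and is
   rejected.  */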
5273 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5274 64-bit (DImode) integer. */
5276 static unsigned HOST_WIDE_INT
5277 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5279 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5280 while (size < 64)
5282 val &= (HOST_WIDE_INT_1U << size) - 1;
5283 val |= val << size;
5284 size *= 2;
5286 return val;
5289 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5291 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5293 0x0000000100000001ull,
5294 0x0001000100010001ull,
5295 0x0101010101010101ull,
5296 0x1111111111111111ull,
5297 0x5555555555555555ull,
5301 /* Return true if val is a valid bitmask immediate. */
5303 bool
5304 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5306 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5307 int bits;
5309 /* Check for a single sequence of one bits and return quickly if so.
5310 The special cases of all ones and all zeroes return false.
5311 val = aarch64_replicate_bitmask_imm (val_in, mode);
5312 tmp = val + (val & -val);
5314 if (tmp == (tmp & -tmp))
5315 return (val + 1) > 1;
5317 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5318 if (mode == SImode)
5319 val = (val << 32) | (val & 0xffffffff);
5321 /* Invert if the immediate doesn't start with a zero bit - this means we
5322 only need to search for sequences of one bits. */
5323 if (val & 1)
5324 val = ~val;
5326 /* Find the first set bit and set tmp to val with the first sequence of one
5327 bits removed. Return success if there is a single sequence of ones. */
5328 first_one = val & -val;
5329 tmp = val & (val + first_one);
5331 if (tmp == 0)
5332 return true;
5334 /* Find the next set bit and compute the difference in bit position. */
5335 next_one = tmp & -tmp;
5336 bits = clz_hwi (first_one) - clz_hwi (next_one);
5337 mask = val ^ tmp;
5339 /* Check the bit position difference is a power of 2, and that the first
5340 sequence of one bits fits within 'bits' bits. */
5341 if ((mask >> bits) != 0 || bits != (bits & -bits))
5342 return false;
5344 /* Check the sequence of one bits is repeated 64/bits times. */
5345 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
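/* Editorial examples, assuming DImode: 0x00ff00ff00ff00ff is a valid
   bitmask immediate (a run of eight ones replicated in every 16-bit
   chunk) and 0x0000fffffffff000 is valid as a single run of ones,
   whereas 0x0000000012345678 is rejected because its set bits do not
   form one run repeated at a power-of-two period.  */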
5348 /* Create a mask of ones covering the lowest to the highest bits set in VAL_IN.
5349 Assumed precondition: VAL_IN is not zero. */
5351 unsigned HOST_WIDE_INT
5352 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5354 int lowest_bit_set = ctz_hwi (val_in);
5355 int highest_bit_set = floor_log2 (val_in);
5356 gcc_assert (val_in != 0);
5358 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5359 (HOST_WIDE_INT_1U << lowest_bit_set));
5362 /* Create a constant in which all bits outside the range from the lowest set
5363 bit to the highest set bit of VAL_IN are set to 1. */
5365 unsigned HOST_WIDE_INT
5366 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5368 return val_in | ~aarch64_and_split_imm1 (val_in);
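/* Sketch of how the split is used (editorial example): for
   VAL_IN == 0x0000ff00ff000000,
     imm1 == 0x0000ffffff000000  (ones from the lowest to the highest set bit)
     imm2 == 0xffffff00ffffffff  (VAL_IN with all bits outside that range set).
   Both halves are bitmask immediates and imm1 & imm2 == VAL_IN, so an
   AND with VAL_IN can be implemented as two AND-immediate
   instructions.  */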
5371 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5373 bool
5374 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5376 scalar_int_mode int_mode;
5377 if (!is_a <scalar_int_mode> (mode, &int_mode))
5378 return false;
5380 if (aarch64_bitmask_imm (val_in, int_mode))
5381 return false;
5383 if (aarch64_move_imm (val_in, int_mode))
5384 return false;
5386 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5388 return aarch64_bitmask_imm (imm2, int_mode);
5391 /* Return true if val is an immediate that can be loaded into a
5392 register in a single instruction. */
5393 bool
5394 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5396 scalar_int_mode int_mode;
5397 if (!is_a <scalar_int_mode> (mode, &int_mode))
5398 return false;
5400 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5401 return 1;
5402 return aarch64_bitmask_imm (val, int_mode);
5405 static bool
5406 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5408 rtx base, offset;
5410 if (GET_CODE (x) == HIGH)
5411 return true;
5413 /* There's no way to calculate VL-based values using relocations. */
5414 subrtx_iterator::array_type array;
5415 FOR_EACH_SUBRTX (iter, array, x, ALL)
5416 if (GET_CODE (*iter) == CONST_POLY_INT)
5417 return true;
5419 split_const (x, &base, &offset);
5420 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5422 if (aarch64_classify_symbol (base, INTVAL (offset))
5423 != SYMBOL_FORCE_TO_MEM)
5424 return true;
5425 else
5426 /* Avoid generating a 64-bit relocation in ILP32; leave it
5427 to aarch64_expand_mov_immediate to handle properly. */
5428 return mode != ptr_mode;
5431 return aarch64_tls_referenced_p (x);
5434 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5435 The expansion for a table switch is quite expensive due to the number
5436 of instructions, the table lookup and the hard-to-predict indirect jump.
5437 When optimizing for speed, and -O3 enabled, use the per-core tuning if
5438 set, otherwise use tables for > 16 cases as a tradeoff between size and
5439 performance. When optimizing for size, use the default setting. */
5441 static unsigned int
5442 aarch64_case_values_threshold (void)
5444 /* Use the specified limit for the number of cases before using jump
5445 tables at higher optimization levels. */
5446 if (optimize > 2
5447 && selected_cpu->tune->max_case_values != 0)
5448 return selected_cpu->tune->max_case_values;
5449 else
5450 return optimize_size ? default_case_values_threshold () : 17;
5453 /* Return true if register REGNO is a valid index register.
5454 STRICT_P is true if REG_OK_STRICT is in effect. */
5456 bool
5457 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5459 if (!HARD_REGISTER_NUM_P (regno))
5461 if (!strict_p)
5462 return true;
5464 if (!reg_renumber)
5465 return false;
5467 regno = reg_renumber[regno];
5469 return GP_REGNUM_P (regno);
5472 /* Return true if register REGNO is a valid base register.
5473 STRICT_P is true if REG_OK_STRICT is in effect. */
5475 bool
5476 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5478 if (!HARD_REGISTER_NUM_P (regno))
5480 if (!strict_p)
5481 return true;
5483 if (!reg_renumber)
5484 return false;
5486 regno = reg_renumber[regno];
5489 /* The fake registers will be eliminated to either the stack or
5490 hard frame pointer, both of which are usually valid base registers.
5491 Reload deals with the cases where the eliminated form isn't valid. */
5492 return (GP_REGNUM_P (regno)
5493 || regno == SP_REGNUM
5494 || regno == FRAME_POINTER_REGNUM
5495 || regno == ARG_POINTER_REGNUM);
5498 /* Return true if X is a valid base register.
5499 STRICT_P is true if REG_OK_STRICT is in effect. */
5501 static bool
5502 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5504 if (!strict_p
5505 && GET_CODE (x) == SUBREG
5506 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5507 x = SUBREG_REG (x);
5509 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5512 /* Return true if address offset is a valid index. If it is, fill in INFO
5513 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5515 static bool
5516 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5517 machine_mode mode, bool strict_p)
5519 enum aarch64_address_type type;
5520 rtx index;
5521 int shift;
5523 /* (reg:P) */
5524 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5525 && GET_MODE (x) == Pmode)
5527 type = ADDRESS_REG_REG;
5528 index = x;
5529 shift = 0;
5531 /* (sign_extend:DI (reg:SI)) */
5532 else if ((GET_CODE (x) == SIGN_EXTEND
5533 || GET_CODE (x) == ZERO_EXTEND)
5534 && GET_MODE (x) == DImode
5535 && GET_MODE (XEXP (x, 0)) == SImode)
5537 type = (GET_CODE (x) == SIGN_EXTEND)
5538 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5539 index = XEXP (x, 0);
5540 shift = 0;
5542 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5543 else if (GET_CODE (x) == MULT
5544 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5545 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5546 && GET_MODE (XEXP (x, 0)) == DImode
5547 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5548 && CONST_INT_P (XEXP (x, 1)))
5550 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5551 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5552 index = XEXP (XEXP (x, 0), 0);
5553 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5555 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5556 else if (GET_CODE (x) == ASHIFT
5557 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5558 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5559 && GET_MODE (XEXP (x, 0)) == DImode
5560 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5561 && CONST_INT_P (XEXP (x, 1)))
5563 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5564 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5565 index = XEXP (XEXP (x, 0), 0);
5566 shift = INTVAL (XEXP (x, 1));
5568 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5569 else if ((GET_CODE (x) == SIGN_EXTRACT
5570 || GET_CODE (x) == ZERO_EXTRACT)
5571 && GET_MODE (x) == DImode
5572 && GET_CODE (XEXP (x, 0)) == MULT
5573 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5574 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5576 type = (GET_CODE (x) == SIGN_EXTRACT)
5577 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5578 index = XEXP (XEXP (x, 0), 0);
5579 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5580 if (INTVAL (XEXP (x, 1)) != 32 + shift
5581 || INTVAL (XEXP (x, 2)) != 0)
5582 shift = -1;
5584 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5585 (const_int 0xffffffff<<shift)) */
5586 else if (GET_CODE (x) == AND
5587 && GET_MODE (x) == DImode
5588 && GET_CODE (XEXP (x, 0)) == MULT
5589 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5590 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5591 && CONST_INT_P (XEXP (x, 1)))
5593 type = ADDRESS_REG_UXTW;
5594 index = XEXP (XEXP (x, 0), 0);
5595 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5596 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5597 shift = -1;
5599 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5600 else if ((GET_CODE (x) == SIGN_EXTRACT
5601 || GET_CODE (x) == ZERO_EXTRACT)
5602 && GET_MODE (x) == DImode
5603 && GET_CODE (XEXP (x, 0)) == ASHIFT
5604 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5605 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5607 type = (GET_CODE (x) == SIGN_EXTRACT)
5608 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5609 index = XEXP (XEXP (x, 0), 0);
5610 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5611 if (INTVAL (XEXP (x, 1)) != 32 + shift
5612 || INTVAL (XEXP (x, 2)) != 0)
5613 shift = -1;
5615 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5616 (const_int 0xffffffff<<shift)) */
5617 else if (GET_CODE (x) == AND
5618 && GET_MODE (x) == DImode
5619 && GET_CODE (XEXP (x, 0)) == ASHIFT
5620 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5621 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5622 && CONST_INT_P (XEXP (x, 1)))
5624 type = ADDRESS_REG_UXTW;
5625 index = XEXP (XEXP (x, 0), 0);
5626 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5627 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5628 shift = -1;
5630 /* (mult:P (reg:P) (const_int scale)) */
5631 else if (GET_CODE (x) == MULT
5632 && GET_MODE (x) == Pmode
5633 && GET_MODE (XEXP (x, 0)) == Pmode
5634 && CONST_INT_P (XEXP (x, 1)))
5636 type = ADDRESS_REG_REG;
5637 index = XEXP (x, 0);
5638 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5640 /* (ashift:P (reg:P) (const_int shift)) */
5641 else if (GET_CODE (x) == ASHIFT
5642 && GET_MODE (x) == Pmode
5643 && GET_MODE (XEXP (x, 0)) == Pmode
5644 && CONST_INT_P (XEXP (x, 1)))
5646 type = ADDRESS_REG_REG;
5647 index = XEXP (x, 0);
5648 shift = INTVAL (XEXP (x, 1));
5650 else
5651 return false;
5653 if (!strict_p
5654 && GET_CODE (index) == SUBREG
5655 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5656 index = SUBREG_REG (index);
5658 if (aarch64_sve_data_mode_p (mode))
5660 if (type != ADDRESS_REG_REG
5661 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5662 return false;
5664 else
5666 if (shift != 0
5667 && !(IN_RANGE (shift, 1, 3)
5668 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5669 return false;
5672 if (REG_P (index)
5673 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5675 info->type = type;
5676 info->offset = index;
5677 info->shift = shift;
5678 return true;
5681 return false;
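/* Illustrative example: for an SImode access, the index rtx
   (mult:DI (sign_extend:DI (reg:SI)) (const_int 4)) is classified as
   ADDRESS_REG_SXTW with shift == 2, corresponding to the
   [base, wN, sxtw 2] addressing form printed later.  */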
5684 /* Return true if MODE is one of the modes for which we
5685 support LDP/STP operations. */
5687 static bool
5688 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5690 return mode == SImode || mode == DImode
5691 || mode == SFmode || mode == DFmode
5692 || (aarch64_vector_mode_supported_p (mode)
5693 && (known_eq (GET_MODE_SIZE (mode), 8)
5694 || (known_eq (GET_MODE_SIZE (mode), 16)
5695 && (aarch64_tune_params.extra_tuning_flags
5696 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5699 /* Return true if REGNO is a virtual pointer register, or an eliminable
5700 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5701 include stack_pointer or hard_frame_pointer. */
5702 static bool
5703 virt_or_elim_regno_p (unsigned regno)
5705 return ((regno >= FIRST_VIRTUAL_REGISTER
5706 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5707 || regno == FRAME_POINTER_REGNUM
5708 || regno == ARG_POINTER_REGNUM);
5711 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5712 If it is, fill in INFO appropriately. STRICT_P is true if
5713 REG_OK_STRICT is in effect. */
5715 static bool
5716 aarch64_classify_address (struct aarch64_address_info *info,
5717 rtx x, machine_mode mode, bool strict_p,
5718 aarch64_addr_query_type type = ADDR_QUERY_M)
5720 enum rtx_code code = GET_CODE (x);
5721 rtx op0, op1;
5722 poly_int64 offset;
5724 HOST_WIDE_INT const_size;
5726 /* On BE, we use load/store pair for all large int mode load/stores.
5727 TI/TFmode may also use a load/store pair. */
5728 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5729 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5730 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5731 || mode == TImode
5732 || mode == TFmode
5733 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5735 bool allow_reg_index_p = (!load_store_pair_p
5736 && (known_lt (GET_MODE_SIZE (mode), 16)
5737 || vec_flags == VEC_ADVSIMD
5738 || vec_flags == VEC_SVE_DATA));
5740 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5741 [Rn, #offset, MUL VL]. */
5742 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5743 && (code != REG && code != PLUS))
5744 return false;
5746 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5747 REG addressing. */
5748 if (advsimd_struct_p
5749 && !BYTES_BIG_ENDIAN
5750 && (code != POST_INC && code != REG))
5751 return false;
5753 gcc_checking_assert (GET_MODE (x) == VOIDmode
5754 || SCALAR_INT_MODE_P (GET_MODE (x)));
5756 switch (code)
5758 case REG:
5759 case SUBREG:
5760 info->type = ADDRESS_REG_IMM;
5761 info->base = x;
5762 info->offset = const0_rtx;
5763 info->const_offset = 0;
5764 return aarch64_base_register_rtx_p (x, strict_p);
5766 case PLUS:
5767 op0 = XEXP (x, 0);
5768 op1 = XEXP (x, 1);
5770 if (! strict_p
5771 && REG_P (op0)
5772 && virt_or_elim_regno_p (REGNO (op0))
5773 && poly_int_rtx_p (op1, &offset))
5775 info->type = ADDRESS_REG_IMM;
5776 info->base = op0;
5777 info->offset = op1;
5778 info->const_offset = offset;
5780 return true;
5783 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5784 && aarch64_base_register_rtx_p (op0, strict_p)
5785 && poly_int_rtx_p (op1, &offset))
5787 info->type = ADDRESS_REG_IMM;
5788 info->base = op0;
5789 info->offset = op1;
5790 info->const_offset = offset;
5792 /* TImode and TFmode values are allowed in both pairs of X
5793 registers and individual Q registers. The available
5794 address modes are:
5795 X,X: 7-bit signed scaled offset
5796 Q: 9-bit signed offset
5797 We conservatively require an offset representable in either mode.
5798 When performing the check for pairs of X registers i.e. LDP/STP
5799 pass down DImode since that is the natural size of the LDP/STP
5800 instruction memory accesses. */
5801 if (mode == TImode || mode == TFmode)
5802 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5803 && (offset_9bit_signed_unscaled_p (mode, offset)
5804 || offset_12bit_unsigned_scaled_p (mode, offset)));
5806 /* A 7-bit offset check because OImode will emit an ldp/stp
5807 instruction (only big endian will get here).
5808 For ldp/stp instructions, the offset is scaled for the size of a
5809 single element of the pair. */
5810 if (mode == OImode)
5811 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5813 /* Three 9/12-bit offset checks because CImode will emit three
5814 ldr/str instructions (only big endian will get here). */
5815 if (mode == CImode)
5816 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5817 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5818 || offset_12bit_unsigned_scaled_p (V16QImode,
5819 offset + 32)));
5821 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5822 instructions (only big endian will get here). */
5823 if (mode == XImode)
5824 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5825 && aarch64_offset_7bit_signed_scaled_p (TImode,
5826 offset + 32));
5828 /* Make "m" use the LD1 offset range for SVE data modes, so
5829 that pre-RTL optimizers like ivopts will work to that range
5830 instead of the wider LDR/STR range. */
5831 if (vec_flags == VEC_SVE_DATA)
5832 return (type == ADDR_QUERY_M
5833 ? offset_4bit_signed_scaled_p (mode, offset)
5834 : offset_9bit_signed_scaled_p (mode, offset));
5836 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5838 poly_int64 end_offset = (offset
5839 + GET_MODE_SIZE (mode)
5840 - BYTES_PER_SVE_VECTOR);
5841 return (type == ADDR_QUERY_M
5842 ? offset_4bit_signed_scaled_p (mode, offset)
5843 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5844 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5845 end_offset)));
5848 if (vec_flags == VEC_SVE_PRED)
5849 return offset_9bit_signed_scaled_p (mode, offset);
5851 if (load_store_pair_p)
5852 return ((known_eq (GET_MODE_SIZE (mode), 4)
5853 || known_eq (GET_MODE_SIZE (mode), 8)
5854 || known_eq (GET_MODE_SIZE (mode), 16))
5855 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5856 else
5857 return (offset_9bit_signed_unscaled_p (mode, offset)
5858 || offset_12bit_unsigned_scaled_p (mode, offset));
5861 if (allow_reg_index_p)
5863 /* Look for base + (scaled/extended) index register. */
5864 if (aarch64_base_register_rtx_p (op0, strict_p)
5865 && aarch64_classify_index (info, op1, mode, strict_p))
5867 info->base = op0;
5868 return true;
5870 if (aarch64_base_register_rtx_p (op1, strict_p)
5871 && aarch64_classify_index (info, op0, mode, strict_p))
5873 info->base = op1;
5874 return true;
5878 return false;
5880 case POST_INC:
5881 case POST_DEC:
5882 case PRE_INC:
5883 case PRE_DEC:
5884 info->type = ADDRESS_REG_WB;
5885 info->base = XEXP (x, 0);
5886 info->offset = NULL_RTX;
5887 return aarch64_base_register_rtx_p (info->base, strict_p);
5889 case POST_MODIFY:
5890 case PRE_MODIFY:
5891 info->type = ADDRESS_REG_WB;
5892 info->base = XEXP (x, 0);
5893 if (GET_CODE (XEXP (x, 1)) == PLUS
5894 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5895 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5896 && aarch64_base_register_rtx_p (info->base, strict_p))
5898 info->offset = XEXP (XEXP (x, 1), 1);
5899 info->const_offset = offset;
5901 /* TImode and TFmode values are allowed in both pairs of X
5902 registers and individual Q registers. The available
5903 address modes are:
5904 X,X: 7-bit signed scaled offset
5905 Q: 9-bit signed offset
5906 We conservatively require an offset representable in either mode.
5908 if (mode == TImode || mode == TFmode)
5909 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5910 && offset_9bit_signed_unscaled_p (mode, offset));
5912 if (load_store_pair_p)
5913 return ((known_eq (GET_MODE_SIZE (mode), 4)
5914 || known_eq (GET_MODE_SIZE (mode), 8)
5915 || known_eq (GET_MODE_SIZE (mode), 16))
5916 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5917 else
5918 return offset_9bit_signed_unscaled_p (mode, offset);
5920 return false;
5922 case CONST:
5923 case SYMBOL_REF:
5924 case LABEL_REF:
5925 /* load literal: pc-relative constant pool entry. Only supported
5926 for SI mode or larger. */
5927 info->type = ADDRESS_SYMBOLIC;
5929 if (!load_store_pair_p
5930 && GET_MODE_SIZE (mode).is_constant (&const_size)
5931 && const_size >= 4)
5933 rtx sym, addend;
5935 split_const (x, &sym, &addend);
5936 return ((GET_CODE (sym) == LABEL_REF
5937 || (GET_CODE (sym) == SYMBOL_REF
5938 && CONSTANT_POOL_ADDRESS_P (sym)
5939 && aarch64_pcrelative_literal_loads)));
5941 return false;
5943 case LO_SUM:
5944 info->type = ADDRESS_LO_SUM;
5945 info->base = XEXP (x, 0);
5946 info->offset = XEXP (x, 1);
5947 if (allow_reg_index_p
5948 && aarch64_base_register_rtx_p (info->base, strict_p))
5950 rtx sym, offs;
5951 split_const (info->offset, &sym, &offs);
5952 if (GET_CODE (sym) == SYMBOL_REF
5953 && (aarch64_classify_symbol (sym, INTVAL (offs))
5954 == SYMBOL_SMALL_ABSOLUTE))
5956 /* The symbol and offset must be aligned to the access size. */
5957 unsigned int align;
5959 if (CONSTANT_POOL_ADDRESS_P (sym))
5960 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5961 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5963 tree exp = SYMBOL_REF_DECL (sym);
5964 align = TYPE_ALIGN (TREE_TYPE (exp));
5965 align = aarch64_constant_alignment (exp, align);
5967 else if (SYMBOL_REF_DECL (sym))
5968 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5969 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5970 && SYMBOL_REF_BLOCK (sym) != NULL)
5971 align = SYMBOL_REF_BLOCK (sym)->alignment;
5972 else
5973 align = BITS_PER_UNIT;
5975 poly_int64 ref_size = GET_MODE_SIZE (mode);
5976 if (known_eq (ref_size, 0))
5977 ref_size = GET_MODE_SIZE (DImode);
5979 return (multiple_p (INTVAL (offs), ref_size)
5980 && multiple_p (align / BITS_PER_UNIT, ref_size));
5983 return false;
5985 default:
5986 return false;
5990 /* Return true if the address X is valid for a PRFM instruction.
5991 STRICT_P is true if we should do strict checking with
5992 aarch64_classify_address. */
5994 bool
5995 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5997 struct aarch64_address_info addr;
5999 /* PRFM accepts the same addresses as DImode... */
6000 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6001 if (!res)
6002 return false;
6004 /* ... except writeback forms. */
6005 return addr.type != ADDRESS_REG_WB;
6008 bool
6009 aarch64_symbolic_address_p (rtx x)
6011 rtx offset;
6013 split_const (x, &x, &offset);
6014 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6017 /* Classify the base of symbolic expression X. */
6019 enum aarch64_symbol_type
6020 aarch64_classify_symbolic_expression (rtx x)
6022 rtx offset;
6024 split_const (x, &x, &offset);
6025 return aarch64_classify_symbol (x, INTVAL (offset));
6029 /* Return TRUE if X is a legitimate address for accessing memory in
6030 mode MODE. */
6031 static bool
6032 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6034 struct aarch64_address_info addr;
6036 return aarch64_classify_address (&addr, x, mode, strict_p);
6039 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6040 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6041 bool
6042 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6043 aarch64_addr_query_type type)
6045 struct aarch64_address_info addr;
6047 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6050 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6052 static bool
6053 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6054 poly_int64 orig_offset,
6055 machine_mode mode)
6057 HOST_WIDE_INT size;
6058 if (GET_MODE_SIZE (mode).is_constant (&size))
6060 HOST_WIDE_INT const_offset, second_offset;
6062 /* A general SVE offset is A * VQ + B. Remove the A component from
6063 coefficient 0 in order to get the constant B. */
6064 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6066 /* Split an out-of-range address displacement into a base and
6067 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6068 range otherwise to increase opportunities for sharing the base
6069 address of different sizes. Unaligned accesses use the signed
6070 9-bit range, TImode/TFmode use the intersection of signed
6071 scaled 7-bit and signed 9-bit offset. */
6072 if (mode == TImode || mode == TFmode)
6073 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6074 else if ((const_offset & (size - 1)) != 0)
6075 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6076 else
6077 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6079 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6080 return false;
6082 /* Split the offset into second_offset and the rest. */
6083 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6084 *offset2 = gen_int_mode (second_offset, Pmode);
6085 return true;
6087 else
6089 /* Get the mode we should use as the basis of the range. For structure
6090 modes this is the mode of one vector. */
6091 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6092 machine_mode step_mode
6093 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6095 /* Get the "mul vl" multiplier we'd like to use. */
6096 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6097 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6098 if (vec_flags & VEC_SVE_DATA)
6099 /* LDR supports a 9-bit range, but the move patterns for
6100 structure modes require all vectors to be in range of the
6101 same base. The simplest way of accommodating that while still
6102 promoting reuse of anchor points between different modes is
6103 to use an 8-bit range unconditionally. */
6104 vnum = ((vnum + 128) & 255) - 128;
6105 else
6106 /* Predicates are only handled singly, so we might as well use
6107 the full range. */
6108 vnum = ((vnum + 256) & 511) - 256;
6109 if (vnum == 0)
6110 return false;
6112 /* Convert the "mul vl" multiplier into a byte offset. */
6113 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6114 if (known_eq (second_offset, orig_offset))
6115 return false;
6117 /* Split the offset into second_offset and the rest. */
6118 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6119 *offset2 = gen_int_mode (second_offset, Pmode);
6120 return true;
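/* Illustrative example (constant-size case): for a TImode access at
   orig_offset 0x2010, second_offset becomes ((0x2010 + 0x100) & 0x1f8)
   - 0x100 == 0x10, so the address is split into an anchor at +0x2000
   plus a residual offset of 16, which satisfies both the signed scaled
   7-bit and the signed unscaled 9-bit ranges.  */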
6124 /* Return the binary representation of floating point constant VALUE in INTVAL.
6125 If the value cannot be converted, return false without setting INTVAL.
6126 The conversion is done in the given MODE. */
6127 bool
6128 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6131 /* We make a general exception for 0. */
6132 if (aarch64_float_const_zero_rtx_p (value))
6134 *intval = 0;
6135 return true;
6138 scalar_float_mode mode;
6139 if (GET_CODE (value) != CONST_DOUBLE
6140 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6141 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6142 /* Only support up to DF mode. */
6143 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6144 return false;
6146 unsigned HOST_WIDE_INT ival = 0;
6148 long res[2];
6149 real_to_target (res,
6150 CONST_DOUBLE_REAL_VALUE (value),
6151 REAL_MODE_FORMAT (mode));
6153 if (mode == DFmode)
6155 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6156 ival = zext_hwi (res[order], 32);
6157 ival |= (zext_hwi (res[1 - order], 32) << 32);
6159 else
6160 ival = zext_hwi (res[0], 32);
6162 *intval = ival;
6163 return true;
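/* For example, the DFmode constant 1.0 yields 0x3ff0000000000000 and the
   SFmode constant 1.0 yields 0x3f800000 (zero-extended to 64 bits).  */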
6166 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6167 single MOV(+MOVK) followed by an FMOV. */
6168 bool
6169 aarch64_float_const_rtx_p (rtx x)
6171 machine_mode mode = GET_MODE (x);
6172 if (mode == VOIDmode)
6173 return false;
6175 /* Determine whether it's cheaper to write float constants as
6176 mov/movk pairs rather than as ldr/adrp pairs. */
6177 unsigned HOST_WIDE_INT ival;
6179 if (GET_CODE (x) == CONST_DOUBLE
6180 && SCALAR_FLOAT_MODE_P (mode)
6181 && aarch64_reinterpret_float_as_int (x, &ival))
6183 scalar_int_mode imode = (mode == HFmode
6184 ? SImode
6185 : int_mode_for_mode (mode).require ());
6186 int num_instr = aarch64_internal_mov_immediate
6187 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6188 return num_instr < 3;
6191 return false;
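/* Illustrative example: DFmode 100.0 has the bit pattern
   0x4059000000000000, which a single MOVZ (#0x4059, LSL #48) can
   materialize; num_instr == 1 < 3, so the function returns true and the
   MOV+FMOV sequence is considered cheap enough.  */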
6194 /* Return TRUE if rtx X is immediate constant 0.0 */
6195 bool
6196 aarch64_float_const_zero_rtx_p (rtx x)
6198 if (GET_MODE (x) == VOIDmode)
6199 return false;
6201 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6202 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6203 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6206 /* Return TRUE if rtx X is an immediate constant that fits in a single
6207 MOVI immediate operation. */
6208 bool
6209 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6211 if (!TARGET_SIMD)
6212 return false;
6214 machine_mode vmode;
6215 scalar_int_mode imode;
6216 unsigned HOST_WIDE_INT ival;
6218 if (GET_CODE (x) == CONST_DOUBLE
6219 && SCALAR_FLOAT_MODE_P (mode))
6221 if (!aarch64_reinterpret_float_as_int (x, &ival))
6222 return false;
6224 /* We make a general exception for 0. */
6225 if (aarch64_float_const_zero_rtx_p (x))
6226 return true;
6228 imode = int_mode_for_mode (mode).require ();
6230 else if (GET_CODE (x) == CONST_INT
6231 && is_a <scalar_int_mode> (mode, &imode))
6232 ival = INTVAL (x);
6233 else
6234 return false;
6236 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
6237 a 128-bit vector mode. */
6238 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6240 vmode = aarch64_simd_container_mode (imode, width);
6241 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6243 return aarch64_simd_valid_immediate (v_op, NULL);
6247 /* Return the fixed registers used for condition codes. */
6249 static bool
6250 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6252 *p1 = CC_REGNUM;
6253 *p2 = INVALID_REGNUM;
6254 return true;
6257 /* This function is used by the call expanders of the machine description.
6258 RESULT is the register in which the result is returned. It's NULL for
6259 "call" and "sibcall".
6260 MEM is the location of the function call.
6261 SIBCALL indicates whether this function call is a normal call or a sibling call.
6262 It will generate a different pattern accordingly. */
6264 void
6265 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6267 rtx call, callee, tmp;
6268 rtvec vec;
6269 machine_mode mode;
6271 gcc_assert (MEM_P (mem));
6272 callee = XEXP (mem, 0);
6273 mode = GET_MODE (callee);
6274 gcc_assert (mode == Pmode);
6276 /* Decide if we should generate indirect calls by loading the
6277 address of the callee into a register before performing
6278 the branch-and-link. */
6279 if (SYMBOL_REF_P (callee)
6280 ? (aarch64_is_long_call_p (callee)
6281 || aarch64_is_noplt_call_p (callee))
6282 : !REG_P (callee))
6283 XEXP (mem, 0) = force_reg (mode, callee);
6285 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6287 if (result != NULL_RTX)
6288 call = gen_rtx_SET (result, call);
6290 if (sibcall)
6291 tmp = ret_rtx;
6292 else
6293 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6295 vec = gen_rtvec (2, call, tmp);
6296 call = gen_rtx_PARALLEL (VOIDmode, vec);
6298 aarch64_emit_call_insn (call);
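/* The emitted pattern is therefore roughly
   (parallel [(call (mem:DI (symbol_ref "fn")) (const_int 0))
              (clobber (reg:DI LR_REGNUM))])
   for a normal call, with the CALL wrapped in a SET of RESULT when a
   value is returned, and the clobber replaced by (return) for sibcalls.  */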
6301 /* Emit call insn with PAT and do aarch64-specific handling. */
6303 void
6304 aarch64_emit_call_insn (rtx pat)
6306 rtx insn = emit_call_insn (pat);
6308 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6309 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6310 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6313 machine_mode
6314 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6316 /* All floating point compares return CCFP if it is an equality
6317 comparison, and CCFPE otherwise. */
6318 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6320 switch (code)
6322 case EQ:
6323 case NE:
6324 case UNORDERED:
6325 case ORDERED:
6326 case UNLT:
6327 case UNLE:
6328 case UNGT:
6329 case UNGE:
6330 case UNEQ:
6331 return CCFPmode;
6333 case LT:
6334 case LE:
6335 case GT:
6336 case GE:
6337 case LTGT:
6338 return CCFPEmode;
6340 default:
6341 gcc_unreachable ();
6345 /* Equality comparisons of short modes against zero can be performed
6346 using the TST instruction with the appropriate bitmask. */
6347 if (y == const0_rtx && REG_P (x)
6348 && (code == EQ || code == NE)
6349 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6350 return CC_NZmode;
6352 /* Similarly, comparisons of zero_extends from shorter modes can
6353 be performed using an ANDS with an immediate mask. */
6354 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6355 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6356 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6357 && (code == EQ || code == NE))
6358 return CC_NZmode;
6360 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6361 && y == const0_rtx
6362 && (code == EQ || code == NE || code == LT || code == GE)
6363 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6364 || GET_CODE (x) == NEG
6365 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6366 && CONST_INT_P (XEXP (x, 2)))))
6367 return CC_NZmode;
6369 /* A compare with a shifted operand. Because of canonicalization,
6370 the comparison will have to be swapped when we emit the assembly
6371 code. */
6372 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6373 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6374 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6375 || GET_CODE (x) == LSHIFTRT
6376 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6377 return CC_SWPmode;
6379 /* Similarly for a negated operand, but we can only do this for
6380 equalities. */
6381 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6382 && (REG_P (y) || GET_CODE (y) == SUBREG)
6383 && (code == EQ || code == NE)
6384 && GET_CODE (x) == NEG)
6385 return CC_Zmode;
6387 /* A test for unsigned overflow. */
6388 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6389 && code == NE
6390 && GET_CODE (x) == PLUS
6391 && GET_CODE (y) == ZERO_EXTEND)
6392 return CC_Cmode;
6394 /* For everything else, return CCmode. */
6395 return CCmode;
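/* Illustrative example: (compare (ashift:DI (reg:DI a) (const_int 2))
   (reg:DI b)) gets CC_SWPmode; a shifted operand can only appear as the
   second source of CMP, so the comparison is emitted with the operands
   swapped and aarch64_get_condition_code_1 below swaps GT/LT, GE/LE
   etc. to compensate.  */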
6398 static int
6399 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6402 aarch64_get_condition_code (rtx x)
6404 machine_mode mode = GET_MODE (XEXP (x, 0));
6405 enum rtx_code comp_code = GET_CODE (x);
6407 if (GET_MODE_CLASS (mode) != MODE_CC)
6408 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6409 return aarch64_get_condition_code_1 (mode, comp_code);
6412 static int
6413 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6415 switch (mode)
6417 case E_CCFPmode:
6418 case E_CCFPEmode:
6419 switch (comp_code)
6421 case GE: return AARCH64_GE;
6422 case GT: return AARCH64_GT;
6423 case LE: return AARCH64_LS;
6424 case LT: return AARCH64_MI;
6425 case NE: return AARCH64_NE;
6426 case EQ: return AARCH64_EQ;
6427 case ORDERED: return AARCH64_VC;
6428 case UNORDERED: return AARCH64_VS;
6429 case UNLT: return AARCH64_LT;
6430 case UNLE: return AARCH64_LE;
6431 case UNGT: return AARCH64_HI;
6432 case UNGE: return AARCH64_PL;
6433 default: return -1;
6435 break;
6437 case E_CCmode:
6438 switch (comp_code)
6440 case NE: return AARCH64_NE;
6441 case EQ: return AARCH64_EQ;
6442 case GE: return AARCH64_GE;
6443 case GT: return AARCH64_GT;
6444 case LE: return AARCH64_LE;
6445 case LT: return AARCH64_LT;
6446 case GEU: return AARCH64_CS;
6447 case GTU: return AARCH64_HI;
6448 case LEU: return AARCH64_LS;
6449 case LTU: return AARCH64_CC;
6450 default: return -1;
6452 break;
6454 case E_CC_SWPmode:
6455 switch (comp_code)
6457 case NE: return AARCH64_NE;
6458 case EQ: return AARCH64_EQ;
6459 case GE: return AARCH64_LE;
6460 case GT: return AARCH64_LT;
6461 case LE: return AARCH64_GE;
6462 case LT: return AARCH64_GT;
6463 case GEU: return AARCH64_LS;
6464 case GTU: return AARCH64_CC;
6465 case LEU: return AARCH64_CS;
6466 case LTU: return AARCH64_HI;
6467 default: return -1;
6469 break;
6471 case E_CC_NZmode:
6472 switch (comp_code)
6474 case NE: return AARCH64_NE;
6475 case EQ: return AARCH64_EQ;
6476 case GE: return AARCH64_PL;
6477 case LT: return AARCH64_MI;
6478 default: return -1;
6480 break;
6482 case E_CC_Zmode:
6483 switch (comp_code)
6485 case NE: return AARCH64_NE;
6486 case EQ: return AARCH64_EQ;
6487 default: return -1;
6489 break;
6491 case E_CC_Cmode:
6492 switch (comp_code)
6494 case NE: return AARCH64_CS;
6495 case EQ: return AARCH64_CC;
6496 default: return -1;
6498 break;
6500 default:
6501 return -1;
6504 return -1;
6507 bool
6508 aarch64_const_vec_all_same_in_range_p (rtx x,
6509 HOST_WIDE_INT minval,
6510 HOST_WIDE_INT maxval)
6512 rtx elt;
6513 return (const_vec_duplicate_p (x, &elt)
6514 && CONST_INT_P (elt)
6515 && IN_RANGE (INTVAL (elt), minval, maxval));
6518 bool
6519 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6521 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6524 /* Return true if VEC is a constant in which every element is in the range
6525 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6527 static bool
6528 aarch64_const_vec_all_in_range_p (rtx vec,
6529 HOST_WIDE_INT minval,
6530 HOST_WIDE_INT maxval)
6532 if (GET_CODE (vec) != CONST_VECTOR
6533 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6534 return false;
6536 int nunits;
6537 if (!CONST_VECTOR_STEPPED_P (vec))
6538 nunits = const_vector_encoded_nelts (vec);
6539 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6540 return false;
6542 for (int i = 0; i < nunits; i++)
6544 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6545 if (!CONST_INT_P (vec_elem)
6546 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6547 return false;
6549 return true;
6552 /* N Z C V. */
6553 #define AARCH64_CC_V 1
6554 #define AARCH64_CC_C (1 << 1)
6555 #define AARCH64_CC_Z (1 << 2)
6556 #define AARCH64_CC_N (1 << 3)
6558 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6559 static const int aarch64_nzcv_codes[] =
6561 0, /* EQ, Z == 1. */
6562 AARCH64_CC_Z, /* NE, Z == 0. */
6563 0, /* CS, C == 1. */
6564 AARCH64_CC_C, /* CC, C == 0. */
6565 0, /* MI, N == 1. */
6566 AARCH64_CC_N, /* PL, N == 0. */
6567 0, /* VS, V == 1. */
6568 AARCH64_CC_V, /* VC, V == 0. */
6569 0, /* HI, C == 1 && Z == 0. */
6570 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6571 AARCH64_CC_V, /* GE, N == V. */
6572 0, /* LT, N != V. */
6573 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6574 0, /* LE, !(Z == 0 && N == V). */
6575 0, /* AL, Any. */
6576 0 /* NV, Any. */
6579 /* Print floating-point vector immediate operand X to F, negating it
6580 first if NEGATE is true. Return true on success, false if it isn't
6581 a constant we can handle. */
6583 static bool
6584 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6586 rtx elt;
6588 if (!const_vec_duplicate_p (x, &elt))
6589 return false;
6591 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6592 if (negate)
6593 r = real_value_negate (&r);
6595 /* We only handle the SVE single-bit immediates here. */
6596 if (real_equal (&r, &dconst0))
6597 asm_fprintf (f, "0.0");
6598 else if (real_equal (&r, &dconst1))
6599 asm_fprintf (f, "1.0");
6600 else if (real_equal (&r, &dconsthalf))
6601 asm_fprintf (f, "0.5");
6602 else
6603 return false;
6605 return true;
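/* These immediates appear, for example, in SVE instructions such as
   "fadd z0.d, p0/m, z0.d, #0.5", whose output uses this routine to
   print the constant.  */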
6608 /* Return the equivalent letter for size. */
6609 static char
6610 sizetochar (int size)
6612 switch (size)
6614 case 64: return 'd';
6615 case 32: return 's';
6616 case 16: return 'h';
6617 case 8 : return 'b';
6618 default: gcc_unreachable ();
6622 /* Print operand X to file F in a target specific manner according to CODE.
6623 The acceptable formatting commands given by CODE are:
6624 'c': An integer or symbol address without a preceding #
6625 sign.
6626 'C': Take the duplicated element in a vector constant
6627 and print it in hex.
6628 'D': Take the duplicated element in a vector constant
6629 and print it as an unsigned integer, in decimal.
6630 'e': Print the sign/zero-extend size as a character 8->b,
6631 16->h, 32->w.
6632 'p': Prints N such that 2^N == X (X must be a power of 2 and
6633 a const_int).
6634 'P': Print the number of non-zero bits in X (a const_int).
6635 'H': Print the higher numbered register of a pair (TImode)
6636 of regs.
6637 'm': Print a condition (eq, ne, etc).
6638 'M': Same as 'm', but invert condition.
6639 'N': Take the duplicated element in a vector constant
6640 and print the negative of it in decimal.
6641 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6642 'S/T/U/V': Print a FP/SIMD register name for a register list.
6643 The register printed is the FP/SIMD register name
6644 of X + 0/1/2/3 for S/T/U/V.
6645 'R': Print a scalar FP/SIMD register name + 1.
6646 'X': Print bottom 16 bits of integer constant in hex.
6647 'w/x': Print a general register name or the zero register
6648 (32-bit or 64-bit).
6649 '0': Print a normal operand, if it's a general register,
6650 then we assume DImode.
6651 'k': Print NZCV for conditional compare instructions.
6652 'A': Output address constant representing the first
6653 argument of X, specifying a relocation offset
6654 if appropriate.
6655 'L': Output constant address specified by X
6656 with a relocation offset if appropriate.
6657 'G': Prints address of X, specifying a PC relative
6658 relocation mode if appropriate.
6659 'y': Output address of LDP or STP - this is used for
6660 some LDP/STPs which don't use a PARALLEL in their
6661 pattern (so the mode needs to be adjusted).
6662 'z': Output address of a typical LDP or STP. */
6664 static void
6665 aarch64_print_operand (FILE *f, rtx x, int code)
6667 rtx elt;
6668 switch (code)
6670 case 'c':
6671 switch (GET_CODE (x))
6673 case CONST_INT:
6674 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6675 break;
6677 case SYMBOL_REF:
6678 output_addr_const (f, x);
6679 break;
6681 case CONST:
6682 if (GET_CODE (XEXP (x, 0)) == PLUS
6683 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6685 output_addr_const (f, x);
6686 break;
6688 /* Fall through. */
6690 default:
6691 output_operand_lossage ("unsupported operand for code '%c'", code);
6693 break;
6695 case 'e':
6697 int n;
6699 if (!CONST_INT_P (x)
6700 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6702 output_operand_lossage ("invalid operand for '%%%c'", code);
6703 return;
6706 switch (n)
6708 case 3:
6709 fputc ('b', f);
6710 break;
6711 case 4:
6712 fputc ('h', f);
6713 break;
6714 case 5:
6715 fputc ('w', f);
6716 break;
6717 default:
6718 output_operand_lossage ("invalid operand for '%%%c'", code);
6719 return;
6722 break;
6724 case 'p':
6726 int n;
6728 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6730 output_operand_lossage ("invalid operand for '%%%c'", code);
6731 return;
6734 asm_fprintf (f, "%d", n);
6736 break;
6738 case 'P':
6739 if (!CONST_INT_P (x))
6741 output_operand_lossage ("invalid operand for '%%%c'", code);
6742 return;
6745 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6746 break;
6748 case 'H':
6749 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6751 output_operand_lossage ("invalid operand for '%%%c'", code);
6752 return;
6755 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6756 break;
6758 case 'M':
6759 case 'm':
6761 int cond_code;
6762 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6763 if (x == const_true_rtx)
6765 if (code == 'M')
6766 fputs ("nv", f);
6767 return;
6770 if (!COMPARISON_P (x))
6772 output_operand_lossage ("invalid operand for '%%%c'", code);
6773 return;
6776 cond_code = aarch64_get_condition_code (x);
6777 gcc_assert (cond_code >= 0);
6778 if (code == 'M')
6779 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6780 fputs (aarch64_condition_codes[cond_code], f);
6782 break;
6784 case 'N':
6785 if (!const_vec_duplicate_p (x, &elt))
6787 output_operand_lossage ("invalid vector constant");
6788 return;
6791 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6792 asm_fprintf (f, "%wd", -INTVAL (elt));
6793 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6794 && aarch64_print_vector_float_operand (f, x, true))
6796 else
6798 output_operand_lossage ("invalid vector constant");
6799 return;
6801 break;
6803 case 'b':
6804 case 'h':
6805 case 's':
6806 case 'd':
6807 case 'q':
6808 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6810 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6811 return;
6813 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6814 break;
6816 case 'S':
6817 case 'T':
6818 case 'U':
6819 case 'V':
6820 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6822 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6823 return;
6825 asm_fprintf (f, "%c%d",
6826 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6827 REGNO (x) - V0_REGNUM + (code - 'S'));
6828 break;
6830 case 'R':
6831 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6833 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6834 return;
6836 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6837 break;
6839 case 'X':
6840 if (!CONST_INT_P (x))
6842 output_operand_lossage ("invalid operand for '%%%c'", code);
6843 return;
6845 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6846 break;
6848 case 'C':
6850 /* Print a replicated constant in hex. */
6851 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6853 output_operand_lossage ("invalid operand for '%%%c'", code);
6854 return;
6856 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6857 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6859 break;
6861 case 'D':
6863 /* Print a replicated constant in decimal, treating it as
6864 unsigned. */
6865 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6867 output_operand_lossage ("invalid operand for '%%%c'", code);
6868 return;
6870 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6871 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6873 break;
6875 case 'w':
6876 case 'x':
6877 if (x == const0_rtx
6878 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6880 asm_fprintf (f, "%czr", code);
6881 break;
6884 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6886 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6887 break;
6890 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6892 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6893 break;
6896 /* Fall through */
6898 case 0:
6899 if (x == NULL)
6901 output_operand_lossage ("missing operand");
6902 return;
6905 switch (GET_CODE (x))
6907 case REG:
6908 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6910 if (REG_NREGS (x) == 1)
6911 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6912 else
6914 char suffix
6915 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6916 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6917 REGNO (x) - V0_REGNUM, suffix,
6918 END_REGNO (x) - V0_REGNUM - 1, suffix);
6921 else
6922 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6923 break;
6925 case MEM:
6926 output_address (GET_MODE (x), XEXP (x, 0));
6927 break;
6929 case LABEL_REF:
6930 case SYMBOL_REF:
6931 output_addr_const (asm_out_file, x);
6932 break;
6934 case CONST_INT:
6935 asm_fprintf (f, "%wd", INTVAL (x));
6936 break;
6938 case CONST:
6939 if (!VECTOR_MODE_P (GET_MODE (x)))
6941 output_addr_const (asm_out_file, x);
6942 break;
6944 /* fall through */
6946 case CONST_VECTOR:
6947 if (!const_vec_duplicate_p (x, &elt))
6949 output_operand_lossage ("invalid vector constant");
6950 return;
6953 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6954 asm_fprintf (f, "%wd", INTVAL (elt));
6955 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6956 && aarch64_print_vector_float_operand (f, x, false))
6958 else
6960 output_operand_lossage ("invalid vector constant");
6961 return;
6963 break;
6965 case CONST_DOUBLE:
6966 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6967 be getting CONST_DOUBLEs holding integers. */
6968 gcc_assert (GET_MODE (x) != VOIDmode);
6969 if (aarch64_float_const_zero_rtx_p (x))
6971 fputc ('0', f);
6972 break;
6974 else if (aarch64_float_const_representable_p (x))
6976 #define buf_size 20
6977 char float_buf[buf_size] = {'\0'};
6978 real_to_decimal_for_mode (float_buf,
6979 CONST_DOUBLE_REAL_VALUE (x),
6980 buf_size, buf_size,
6981 1, GET_MODE (x));
6982 asm_fprintf (asm_out_file, "%s", float_buf);
6983 break;
6984 #undef buf_size
6986 output_operand_lossage ("invalid constant");
6987 return;
6988 default:
6989 output_operand_lossage ("invalid operand");
6990 return;
6992 break;
6994 case 'A':
6995 if (GET_CODE (x) == HIGH)
6996 x = XEXP (x, 0);
6998 switch (aarch64_classify_symbolic_expression (x))
7000 case SYMBOL_SMALL_GOT_4G:
7001 asm_fprintf (asm_out_file, ":got:");
7002 break;
7004 case SYMBOL_SMALL_TLSGD:
7005 asm_fprintf (asm_out_file, ":tlsgd:");
7006 break;
7008 case SYMBOL_SMALL_TLSDESC:
7009 asm_fprintf (asm_out_file, ":tlsdesc:");
7010 break;
7012 case SYMBOL_SMALL_TLSIE:
7013 asm_fprintf (asm_out_file, ":gottprel:");
7014 break;
7016 case SYMBOL_TLSLE24:
7017 asm_fprintf (asm_out_file, ":tprel:");
7018 break;
7020 case SYMBOL_TINY_GOT:
7021 gcc_unreachable ();
7022 break;
7024 default:
7025 break;
7027 output_addr_const (asm_out_file, x);
7028 break;
7030 case 'L':
7031 switch (aarch64_classify_symbolic_expression (x))
7033 case SYMBOL_SMALL_GOT_4G:
7034 asm_fprintf (asm_out_file, ":lo12:");
7035 break;
7037 case SYMBOL_SMALL_TLSGD:
7038 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7039 break;
7041 case SYMBOL_SMALL_TLSDESC:
7042 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7043 break;
7045 case SYMBOL_SMALL_TLSIE:
7046 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7047 break;
7049 case SYMBOL_TLSLE12:
7050 asm_fprintf (asm_out_file, ":tprel_lo12:");
7051 break;
7053 case SYMBOL_TLSLE24:
7054 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7055 break;
7057 case SYMBOL_TINY_GOT:
7058 asm_fprintf (asm_out_file, ":got:");
7059 break;
7061 case SYMBOL_TINY_TLSIE:
7062 asm_fprintf (asm_out_file, ":gottprel:");
7063 break;
7065 default:
7066 break;
7068 output_addr_const (asm_out_file, x);
7069 break;
7071 case 'G':
7072 switch (aarch64_classify_symbolic_expression (x))
7074 case SYMBOL_TLSLE24:
7075 asm_fprintf (asm_out_file, ":tprel_hi12:");
7076 break;
7077 default:
7078 break;
7080 output_addr_const (asm_out_file, x);
7081 break;
7083 case 'k':
7085 HOST_WIDE_INT cond_code;
7087 if (!CONST_INT_P (x))
7089 output_operand_lossage ("invalid operand for '%%%c'", code);
7090 return;
7093 cond_code = INTVAL (x);
7094 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7095 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7097 break;
7099 case 'y':
7100 case 'z':
7102 machine_mode mode = GET_MODE (x);
7104 if (GET_CODE (x) != MEM
7105 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7107 output_operand_lossage ("invalid operand for '%%%c'", code);
7108 return;
7111 if (code == 'y')
7112 /* LDP/STP which uses a single double-width memory operand.
7113 Adjust the mode to appear like a typical LDP/STP.
7114 Currently this is supported for 16-byte accesses only. */
7115 mode = DFmode;
7117 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7118 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7120 break;
7122 default:
7123 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7124 return;
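/* For instance, "%w0" prints "w3" for a general register operand in x3
   and "wzr" for const0_rtx, while "%x0" prints "x3"/"xzr" for the same
   operands.  */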
7128 /* Print address 'x' of a memory access with mode 'mode'.
7129 TYPE is the aarch64_addr_query_type context required by aarch64_classify_address:
7130 e.g. ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
7131 static bool
7132 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7133 aarch64_addr_query_type type)
7135 struct aarch64_address_info addr;
7136 unsigned int size;
7138 /* Check all addresses are Pmode - including ILP32. */
7139 if (GET_MODE (x) != Pmode)
7140 output_operand_lossage ("invalid address mode");
7142 if (aarch64_classify_address (&addr, x, mode, true, type))
7143 switch (addr.type)
7145 case ADDRESS_REG_IMM:
7146 if (known_eq (addr.const_offset, 0))
7147 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7148 else if (aarch64_sve_data_mode_p (mode))
7150 HOST_WIDE_INT vnum
7151 = exact_div (addr.const_offset,
7152 BYTES_PER_SVE_VECTOR).to_constant ();
7153 asm_fprintf (f, "[%s, #%wd, mul vl]",
7154 reg_names[REGNO (addr.base)], vnum);
7156 else if (aarch64_sve_pred_mode_p (mode))
7158 HOST_WIDE_INT vnum
7159 = exact_div (addr.const_offset,
7160 BYTES_PER_SVE_PRED).to_constant ();
7161 asm_fprintf (f, "[%s, #%wd, mul vl]",
7162 reg_names[REGNO (addr.base)], vnum);
7164 else
7165 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7166 INTVAL (addr.offset));
7167 return true;
7169 case ADDRESS_REG_REG:
7170 if (addr.shift == 0)
7171 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7172 reg_names [REGNO (addr.offset)]);
7173 else
7174 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7175 reg_names [REGNO (addr.offset)], addr.shift);
7176 return true;
7178 case ADDRESS_REG_UXTW:
7179 if (addr.shift == 0)
7180 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7181 REGNO (addr.offset) - R0_REGNUM);
7182 else
7183 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7184 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7185 return true;
7187 case ADDRESS_REG_SXTW:
7188 if (addr.shift == 0)
7189 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7190 REGNO (addr.offset) - R0_REGNUM);
7191 else
7192 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7193 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7194 return true;
7196 case ADDRESS_REG_WB:
7197 /* Writeback is only supported for fixed-width modes. */
7198 size = GET_MODE_SIZE (mode).to_constant ();
7199 switch (GET_CODE (x))
7201 case PRE_INC:
7202 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7203 return true;
7204 case POST_INC:
7205 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7206 return true;
7207 case PRE_DEC:
7208 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7209 return true;
7210 case POST_DEC:
7211 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7212 return true;
7213 case PRE_MODIFY:
7214 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7215 INTVAL (addr.offset));
7216 return true;
7217 case POST_MODIFY:
7218 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7219 INTVAL (addr.offset));
7220 return true;
7221 default:
7222 break;
7224 break;
7226 case ADDRESS_LO_SUM:
7227 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7228 output_addr_const (f, addr.offset);
7229 asm_fprintf (f, "]");
7230 return true;
7232 case ADDRESS_SYMBOLIC:
7233 output_addr_const (f, x);
7234 return true;
7237 return false;
7240 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7241 static bool
7242 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7244 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7247 /* Print address 'x' of a memory access with mode 'mode'. */
7248 static void
7249 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7251 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7252 output_addr_const (f, x);
7255 bool
7256 aarch64_label_mentioned_p (rtx x)
7258 const char *fmt;
7259 int i;
7261 if (GET_CODE (x) == LABEL_REF)
7262 return true;
7264 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7265 referencing instruction, but they are constant offsets, not
7266 symbols. */
7267 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7268 return false;
7270 fmt = GET_RTX_FORMAT (GET_CODE (x));
7271 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7273 if (fmt[i] == 'E')
7275 int j;
7277 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7278 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7279 return 1;
7281 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7282 return 1;
7285 return 0;
7288 /* Implement REGNO_REG_CLASS. */
7290 enum reg_class
7291 aarch64_regno_regclass (unsigned regno)
7293 if (GP_REGNUM_P (regno))
7294 return GENERAL_REGS;
7296 if (regno == SP_REGNUM)
7297 return STACK_REG;
7299 if (regno == FRAME_POINTER_REGNUM
7300 || regno == ARG_POINTER_REGNUM)
7301 return POINTER_REGS;
7303 if (FP_REGNUM_P (regno))
7304 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7306 if (PR_REGNUM_P (regno))
7307 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7309 return NO_REGS;
7312 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7313 If OFFSET is out of range, return an offset of an anchor point
7314 that is in range. Return 0 otherwise. */
7316 static HOST_WIDE_INT
7317 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7318 machine_mode mode)
7320 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7321 if (size > 16)
7322 return (offset + 0x400) & ~0x7f0;
7324 /* For offsets that aren't a multiple of the access size, the limit is
7325 -256...255. */
7326 if (offset & (size - 1))
7328 /* BLKmode typically uses LDP of X-registers. */
7329 if (mode == BLKmode)
7330 return (offset + 512) & ~0x3ff;
7331 return (offset + 0x100) & ~0x1ff;
7334 /* Small negative offsets are supported. */
7335 if (IN_RANGE (offset, -256, 0))
7336 return 0;
7338 if (mode == TImode || mode == TFmode)
7339 return (offset + 0x100) & ~0x1ff;
7341 /* Use a 12-bit offset scaled by the access size. */
7342 return offset & (~0xfff * size);
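/* Worked example: a 4-byte access at offset 0x12344 is aligned, so the
   final case applies and the anchor is 0x12344 & ~0x3fff == 0x10000,
   leaving a residual offset of 0x2344 that fits the unsigned scaled
   12-bit LDR/STR range (at most 4095 * 4).  */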
7345 static rtx
7346 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7348 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7349 where mask is selected by alignment and size of the offset.
7350 We try to pick as large a range for the offset as possible to
7351 maximize the chance of a CSE. However, for aligned addresses
7352 we limit the range to 4k so that structures with different sized
7353 elements are likely to use the same base. We need to be careful
7354 not to split a CONST for some forms of address expression, otherwise
7355 it will generate sub-optimal code. */
7357 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7359 rtx base = XEXP (x, 0);
7360 rtx offset_rtx = XEXP (x, 1);
7361 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7363 if (GET_CODE (base) == PLUS)
7365 rtx op0 = XEXP (base, 0);
7366 rtx op1 = XEXP (base, 1);
7368 /* Force any scaling into a temp for CSE. */
7369 op0 = force_reg (Pmode, op0);
7370 op1 = force_reg (Pmode, op1);
7372 /* Let the pointer register be in op0. */
7373 if (REG_POINTER (op1))
7374 std::swap (op0, op1);
7376 /* If the pointer is virtual or frame related, then we know that
7377 virtual register instantiation or register elimination is going
7378 to apply a second constant. We want the two constants folded
7379 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7380 if (virt_or_elim_regno_p (REGNO (op0)))
7382 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7383 NULL_RTX, true, OPTAB_DIRECT);
7384 return gen_rtx_PLUS (Pmode, base, op1);
7387 /* Otherwise, in order to encourage CSE (and thence loop strength
7388 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7389 base = expand_binop (Pmode, add_optab, op0, op1,
7390 NULL_RTX, true, OPTAB_DIRECT);
7391 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7394 HOST_WIDE_INT size;
7395 if (GET_MODE_SIZE (mode).is_constant (&size))
7397 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7398 mode);
7399 if (base_offset != 0)
7401 base = plus_constant (Pmode, base, base_offset);
7402 base = force_operand (base, NULL_RTX);
7403 return plus_constant (Pmode, base, offset - base_offset);
7408 return x;
7411 /* Return the reload icode required for a constant pool in mode. */
7412 static enum insn_code
7413 aarch64_constant_pool_reload_icode (machine_mode mode)
7415 switch (mode)
7417 case E_SFmode:
7418 return CODE_FOR_aarch64_reload_movcpsfdi;
7420 case E_DFmode:
7421 return CODE_FOR_aarch64_reload_movcpdfdi;
7423 case E_TFmode:
7424 return CODE_FOR_aarch64_reload_movcptfdi;
7426 case E_V8QImode:
7427 return CODE_FOR_aarch64_reload_movcpv8qidi;
7429 case E_V16QImode:
7430 return CODE_FOR_aarch64_reload_movcpv16qidi;
7432 case E_V4HImode:
7433 return CODE_FOR_aarch64_reload_movcpv4hidi;
7435 case E_V8HImode:
7436 return CODE_FOR_aarch64_reload_movcpv8hidi;
7438 case E_V2SImode:
7439 return CODE_FOR_aarch64_reload_movcpv2sidi;
7441 case E_V4SImode:
7442 return CODE_FOR_aarch64_reload_movcpv4sidi;
7444 case E_V2DImode:
7445 return CODE_FOR_aarch64_reload_movcpv2didi;
7447 case E_V2DFmode:
7448 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7450 default:
7451 gcc_unreachable ();
7454 gcc_unreachable ();
7456 static reg_class_t
7457 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7458 reg_class_t rclass,
7459 machine_mode mode,
7460 secondary_reload_info *sri)
7462 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7463 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7464 comment at the head of aarch64-sve.md for more details about the
7465 big-endian handling. */
7466 if (BYTES_BIG_ENDIAN
7467 && reg_class_subset_p (rclass, FP_REGS)
7468 && !((REG_P (x) && HARD_REGISTER_P (x))
7469 || aarch64_simd_valid_immediate (x, NULL))
7470 && aarch64_sve_data_mode_p (mode))
7472 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7473 return NO_REGS;
7476 /* If we have to disable direct literal pool loads and stores because the
7477 function is too big, then we need a scratch register. */
7478 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7479 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7480 || targetm.vector_mode_supported_p (GET_MODE (x)))
7481 && !aarch64_pcrelative_literal_loads)
7483 sri->icode = aarch64_constant_pool_reload_icode (mode);
7484 return NO_REGS;
7487 /* Without the TARGET_SIMD instructions we cannot move a Q register
7488 to a Q register directly. We need a scratch. */
7489 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7490 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7491 && reg_class_subset_p (rclass, FP_REGS))
7493 if (mode == TFmode)
7494 sri->icode = CODE_FOR_aarch64_reload_movtf;
7495 else if (mode == TImode)
7496 sri->icode = CODE_FOR_aarch64_reload_movti;
7497 return NO_REGS;
7500 /* A TFmode or TImode memory access should be handled via FP_REGS
7501 because AArch64 has richer addressing modes for LDR/STR instructions
7502 than LDP/STP instructions. */
7503 if (TARGET_FLOAT && rclass == GENERAL_REGS
7504 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7505 return FP_REGS;
7507 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7508 return GENERAL_REGS;
7510 return NO_REGS;
7513 static bool
7514 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7516 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7518 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7519 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7520 if (frame_pointer_needed)
7521 return to == HARD_FRAME_POINTER_REGNUM;
7522 return true;
7525 poly_int64
7526 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7528 aarch64_layout_frame ();
7530 if (to == HARD_FRAME_POINTER_REGNUM)
7532 if (from == ARG_POINTER_REGNUM)
7533 return cfun->machine->frame.hard_fp_offset;
7535 if (from == FRAME_POINTER_REGNUM)
7536 return cfun->machine->frame.hard_fp_offset
7537 - cfun->machine->frame.locals_offset;
7540 if (to == STACK_POINTER_REGNUM)
7542 if (from == FRAME_POINTER_REGNUM)
7543 return cfun->machine->frame.frame_size
7544 - cfun->machine->frame.locals_offset;
7547 return cfun->machine->frame.frame_size;
7550 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7551 previous frame. */
7554 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7556 if (count != 0)
7557 return const0_rtx;
7558 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7562 static void
7563 aarch64_asm_trampoline_template (FILE *f)
7565 if (TARGET_ILP32)
7567 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7568 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7570 else
7572 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7573 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7575 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7576 assemble_aligned_integer (4, const0_rtx);
7577 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7578 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
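/* The template therefore occupies 16 bytes of code (three instructions
   plus 4 bytes of zero padding), followed by two pointer-sized data
   slots; aarch64_trampoline_init below stores the target address and
   the static chain value into those slots at offsets 16 and
   16 + POINTER_BYTES.  */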
7581 static void
7582 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7584 rtx fnaddr, mem, a_tramp;
7585 const int tramp_code_sz = 16;
7587 /* We don't need to copy the trailing D-words; we fill those in below. */
7588 emit_block_move (m_tramp, assemble_trampoline_template (),
7589 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7590 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7591 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7592 if (GET_MODE (fnaddr) != ptr_mode)
7593 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7594 emit_move_insn (mem, fnaddr);
7596 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7597 emit_move_insn (mem, chain_value);
7599 /* XXX We should really define a "clear_cache" pattern and use
7600 gen_clear_cache(). */
7601 a_tramp = XEXP (m_tramp, 0);
7602 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7603 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7604 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7605 ptr_mode);
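/* [Editor's illustrative sketch, not part of the original source.] The
   trampoline layout implied by the two functions above: 16 bytes of code
   followed by two pointer-sized data slots, the target function address
   first and the static chain value second.  The constants mirror
   tramp_code_sz and POINTER_BYTES for LP64; ILP32 would use 4-byte slots.
   Guarded with #if 0 so it never enters the build.  */
#if 0
static void
example_trampoline_layout (void)
{
  const int tramp_code_sz = 16;  /* code part emitted by the template */
  const int pointer_bytes = 8;   /* POINTER_BYTES under LP64 */

  int fnaddr_offset = tramp_code_sz;                     /* == 16 */
  int chain_offset  = tramp_code_sz + pointer_bytes;     /* == 24 */
  int total_size    = tramp_code_sz + 2 * pointer_bytes; /* == 32 */

  (void) fnaddr_offset; (void) chain_offset; (void) total_size;
}
#endif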
7608 static unsigned char
7609 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7611 /* ??? Logically we should only need to provide a value when
7612 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7613 can hold MODE, but at the moment we need to handle all modes.
7614 Just ignore any runtime parts for registers that can't store them. */
7615 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7616 unsigned int nregs;
7617 switch (regclass)
7619 case TAILCALL_ADDR_REGS:
7620 case POINTER_REGS:
7621 case GENERAL_REGS:
7622 case ALL_REGS:
7623 case POINTER_AND_FP_REGS:
7624 case FP_REGS:
7625 case FP_LO_REGS:
7626 if (aarch64_sve_data_mode_p (mode)
7627 && constant_multiple_p (GET_MODE_SIZE (mode),
7628 BYTES_PER_SVE_VECTOR, &nregs))
7629 return nregs;
7630 return (aarch64_vector_data_mode_p (mode)
7631 ? CEIL (lowest_size, UNITS_PER_VREG)
7632 : CEIL (lowest_size, UNITS_PER_WORD));
7633 case STACK_REG:
7634 case PR_REGS:
7635 case PR_LO_REGS:
7636 case PR_HI_REGS:
7637 return 1;
7639 case NO_REGS:
7640 return 0;
7642 default:
7643 break;
7645 gcc_unreachable ();
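/* [Editor's illustrative sketch, not part of the original source.] The
   CEIL-style register-count arithmetic used above, assuming LP64 values of
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16.  A 16-byte TImode value
   needs two X registers, while a 16-byte vector fits in one Q register.
   Guarded with #if 0 so it never enters the build.  */
#if 0
static unsigned int
example_class_nregs (unsigned int mode_size_bytes, int is_vector_mode)
{
  const unsigned int units_per_word = 8;   /* X register, LP64 */
  const unsigned int units_per_vreg = 16;  /* Q register */
  unsigned int unit = is_vector_mode ? units_per_vreg : units_per_word;

  /* CEIL (size, unit): e.g. (16, 8) -> 2, (16, 16) -> 1.  */
  return (mode_size_bytes + unit - 1) / unit;
}
#endif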
7648 static reg_class_t
7649 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7651 if (regclass == POINTER_REGS)
7652 return GENERAL_REGS;
7654 if (regclass == STACK_REG)
7656 if (REG_P(x)
7657 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7658 return regclass;
7660 return NO_REGS;
7663 /* Register elimination can result in a request for
7664 SP+constant->FP_REGS. We cannot support such operations which
7665 use SP as source and an FP_REG as destination, so reject them
7666 right now. */
7667 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7669 rtx lhs = XEXP (x, 0);
7671 /* Look through a possible SUBREG introduced by ILP32. */
7672 if (GET_CODE (lhs) == SUBREG)
7673 lhs = SUBREG_REG (lhs);
7675 gcc_assert (REG_P (lhs));
7676 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7677 POINTER_REGS));
7678 return NO_REGS;
7681 return regclass;
7684 void
7685 aarch64_asm_output_labelref (FILE* f, const char *name)
7687 asm_fprintf (f, "%U%s", name);
7690 static void
7691 aarch64_elf_asm_constructor (rtx symbol, int priority)
7693 if (priority == DEFAULT_INIT_PRIORITY)
7694 default_ctor_section_asm_out_constructor (symbol, priority);
7695 else
7697 section *s;
7698 /* Although priority is known to be in the range [0, 65535], so that
7699 18 bytes would be enough, the compiler might not know that. To avoid
7700 a -Wformat-truncation false positive, use a larger size. */
7701 char buf[23];
7702 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7703 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7704 switch_to_section (s);
7705 assemble_align (POINTER_SIZE);
7706 assemble_aligned_integer (POINTER_BYTES, symbol);
7710 static void
7711 aarch64_elf_asm_destructor (rtx symbol, int priority)
7713 if (priority == DEFAULT_INIT_PRIORITY)
7714 default_dtor_section_asm_out_destructor (symbol, priority);
7715 else
7717 section *s;
7718 /* Although priority is known to be in the range [0, 65535], so that
7719 18 bytes would be enough, the compiler might not know that. To avoid
7720 a -Wformat-truncation false positive, use a larger size. */
7721 char buf[23];
7722 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7723 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7724 switch_to_section (s);
7725 assemble_align (POINTER_SIZE);
7726 assemble_aligned_integer (POINTER_BYTES, symbol);
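/* [Editor's illustrative sketch, not part of the original source.] The
   section names produced by the snprintf calls above: a constructor with
   priority 101 goes into ".init_array.00101" and a destructor into
   ".fini_array.00101".  The "%.5u" format zero-pads the priority so that
   lexicographic section ordering matches numeric priority order.  The
   helper assumes the usual <stdio.h> snprintf and is guarded with #if 0.  */
#if 0
static void
example_priority_section_name (char *buf, unsigned int bufsize, int priority,
			       int is_ctor)
{
  /* e.g. is_ctor == 1, priority == 101  ->  ".init_array.00101".  */
  snprintf (buf, bufsize, "%s.%.5u",
	    is_ctor ? ".init_array" : ".fini_array", priority);
}
#endif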
7730 const char*
7731 aarch64_output_casesi (rtx *operands)
7733 char buf[100];
7734 char label[100];
7735 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7736 int index;
7737 static const char *const patterns[4][2] =
7740 "ldrb\t%w3, [%0,%w1,uxtw]",
7741 "add\t%3, %4, %w3, sxtb #2"
7744 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7745 "add\t%3, %4, %w3, sxth #2"
7748 "ldr\t%w3, [%0,%w1,uxtw #2]",
7749 "add\t%3, %4, %w3, sxtw #2"
7751 /* We assume that DImode is only generated when not optimizing and
7752 that we don't really need 64-bit address offsets. That would
7753 imply an object file with 8GB of code in a single function! */
7755 "ldr\t%w3, [%0,%w1,uxtw #2]",
7756 "add\t%3, %4, %w3, sxtw #2"
7760 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7762 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7763 index = exact_log2 (GET_MODE_SIZE (mode));
7765 gcc_assert (index >= 0 && index <= 3);
7767 /* Need to implement table size reduction by changing the code below. */
7768 output_asm_insn (patterns[index][0], operands);
7769 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7770 snprintf (buf, sizeof (buf),
7771 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7772 output_asm_insn (buf, operands);
7773 output_asm_insn (patterns[index][1], operands);
7774 output_asm_insn ("br\t%3", operands);
7775 assemble_label (asm_out_file, label);
7776 return "";
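/* [Editor's illustrative sketch, not part of the original source.] How the
   pattern index above is derived from the jump-table element size:
   exact_log2 of the size in bytes selects the row, so 1-byte entries use
   the LDRB/SXTB pair, 2-byte entries the LDRH/SXTH pair and 4-byte (or,
   per the comment above, 8-byte) entries the LDR/SXTW pair.  Guarded with
   #if 0 so it never enters the build.  */
#if 0
static int
example_casesi_pattern_index (unsigned int element_size_bytes)
{
  switch (element_size_bytes)
    {
    case 1: return 0;	/* ldrb + add ..., sxtb #2 */
    case 2: return 1;	/* ldrh + add ..., sxth #2 */
    case 4: return 2;	/* ldr  + add ..., sxtw #2 */
    case 8: return 3;	/* same code as the 4-byte row */
    default: return -1;	/* not a valid ADDR_DIFF_VEC element size */
    }
}
#endif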
7780 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7781 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7782 operator. */
7785 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7787 if (shift >= 0 && shift <= 3)
7789 int size;
7790 for (size = 8; size <= 32; size *= 2)
7792 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7793 if (mask == bits << shift)
7794 return size;
7797 return 0;
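/* [Editor's illustrative sketch, not part of the original source.] A
   standalone restatement of the mask test performed above.  For example, a
   shift of 2 with mask 0x3fc is 0xff << 2 and so corresponds to a UXTB
   (8-bit) extend, while a shift of 1 with mask 0x1fffe is 0xffff << 1,
   i.e. UXTH.  Guarded with #if 0 so it never enters the build.  */
#if 0
static int
example_uxt_size (int shift, unsigned long long mask)
{
  if (shift < 0 || shift > 3)
    return 0;
  for (int size = 8; size <= 32; size *= 2)
    {
      unsigned long long bits = (1ULL << size) - 1;
      if (mask == (bits << shift))
	return size;	/* 8 -> UXTB, 16 -> UXTH, 32 -> UXTW */
    }
  return 0;
}
#endif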
7800 /* Constant pools are per-function only when PC-relative
7801 literal loads are enabled or we are using the large memory
7802 model. */
7804 static inline bool
7805 aarch64_can_use_per_function_literal_pools_p (void)
7807 return (aarch64_pcrelative_literal_loads
7808 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7811 static bool
7812 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7814 /* We can't use blocks for constants when we're using a per-function
7815 constant pool. */
7816 return !aarch64_can_use_per_function_literal_pools_p ();
7819 /* Select appropriate section for constants depending
7820 on where we place literal pools. */
7822 static section *
7823 aarch64_select_rtx_section (machine_mode mode,
7824 rtx x,
7825 unsigned HOST_WIDE_INT align)
7827 if (aarch64_can_use_per_function_literal_pools_p ())
7828 return function_section (current_function_decl);
7830 return default_elf_select_rtx_section (mode, x, align);
7833 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7834 void
7835 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7836 HOST_WIDE_INT offset)
7838 /* When using per-function literal pools, we must ensure that any code
7839 section is aligned to the minimal instruction length, lest we get
7840 errors from the assembler about "unaligned instructions". */
7841 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7842 ASM_OUTPUT_ALIGN (f, 2);
7845 /* Costs. */
7847 /* Helper function for rtx cost calculation. Strip a shift expression
7848 from X. Returns the inner operand if successful, or the original
7849 expression on failure. */
7850 static rtx
7851 aarch64_strip_shift (rtx x)
7853 rtx op = x;
7855 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7856 we can convert both to ROR during final output. */
7857 if ((GET_CODE (op) == ASHIFT
7858 || GET_CODE (op) == ASHIFTRT
7859 || GET_CODE (op) == LSHIFTRT
7860 || GET_CODE (op) == ROTATERT
7861 || GET_CODE (op) == ROTATE)
7862 && CONST_INT_P (XEXP (op, 1)))
7863 return XEXP (op, 0);
7865 if (GET_CODE (op) == MULT
7866 && CONST_INT_P (XEXP (op, 1))
7867 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7868 return XEXP (op, 0);
7870 return x;
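/* [Editor's illustrative sketch, not part of the original source.] The
   multiply-as-shift equivalence the helper above relies on when it strips
   a MULT by a constant power of two: x * 2^n is the same value as x << n,
   which is why (mult x 8) can be stripped or costed like (ashift x 3).
   Guarded with #if 0 so it never enters the build.  */
#if 0
static int
example_mult_is_shift (unsigned long long x, unsigned int log2_factor)
{
  /* Holds for any log2_factor below the operand width (64 here).  */
  return (x * (1ULL << log2_factor)) == (x << log2_factor);
}
#endif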
7873 /* Helper function for rtx cost calculation. Strip an extend
7874 expression from X. Returns the inner operand if successful, or the
7875 original expression on failure. We deal with a number of possible
7876 canonicalization variations here. If STRIP_SHIFT is true, then
7877 we can strip off a shift also. */
7878 static rtx
7879 aarch64_strip_extend (rtx x, bool strip_shift)
7881 scalar_int_mode mode;
7882 rtx op = x;
7884 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7885 return op;
7887 /* Zero and sign extraction of a widened value. */
7888 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7889 && XEXP (op, 2) == const0_rtx
7890 && GET_CODE (XEXP (op, 0)) == MULT
7891 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7892 XEXP (op, 1)))
7893 return XEXP (XEXP (op, 0), 0);
7895 /* It can also be represented (for zero-extend) as an AND with an
7896 immediate. */
7897 if (GET_CODE (op) == AND
7898 && GET_CODE (XEXP (op, 0)) == MULT
7899 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7900 && CONST_INT_P (XEXP (op, 1))
7901 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7902 INTVAL (XEXP (op, 1))) != 0)
7903 return XEXP (XEXP (op, 0), 0);
7905 /* Now handle extended register, as this may also have an optional
7906 left shift by 1..4. */
7907 if (strip_shift
7908 && GET_CODE (op) == ASHIFT
7909 && CONST_INT_P (XEXP (op, 1))
7910 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7911 op = XEXP (op, 0);
7913 if (GET_CODE (op) == ZERO_EXTEND
7914 || GET_CODE (op) == SIGN_EXTEND)
7915 op = XEXP (op, 0);
7917 if (op != x)
7918 return op;
7920 return x;
7923 /* Return true iff CODE is a shift supported in combination
7924 with arithmetic instructions. */
7926 static bool
7927 aarch64_shift_p (enum rtx_code code)
7929 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7933 /* Return true iff X is a cheap shift without a sign extend. */
7935 static bool
7936 aarch64_cheap_mult_shift_p (rtx x)
7938 rtx op0, op1;
7940 op0 = XEXP (x, 0);
7941 op1 = XEXP (x, 1);
7943 if (!(aarch64_tune_params.extra_tuning_flags
7944 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7945 return false;
7947 if (GET_CODE (op0) == SIGN_EXTEND)
7948 return false;
7950 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7951 && UINTVAL (op1) <= 4)
7952 return true;
7954 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7955 return false;
7957 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7959 if (l2 > 0 && l2 <= 4)
7960 return true;
7962 return false;
7965 /* Helper function for rtx cost calculation. Calculate the cost of
7966 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7967 Return the calculated cost of the expression, recursing manually into
7968 operands where needed. */
7970 static int
7971 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7973 rtx op0, op1;
7974 const struct cpu_cost_table *extra_cost
7975 = aarch64_tune_params.insn_extra_cost;
7976 int cost = 0;
7977 bool compound_p = (outer == PLUS || outer == MINUS);
7978 machine_mode mode = GET_MODE (x);
7980 gcc_checking_assert (code == MULT);
7982 op0 = XEXP (x, 0);
7983 op1 = XEXP (x, 1);
7985 if (VECTOR_MODE_P (mode))
7986 mode = GET_MODE_INNER (mode);
7988 /* Integer multiply/fma. */
7989 if (GET_MODE_CLASS (mode) == MODE_INT)
7991 /* The multiply will be canonicalized as a shift, cost it as such. */
7992 if (aarch64_shift_p (GET_CODE (x))
7993 || (CONST_INT_P (op1)
7994 && exact_log2 (INTVAL (op1)) > 0))
7996 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7997 || GET_CODE (op0) == SIGN_EXTEND;
7998 if (speed)
8000 if (compound_p)
8002 /* If the shift is considered cheap,
8003 then don't add any cost. */
8004 if (aarch64_cheap_mult_shift_p (x))
8006 else if (REG_P (op1))
8007 /* ARITH + shift-by-register. */
8008 cost += extra_cost->alu.arith_shift_reg;
8009 else if (is_extend)
8010 /* ARITH + extended register. We don't have a cost field
8011 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8012 cost += extra_cost->alu.extend_arith;
8013 else
8014 /* ARITH + shift-by-immediate. */
8015 cost += extra_cost->alu.arith_shift;
8017 else
8018 /* LSL (immediate). */
8019 cost += extra_cost->alu.shift;
8022 /* Strip extends as we will have costed them in the case above. */
8023 if (is_extend)
8024 op0 = aarch64_strip_extend (op0, true);
8026 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8028 return cost;
8031 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8032 compound and let the below cases handle it. After all, MNEG is a
8033 special-case alias of MSUB. */
8034 if (GET_CODE (op0) == NEG)
8036 op0 = XEXP (op0, 0);
8037 compound_p = true;
8040 /* Integer multiplies or FMAs have zero/sign extending variants. */
8041 if ((GET_CODE (op0) == ZERO_EXTEND
8042 && GET_CODE (op1) == ZERO_EXTEND)
8043 || (GET_CODE (op0) == SIGN_EXTEND
8044 && GET_CODE (op1) == SIGN_EXTEND))
8046 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8047 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8049 if (speed)
8051 if (compound_p)
8052 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8053 cost += extra_cost->mult[0].extend_add;
8054 else
8055 /* MUL/SMULL/UMULL. */
8056 cost += extra_cost->mult[0].extend;
8059 return cost;
8062 /* This is either an integer multiply or a MADD. In both cases
8063 we want to recurse and cost the operands. */
8064 cost += rtx_cost (op0, mode, MULT, 0, speed);
8065 cost += rtx_cost (op1, mode, MULT, 1, speed);
8067 if (speed)
8069 if (compound_p)
8070 /* MADD/MSUB. */
8071 cost += extra_cost->mult[mode == DImode].add;
8072 else
8073 /* MUL. */
8074 cost += extra_cost->mult[mode == DImode].simple;
8077 return cost;
8079 else
8081 if (speed)
8083 /* Floating-point FMA/FMUL can also support negations of the
8084 operands, unless the rounding mode is upward or downward in
8085 which case FNMUL differs from FMUL with operand negation. */
8086 bool neg0 = GET_CODE (op0) == NEG;
8087 bool neg1 = GET_CODE (op1) == NEG;
8088 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8090 if (neg0)
8091 op0 = XEXP (op0, 0);
8092 if (neg1)
8093 op1 = XEXP (op1, 0);
8096 if (compound_p)
8097 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8098 cost += extra_cost->fp[mode == DFmode].fma;
8099 else
8100 /* FMUL/FNMUL. */
8101 cost += extra_cost->fp[mode == DFmode].mult;
8104 cost += rtx_cost (op0, mode, MULT, 0, speed);
8105 cost += rtx_cost (op1, mode, MULT, 1, speed);
8106 return cost;
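/* [Editor's illustrative sketch, not part of the original source.] How the
   widening multiply-add branch above composes a speed cost: for something
   shaped like (plus (mult (sign_extend a) (sign_extend b)) c), i.e. an
   SMADDL, the cost is the cost of the two inner multiply operands plus the
   tuning table's extend_add entry.  The table value below is an invented
   placeholder, not a real tuning number.  Guarded with #if 0.  */
#if 0
static int
example_smaddl_cost (int cost_of_op0, int cost_of_op1)
{
  int extend_add_cost = 3;  /* hypothetical extra_cost->mult[0].extend_add */
  return cost_of_op0 + cost_of_op1 + extend_add_cost;
}
#endif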
8110 static int
8111 aarch64_address_cost (rtx x,
8112 machine_mode mode,
8113 addr_space_t as ATTRIBUTE_UNUSED,
8114 bool speed)
8116 enum rtx_code c = GET_CODE (x);
8117 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8118 struct aarch64_address_info info;
8119 int cost = 0;
8120 info.shift = 0;
8122 if (!aarch64_classify_address (&info, x, mode, false))
8124 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8126 /* This is a CONST or SYMBOL ref which will be split
8127 in a different way depending on the code model in use.
8128 Cost it through the generic infrastructure. */
8129 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8130 /* Divide through by the cost of one instruction to
8131 bring it to the same units as the address costs. */
8132 cost_symbol_ref /= COSTS_N_INSNS (1);
8133 /* The cost is then the cost of preparing the address,
8134 followed by an immediate (possibly 0) offset. */
8135 return cost_symbol_ref + addr_cost->imm_offset;
8137 else
8139 /* This is most likely a jump table from a case
8140 statement. */
8141 return addr_cost->register_offset;
8145 switch (info.type)
8147 case ADDRESS_LO_SUM:
8148 case ADDRESS_SYMBOLIC:
8149 case ADDRESS_REG_IMM:
8150 cost += addr_cost->imm_offset;
8151 break;
8153 case ADDRESS_REG_WB:
8154 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8155 cost += addr_cost->pre_modify;
8156 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8157 cost += addr_cost->post_modify;
8158 else
8159 gcc_unreachable ();
8161 break;
8163 case ADDRESS_REG_REG:
8164 cost += addr_cost->register_offset;
8165 break;
8167 case ADDRESS_REG_SXTW:
8168 cost += addr_cost->register_sextend;
8169 break;
8171 case ADDRESS_REG_UXTW:
8172 cost += addr_cost->register_zextend;
8173 break;
8175 default:
8176 gcc_unreachable ();
8180 if (info.shift > 0)
8182 /* For the sake of calculating the cost of the shifted register
8183 component, we can treat same-sized modes in the same way. */
8184 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8185 cost += addr_cost->addr_scale_costs.hi;
8186 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8187 cost += addr_cost->addr_scale_costs.si;
8188 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8189 cost += addr_cost->addr_scale_costs.di;
8190 else
8191 /* We can't tell, or this is a 128-bit vector. */
8192 cost += addr_cost->addr_scale_costs.ti;
8195 return cost;
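/* [Editor's illustrative sketch, not part of the original source.] How the
   pieces above add up for a scaled, sign-extended register-offset address
   such as [x0, w1, sxtw #2] used for an SImode access: the
   register_sextend cost for the extended index plus the
   addr_scale_costs.si cost for the #2 scaling.  The numbers below are
   invented placeholders, not real tuning values.  Guarded with #if 0.  */
#if 0
static int
example_scaled_sxtw_address_cost (void)
{
  int register_sextend = 1;  /* hypothetical addr_cost->register_sextend */
  int scale_si = 1;          /* hypothetical addr_scale_costs.si */
  return register_sextend + scale_si;
}
#endif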
8198 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8199 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8200 to be taken. */
8203 aarch64_branch_cost (bool speed_p, bool predictable_p)
8205 /* When optimizing for speed, use the cost of unpredictable branches. */
8206 const struct cpu_branch_cost *branch_costs =
8207 aarch64_tune_params.branch_costs;
8209 if (!speed_p || predictable_p)
8210 return branch_costs->predictable;
8211 else
8212 return branch_costs->unpredictable;
8215 /* Return true if the RTX X in mode MODE is a zero or sign extract
8216 usable in an ADD or SUB (extended register) instruction. */
8217 static bool
8218 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8220 /* Catch add with a sign extract.
8221 This is add_<optab><mode>_multp2. */
8222 if (GET_CODE (x) == SIGN_EXTRACT
8223 || GET_CODE (x) == ZERO_EXTRACT)
8225 rtx op0 = XEXP (x, 0);
8226 rtx op1 = XEXP (x, 1);
8227 rtx op2 = XEXP (x, 2);
8229 if (GET_CODE (op0) == MULT
8230 && CONST_INT_P (op1)
8231 && op2 == const0_rtx
8232 && CONST_INT_P (XEXP (op0, 1))
8233 && aarch64_is_extend_from_extract (mode,
8234 XEXP (op0, 1),
8235 op1))
8237 return true;
8240 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8241 No shift. */
8242 else if (GET_CODE (x) == SIGN_EXTEND
8243 || GET_CODE (x) == ZERO_EXTEND)
8244 return REG_P (XEXP (x, 0));
8246 return false;
8249 static bool
8250 aarch64_frint_unspec_p (unsigned int u)
8252 switch (u)
8254 case UNSPEC_FRINTZ:
8255 case UNSPEC_FRINTP:
8256 case UNSPEC_FRINTM:
8257 case UNSPEC_FRINTA:
8258 case UNSPEC_FRINTN:
8259 case UNSPEC_FRINTX:
8260 case UNSPEC_FRINTI:
8261 return true;
8263 default:
8264 return false;
8268 /* Return true iff X is an rtx that will match an extr instruction
8269 i.e. as described in the *extr<mode>5_insn family of patterns.
8270 OP0 and OP1 will be set to the operands of the shifts involved
8271 on success and will be NULL_RTX otherwise. */
8273 static bool
8274 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8276 rtx op0, op1;
8277 scalar_int_mode mode;
8278 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8279 return false;
8281 *res_op0 = NULL_RTX;
8282 *res_op1 = NULL_RTX;
8284 if (GET_CODE (x) != IOR)
8285 return false;
8287 op0 = XEXP (x, 0);
8288 op1 = XEXP (x, 1);
8290 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8291 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8293 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8294 if (GET_CODE (op1) == ASHIFT)
8295 std::swap (op0, op1);
8297 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8298 return false;
8300 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8301 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8303 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8304 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8306 *res_op0 = XEXP (op0, 0);
8307 *res_op1 = XEXP (op1, 0);
8308 return true;
8312 return false;
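/* [Editor's illustrative sketch, not part of the original source.] The
   value computed by the IOR of two complementary shifts that the predicate
   above recognises.  For 64-bit operands, (a << n) | (b >> (64 - n)) is
   what an EXTR with immediate (64 - n) extracts from the concatenation of
   the two registers, which is why the two shift amounts must sum to the
   mode's bit size.  Guarded with #if 0 so it never enters the build.  */
#if 0
static unsigned long long
example_extr_value (unsigned long long a, unsigned long long b, unsigned n)
{
  /* Valid for 0 < n < 64; n == 0 or n == 64 would be a plain move.  */
  return (a << n) | (b >> (64 - n));
}
#endif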
8315 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8316 storing it in *COST. Result is true if the total cost of the operation
8317 has now been calculated. */
8318 static bool
8319 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8321 rtx inner;
8322 rtx comparator;
8323 enum rtx_code cmpcode;
8325 if (COMPARISON_P (op0))
8327 inner = XEXP (op0, 0);
8328 comparator = XEXP (op0, 1);
8329 cmpcode = GET_CODE (op0);
8331 else
8333 inner = op0;
8334 comparator = const0_rtx;
8335 cmpcode = NE;
8338 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8340 /* Conditional branch. */
8341 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8342 return true;
8343 else
8345 if (cmpcode == NE || cmpcode == EQ)
8347 if (comparator == const0_rtx)
8349 /* TBZ/TBNZ/CBZ/CBNZ. */
8350 if (GET_CODE (inner) == ZERO_EXTRACT)
8351 /* TBZ/TBNZ. */
8352 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8353 ZERO_EXTRACT, 0, speed);
8354 else
8355 /* CBZ/CBNZ. */
8356 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8358 return true;
8361 else if (cmpcode == LT || cmpcode == GE)
8363 /* TBZ/TBNZ. */
8364 if (comparator == const0_rtx)
8365 return true;
8369 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8371 /* CCMP. */
8372 if (GET_CODE (op1) == COMPARE)
8374 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8375 if (XEXP (op1, 1) == const0_rtx)
8376 *cost += 1;
8377 if (speed)
8379 machine_mode mode = GET_MODE (XEXP (op1, 0));
8380 const struct cpu_cost_table *extra_cost
8381 = aarch64_tune_params.insn_extra_cost;
8383 if (GET_MODE_CLASS (mode) == MODE_INT)
8384 *cost += extra_cost->alu.arith;
8385 else
8386 *cost += extra_cost->fp[mode == DFmode].compare;
8388 return true;
8391 /* It's a conditional operation based on the status flags,
8392 so it must be some flavor of CSEL. */
8394 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8395 if (GET_CODE (op1) == NEG
8396 || GET_CODE (op1) == NOT
8397 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8398 op1 = XEXP (op1, 0);
8399 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8401 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8402 op1 = XEXP (op1, 0);
8403 op2 = XEXP (op2, 0);
8406 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8407 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8408 return true;
8411 /* We don't know what this is, so cost all operands. */
8412 return false;
8415 /* Check whether X is a bitfield operation of the form shift + extend that
8416 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8417 operand to which the bitfield operation is applied. Otherwise return
8418 NULL_RTX. */
8420 static rtx
8421 aarch64_extend_bitfield_pattern_p (rtx x)
8423 rtx_code outer_code = GET_CODE (x);
8424 machine_mode outer_mode = GET_MODE (x);
8426 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8427 && outer_mode != SImode && outer_mode != DImode)
8428 return NULL_RTX;
8430 rtx inner = XEXP (x, 0);
8431 rtx_code inner_code = GET_CODE (inner);
8432 machine_mode inner_mode = GET_MODE (inner);
8433 rtx op = NULL_RTX;
8435 switch (inner_code)
8437 case ASHIFT:
8438 if (CONST_INT_P (XEXP (inner, 1))
8439 && (inner_mode == QImode || inner_mode == HImode))
8440 op = XEXP (inner, 0);
8441 break;
8442 case LSHIFTRT:
8443 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8444 && (inner_mode == QImode || inner_mode == HImode))
8445 op = XEXP (inner, 0);
8446 break;
8447 case ASHIFTRT:
8448 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8449 && (inner_mode == QImode || inner_mode == HImode))
8450 op = XEXP (inner, 0);
8451 break;
8452 default:
8453 break;
8456 return op;
8459 /* Return true if the mask and a shift amount from an RTX of the form
8460 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8461 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8463 bool
8464 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8465 rtx shft_amnt)
8467 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8468 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8469 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8470 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
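/* [Editor's illustrative sketch, not part of the original source.] The two
   conditions checked above, restated on plain integers.  For
   (x << 8) & 0x0000ff00 the mask shifted right by the shift amount is
   0xff, one less than a power of two, and the mask has no bits set below
   the shift amount, so the pair qualifies; a mask of 0x0000ff01 would fail
   the second test.  Guarded with #if 0 so it never enters the build.  */
#if 0
static int
example_ubfiz_ok (unsigned long long mask, unsigned int shift)
{
  unsigned long long field = (mask >> shift) + 1;
  int field_is_pow2 = field != 0 && (field & (field - 1)) == 0;
  int no_low_bits = (mask & ((1ULL << shift) - 1)) == 0;
  return field_is_pow2 && no_low_bits;
}
#endif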
8473 /* Calculate the cost of calculating X, storing it in *COST. Result
8474 is true if the total cost of the operation has now been calculated. */
8475 static bool
8476 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8477 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8479 rtx op0, op1, op2;
8480 const struct cpu_cost_table *extra_cost
8481 = aarch64_tune_params.insn_extra_cost;
8482 int code = GET_CODE (x);
8483 scalar_int_mode int_mode;
8485 /* By default, assume that everything has equivalent cost to the
8486 cheapest instruction. Any additional costs are applied as a delta
8487 above this default. */
8488 *cost = COSTS_N_INSNS (1);
8490 switch (code)
8492 case SET:
8493 /* The cost depends entirely on the operands to SET. */
8494 *cost = 0;
8495 op0 = SET_DEST (x);
8496 op1 = SET_SRC (x);
8498 switch (GET_CODE (op0))
8500 case MEM:
8501 if (speed)
8503 rtx address = XEXP (op0, 0);
8504 if (VECTOR_MODE_P (mode))
8505 *cost += extra_cost->ldst.storev;
8506 else if (GET_MODE_CLASS (mode) == MODE_INT)
8507 *cost += extra_cost->ldst.store;
8508 else if (mode == SFmode)
8509 *cost += extra_cost->ldst.storef;
8510 else if (mode == DFmode)
8511 *cost += extra_cost->ldst.stored;
8513 *cost +=
8514 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8515 0, speed));
8518 *cost += rtx_cost (op1, mode, SET, 1, speed);
8519 return true;
8521 case SUBREG:
8522 if (! REG_P (SUBREG_REG (op0)))
8523 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8525 /* Fall through. */
8526 case REG:
8527 /* The cost is one per vector-register copied. */
8528 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8530 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8531 *cost = COSTS_N_INSNS (nregs);
8533 /* const0_rtx is in general free, but we will use an
8534 instruction to set a register to 0. */
8535 else if (REG_P (op1) || op1 == const0_rtx)
8537 /* The cost is 1 per register copied. */
8538 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8539 *cost = COSTS_N_INSNS (nregs);
8541 else
8542 /* Cost is just the cost of the RHS of the set. */
8543 *cost += rtx_cost (op1, mode, SET, 1, speed);
8544 return true;
8546 case ZERO_EXTRACT:
8547 case SIGN_EXTRACT:
8548 /* Bit-field insertion. Strip any redundant widening of
8549 the RHS to meet the width of the target. */
8550 if (GET_CODE (op1) == SUBREG)
8551 op1 = SUBREG_REG (op1);
8552 if ((GET_CODE (op1) == ZERO_EXTEND
8553 || GET_CODE (op1) == SIGN_EXTEND)
8554 && CONST_INT_P (XEXP (op0, 1))
8555 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8556 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8557 op1 = XEXP (op1, 0);
8559 if (CONST_INT_P (op1))
8561 /* MOV immediate is assumed to always be cheap. */
8562 *cost = COSTS_N_INSNS (1);
8564 else
8566 /* BFM. */
8567 if (speed)
8568 *cost += extra_cost->alu.bfi;
8569 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8572 return true;
8574 default:
8575 /* We can't make sense of this, assume default cost. */
8576 *cost = COSTS_N_INSNS (1);
8577 return false;
8579 return false;
8581 case CONST_INT:
8582 /* If an instruction can incorporate a constant within the
8583 instruction, the instruction's expression avoids calling
8584 rtx_cost() on the constant. If rtx_cost() is called on a
8585 constant, then it is usually because the constant must be
8586 moved into a register by one or more instructions.
8588 The exception is constant 0, which can be expressed
8589 as XZR/WZR and is therefore free. The exception to this is
8590 if we have (set (reg) (const0_rtx)), in which case we must cost
8591 the move. However, we can catch that when we cost the SET, so
8592 we don't need to consider that here. */
8593 if (x == const0_rtx)
8594 *cost = 0;
8595 else
8597 /* To an approximation, the cost of building any other constant
8598 is proportional to the number of instructions required to
8599 build that constant. This is true whether we are compiling
8600 for SPEED or otherwise. */
8601 if (!is_a <scalar_int_mode> (mode, &int_mode))
8602 int_mode = word_mode;
8603 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8604 (NULL_RTX, x, false, int_mode));
8606 return true;
8608 case CONST_DOUBLE:
8610 /* First determine number of instructions to do the move
8611 as an integer constant. */
8612 if (!aarch64_float_const_representable_p (x)
8613 && !aarch64_can_const_movi_rtx_p (x, mode)
8614 && aarch64_float_const_rtx_p (x))
8616 unsigned HOST_WIDE_INT ival;
8617 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8618 gcc_assert (succeed);
8620 scalar_int_mode imode = (mode == HFmode
8621 ? SImode
8622 : int_mode_for_mode (mode).require ());
8623 int ncost = aarch64_internal_mov_immediate
8624 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8625 *cost += COSTS_N_INSNS (ncost);
8626 return true;
8629 if (speed)
8631 /* mov[df,sf]_aarch64. */
8632 if (aarch64_float_const_representable_p (x))
8633 /* FMOV (scalar immediate). */
8634 *cost += extra_cost->fp[mode == DFmode].fpconst;
8635 else if (!aarch64_float_const_zero_rtx_p (x))
8637 /* This will be a load from memory. */
8638 if (mode == DFmode)
8639 *cost += extra_cost->ldst.loadd;
8640 else
8641 *cost += extra_cost->ldst.loadf;
8643 else
8644 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8645 or MOV v0.s[0], wzr - neither of which is modeled by the
8646 cost tables. Just use the default cost. */
8651 return true;
8653 case MEM:
8654 if (speed)
8656 /* For loads we want the base cost of a load, plus an
8657 approximation for the additional cost of the addressing
8658 mode. */
8659 rtx address = XEXP (x, 0);
8660 if (VECTOR_MODE_P (mode))
8661 *cost += extra_cost->ldst.loadv;
8662 else if (GET_MODE_CLASS (mode) == MODE_INT)
8663 *cost += extra_cost->ldst.load;
8664 else if (mode == SFmode)
8665 *cost += extra_cost->ldst.loadf;
8666 else if (mode == DFmode)
8667 *cost += extra_cost->ldst.loadd;
8669 *cost +=
8670 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8671 0, speed));
8674 return true;
8676 case NEG:
8677 op0 = XEXP (x, 0);
8679 if (VECTOR_MODE_P (mode))
8681 if (speed)
8683 /* FNEG. */
8684 *cost += extra_cost->vect.alu;
8686 return false;
8689 if (GET_MODE_CLASS (mode) == MODE_INT)
8691 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8692 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8694 /* CSETM. */
8695 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8696 return true;
8699 /* Cost this as SUB wzr, X. */
8700 op0 = CONST0_RTX (mode);
8701 op1 = XEXP (x, 0);
8702 goto cost_minus;
8705 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8707 /* Support (neg(fma...)) as a single instruction only if
8708 sign of zeros is unimportant. This matches the decision
8709 making in aarch64.md. */
8710 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8712 /* FNMADD. */
8713 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8714 return true;
8716 if (GET_CODE (op0) == MULT)
8718 /* FNMUL. */
8719 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8720 return true;
8722 if (speed)
8723 /* FNEG. */
8724 *cost += extra_cost->fp[mode == DFmode].neg;
8725 return false;
8728 return false;
8730 case CLRSB:
8731 case CLZ:
8732 if (speed)
8734 if (VECTOR_MODE_P (mode))
8735 *cost += extra_cost->vect.alu;
8736 else
8737 *cost += extra_cost->alu.clz;
8740 return false;
8742 case COMPARE:
8743 op0 = XEXP (x, 0);
8744 op1 = XEXP (x, 1);
8746 if (op1 == const0_rtx
8747 && GET_CODE (op0) == AND)
8749 x = op0;
8750 mode = GET_MODE (op0);
8751 goto cost_logic;
8754 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8756 /* TODO: A write to the CC flags possibly costs extra; this
8757 needs encoding in the cost tables. */
8759 mode = GET_MODE (op0);
8760 /* ANDS. */
8761 if (GET_CODE (op0) == AND)
8763 x = op0;
8764 goto cost_logic;
8767 if (GET_CODE (op0) == PLUS)
8769 /* ADDS (and CMN alias). */
8770 x = op0;
8771 goto cost_plus;
8774 if (GET_CODE (op0) == MINUS)
8776 /* SUBS. */
8777 x = op0;
8778 goto cost_minus;
8781 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8782 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8783 && CONST_INT_P (XEXP (op0, 2)))
8785 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8786 Handle it here directly rather than going to cost_logic
8787 since we know the immediate generated for the TST is valid,
8788 which lets us avoid creating an intermediate rtx for it only
8789 for costing purposes. */
8790 if (speed)
8791 *cost += extra_cost->alu.logical;
8793 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8794 ZERO_EXTRACT, 0, speed);
8795 return true;
8798 if (GET_CODE (op1) == NEG)
8800 /* CMN. */
8801 if (speed)
8802 *cost += extra_cost->alu.arith;
8804 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8805 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8806 return true;
8809 /* CMP.
8811 Compare can freely swap the order of operands, and
8812 canonicalization puts the more complex operation first.
8813 But the integer MINUS logic expects the shift/extend
8814 operation in op1. */
8815 if (! (REG_P (op0)
8816 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8818 op0 = XEXP (x, 1);
8819 op1 = XEXP (x, 0);
8821 goto cost_minus;
8824 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8826 /* FCMP. */
8827 if (speed)
8828 *cost += extra_cost->fp[mode == DFmode].compare;
8830 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8832 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8833 /* FCMP supports constant 0.0 for no extra cost. */
8834 return true;
8836 return false;
8839 if (VECTOR_MODE_P (mode))
8841 /* Vector compare. */
8842 if (speed)
8843 *cost += extra_cost->vect.alu;
8845 if (aarch64_float_const_zero_rtx_p (op1))
8847 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8848 cost. */
8849 return true;
8851 return false;
8853 return false;
8855 case MINUS:
8857 op0 = XEXP (x, 0);
8858 op1 = XEXP (x, 1);
8860 cost_minus:
8861 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8863 /* Detect valid immediates. */
8864 if ((GET_MODE_CLASS (mode) == MODE_INT
8865 || (GET_MODE_CLASS (mode) == MODE_CC
8866 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8867 && CONST_INT_P (op1)
8868 && aarch64_uimm12_shift (INTVAL (op1)))
8870 if (speed)
8871 /* SUB(S) (immediate). */
8872 *cost += extra_cost->alu.arith;
8873 return true;
8876 /* Look for SUB (extended register). */
8877 if (is_a <scalar_int_mode> (mode, &int_mode)
8878 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8880 if (speed)
8881 *cost += extra_cost->alu.extend_arith;
8883 op1 = aarch64_strip_extend (op1, true);
8884 *cost += rtx_cost (op1, VOIDmode,
8885 (enum rtx_code) GET_CODE (op1), 0, speed);
8886 return true;
8889 rtx new_op1 = aarch64_strip_extend (op1, false);
8891 /* Cost this as an FMA-alike operation. */
8892 if ((GET_CODE (new_op1) == MULT
8893 || aarch64_shift_p (GET_CODE (new_op1)))
8894 && code != COMPARE)
8896 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8897 (enum rtx_code) code,
8898 speed);
8899 return true;
8902 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8904 if (speed)
8906 if (VECTOR_MODE_P (mode))
8908 /* Vector SUB. */
8909 *cost += extra_cost->vect.alu;
8911 else if (GET_MODE_CLASS (mode) == MODE_INT)
8913 /* SUB(S). */
8914 *cost += extra_cost->alu.arith;
8916 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8918 /* FSUB. */
8919 *cost += extra_cost->fp[mode == DFmode].addsub;
8922 return true;
8925 case PLUS:
8927 rtx new_op0;
8929 op0 = XEXP (x, 0);
8930 op1 = XEXP (x, 1);
8932 cost_plus:
8933 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8934 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8936 /* CSINC. */
8937 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8938 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8939 return true;
8942 if (GET_MODE_CLASS (mode) == MODE_INT
8943 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8944 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8946 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8948 if (speed)
8949 /* ADD (immediate). */
8950 *cost += extra_cost->alu.arith;
8951 return true;
8954 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8956 /* Look for ADD (extended register). */
8957 if (is_a <scalar_int_mode> (mode, &int_mode)
8958 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8960 if (speed)
8961 *cost += extra_cost->alu.extend_arith;
8963 op0 = aarch64_strip_extend (op0, true);
8964 *cost += rtx_cost (op0, VOIDmode,
8965 (enum rtx_code) GET_CODE (op0), 0, speed);
8966 return true;
8969 /* Strip any extend but leave shifts behind, as we will
8970 cost them through mult_cost. */
8971 new_op0 = aarch64_strip_extend (op0, false);
8973 if (GET_CODE (new_op0) == MULT
8974 || aarch64_shift_p (GET_CODE (new_op0)))
8976 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8977 speed);
8978 return true;
8981 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8983 if (speed)
8985 if (VECTOR_MODE_P (mode))
8987 /* Vector ADD. */
8988 *cost += extra_cost->vect.alu;
8990 else if (GET_MODE_CLASS (mode) == MODE_INT)
8992 /* ADD. */
8993 *cost += extra_cost->alu.arith;
8995 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8997 /* FADD. */
8998 *cost += extra_cost->fp[mode == DFmode].addsub;
9001 return true;
9004 case BSWAP:
9005 *cost = COSTS_N_INSNS (1);
9007 if (speed)
9009 if (VECTOR_MODE_P (mode))
9010 *cost += extra_cost->vect.alu;
9011 else
9012 *cost += extra_cost->alu.rev;
9014 return false;
9016 case IOR:
9017 if (aarch_rev16_p (x))
9019 *cost = COSTS_N_INSNS (1);
9021 if (speed)
9023 if (VECTOR_MODE_P (mode))
9024 *cost += extra_cost->vect.alu;
9025 else
9026 *cost += extra_cost->alu.rev;
9028 return true;
9031 if (aarch64_extr_rtx_p (x, &op0, &op1))
9033 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9034 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9035 if (speed)
9036 *cost += extra_cost->alu.shift;
9038 return true;
9040 /* Fall through. */
9041 case XOR:
9042 case AND:
9043 cost_logic:
9044 op0 = XEXP (x, 0);
9045 op1 = XEXP (x, 1);
9047 if (VECTOR_MODE_P (mode))
9049 if (speed)
9050 *cost += extra_cost->vect.alu;
9051 return true;
9054 if (code == AND
9055 && GET_CODE (op0) == MULT
9056 && CONST_INT_P (XEXP (op0, 1))
9057 && CONST_INT_P (op1)
9058 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9059 INTVAL (op1)) != 0)
9061 /* This is a UBFM/SBFM. */
9062 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9063 if (speed)
9064 *cost += extra_cost->alu.bfx;
9065 return true;
9068 if (is_int_mode (mode, &int_mode))
9070 if (CONST_INT_P (op1))
9072 /* We have a mask + shift version of a UBFIZ
9073 i.e. the *andim_ashift<mode>_bfiz pattern. */
9074 if (GET_CODE (op0) == ASHIFT
9075 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9076 XEXP (op0, 1)))
9078 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9079 (enum rtx_code) code, 0, speed);
9080 if (speed)
9081 *cost += extra_cost->alu.bfx;
9083 return true;
9085 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9087 /* We possibly get the immediate for free; this is not
9088 modelled. */
9089 *cost += rtx_cost (op0, int_mode,
9090 (enum rtx_code) code, 0, speed);
9091 if (speed)
9092 *cost += extra_cost->alu.logical;
9094 return true;
9097 else
9099 rtx new_op0 = op0;
9101 /* Handle ORN, EON, or BIC. */
9102 if (GET_CODE (op0) == NOT)
9103 op0 = XEXP (op0, 0);
9105 new_op0 = aarch64_strip_shift (op0);
9107 /* If we had a shift on op0 then this is a logical-shift-
9108 by-register/immediate operation. Otherwise, this is just
9109 a logical operation. */
9110 if (speed)
9112 if (new_op0 != op0)
9114 /* Shift by immediate. */
9115 if (CONST_INT_P (XEXP (op0, 1)))
9116 *cost += extra_cost->alu.log_shift;
9117 else
9118 *cost += extra_cost->alu.log_shift_reg;
9120 else
9121 *cost += extra_cost->alu.logical;
9124 /* In both cases we want to cost both operands. */
9125 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9126 0, speed);
9127 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9128 1, speed);
9130 return true;
9133 return false;
9135 case NOT:
9136 x = XEXP (x, 0);
9137 op0 = aarch64_strip_shift (x);
9139 if (VECTOR_MODE_P (mode))
9141 /* Vector NOT. */
9142 *cost += extra_cost->vect.alu;
9143 return false;
9146 /* MVN-shifted-reg. */
9147 if (op0 != x)
9149 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9151 if (speed)
9152 *cost += extra_cost->alu.log_shift;
9154 return true;
9156 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9157 Handle the second form here, taking care that 'a' above can
9158 be a shift. */
9159 else if (GET_CODE (op0) == XOR)
9161 rtx newop0 = XEXP (op0, 0);
9162 rtx newop1 = XEXP (op0, 1);
9163 rtx op0_stripped = aarch64_strip_shift (newop0);
9165 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9166 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9168 if (speed)
9170 if (op0_stripped != newop0)
9171 *cost += extra_cost->alu.log_shift;
9172 else
9173 *cost += extra_cost->alu.logical;
9176 return true;
9178 /* MVN. */
9179 if (speed)
9180 *cost += extra_cost->alu.logical;
9182 return false;
9184 case ZERO_EXTEND:
9186 op0 = XEXP (x, 0);
9187 /* If a value is written in SI mode, then zero extended to DI
9188 mode, the operation will in general be free as a write to
9189 a 'w' register implicitly zeroes the upper bits of an 'x'
9190 register. However, if this is
9192 (set (reg) (zero_extend (reg)))
9194 we must cost the explicit register move. */
9195 if (mode == DImode
9196 && GET_MODE (op0) == SImode
9197 && outer == SET)
9199 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9201 /* If OP_COST is non-zero, then the cost of the zero extend
9202 is effectively the cost of the inner operation. Otherwise
9203 we have a MOV instruction and we take the cost from the MOV
9204 itself. This is true independently of whether we are
9205 optimizing for space or time. */
9206 if (op_cost)
9207 *cost = op_cost;
9209 return true;
9211 else if (MEM_P (op0))
9213 /* All loads can zero extend to any size for free. */
9214 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9215 return true;
9218 op0 = aarch64_extend_bitfield_pattern_p (x);
9219 if (op0)
9221 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9222 if (speed)
9223 *cost += extra_cost->alu.bfx;
9224 return true;
9227 if (speed)
9229 if (VECTOR_MODE_P (mode))
9231 /* UMOV. */
9232 *cost += extra_cost->vect.alu;
9234 else
9236 /* We generate an AND instead of UXTB/UXTH. */
9237 *cost += extra_cost->alu.logical;
9240 return false;
9242 case SIGN_EXTEND:
9243 if (MEM_P (XEXP (x, 0)))
9245 /* LDRSH. */
9246 if (speed)
9248 rtx address = XEXP (XEXP (x, 0), 0);
9249 *cost += extra_cost->ldst.load_sign_extend;
9251 *cost +=
9252 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9253 0, speed));
9255 return true;
9258 op0 = aarch64_extend_bitfield_pattern_p (x);
9259 if (op0)
9261 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9262 if (speed)
9263 *cost += extra_cost->alu.bfx;
9264 return true;
9267 if (speed)
9269 if (VECTOR_MODE_P (mode))
9270 *cost += extra_cost->vect.alu;
9271 else
9272 *cost += extra_cost->alu.extend;
9274 return false;
9276 case ASHIFT:
9277 op0 = XEXP (x, 0);
9278 op1 = XEXP (x, 1);
9280 if (CONST_INT_P (op1))
9282 if (speed)
9284 if (VECTOR_MODE_P (mode))
9286 /* Vector shift (immediate). */
9287 *cost += extra_cost->vect.alu;
9289 else
9291 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9292 aliases. */
9293 *cost += extra_cost->alu.shift;
9297 /* We can incorporate zero/sign extend for free. */
9298 if (GET_CODE (op0) == ZERO_EXTEND
9299 || GET_CODE (op0) == SIGN_EXTEND)
9300 op0 = XEXP (op0, 0);
9302 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9303 return true;
9305 else
9307 if (VECTOR_MODE_P (mode))
9309 if (speed)
9310 /* Vector shift (register). */
9311 *cost += extra_cost->vect.alu;
9313 else
9315 if (speed)
9316 /* LSLV. */
9317 *cost += extra_cost->alu.shift_reg;
9319 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9320 && CONST_INT_P (XEXP (op1, 1))
9321 && known_eq (INTVAL (XEXP (op1, 1)),
9322 GET_MODE_BITSIZE (mode) - 1))
9324 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9325 /* We already demanded XEXP (op1, 0) to be REG_P, so
9326 don't recurse into it. */
9327 return true;
9330 return false; /* All arguments need to be in registers. */
9333 case ROTATE:
9334 case ROTATERT:
9335 case LSHIFTRT:
9336 case ASHIFTRT:
9337 op0 = XEXP (x, 0);
9338 op1 = XEXP (x, 1);
9340 if (CONST_INT_P (op1))
9342 /* ASR (immediate) and friends. */
9343 if (speed)
9345 if (VECTOR_MODE_P (mode))
9346 *cost += extra_cost->vect.alu;
9347 else
9348 *cost += extra_cost->alu.shift;
9351 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9352 return true;
9354 else
9356 if (VECTOR_MODE_P (mode))
9358 if (speed)
9359 /* Vector shift (register). */
9360 *cost += extra_cost->vect.alu;
9362 else
9364 if (speed)
9365 /* ASR (register) and friends. */
9366 *cost += extra_cost->alu.shift_reg;
9368 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9369 && CONST_INT_P (XEXP (op1, 1))
9370 && known_eq (INTVAL (XEXP (op1, 1)),
9371 GET_MODE_BITSIZE (mode) - 1))
9373 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9374 /* We already demanded XEXP (op1, 0) to be REG_P, so
9375 don't recurse into it. */
9376 return true;
9379 return false; /* All arguments need to be in registers. */
9382 case SYMBOL_REF:
9384 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9385 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9387 /* LDR. */
9388 if (speed)
9389 *cost += extra_cost->ldst.load;
9391 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9392 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9394 /* ADRP, followed by ADD. */
9395 *cost += COSTS_N_INSNS (1);
9396 if (speed)
9397 *cost += 2 * extra_cost->alu.arith;
9399 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9400 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9402 /* ADR. */
9403 if (speed)
9404 *cost += extra_cost->alu.arith;
9407 if (flag_pic)
9409 /* One extra load instruction, after accessing the GOT. */
9410 *cost += COSTS_N_INSNS (1);
9411 if (speed)
9412 *cost += extra_cost->ldst.load;
9414 return true;
9416 case HIGH:
9417 case LO_SUM:
9418 /* ADRP/ADD (immediate). */
9419 if (speed)
9420 *cost += extra_cost->alu.arith;
9421 return true;
9423 case ZERO_EXTRACT:
9424 case SIGN_EXTRACT:
9425 /* UBFX/SBFX. */
9426 if (speed)
9428 if (VECTOR_MODE_P (mode))
9429 *cost += extra_cost->vect.alu;
9430 else
9431 *cost += extra_cost->alu.bfx;
9434 /* We can trust that the immediates used will be correct (there
9435 are no by-register forms), so we need only cost op0. */
9436 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9437 return true;
9439 case MULT:
9440 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9441 /* aarch64_rtx_mult_cost always handles recursion to its
9442 operands. */
9443 return true;
9445 case MOD:
9446 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9447 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
9448 an unconditional negate. This case should only ever be reached through
9449 the set_smod_pow2_cheap check in expmed.c. */
9450 if (CONST_INT_P (XEXP (x, 1))
9451 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9452 && (mode == SImode || mode == DImode))
9454 /* We expand to 4 instructions. Reset the baseline. */
9455 *cost = COSTS_N_INSNS (4);
9457 if (speed)
9458 *cost += 2 * extra_cost->alu.logical
9459 + 2 * extra_cost->alu.arith;
9461 return true;
9464 /* Fall-through. */
9465 case UMOD:
9466 if (speed)
9468 /* Slightly prefer UMOD over SMOD. */
9469 if (VECTOR_MODE_P (mode))
9470 *cost += extra_cost->vect.alu;
9471 else if (GET_MODE_CLASS (mode) == MODE_INT)
9472 *cost += (extra_cost->mult[mode == DImode].add
9473 + extra_cost->mult[mode == DImode].idiv
9474 + (code == MOD ? 1 : 0));
9476 return false; /* All arguments need to be in registers. */
9478 case DIV:
9479 case UDIV:
9480 case SQRT:
9481 if (speed)
9483 if (VECTOR_MODE_P (mode))
9484 *cost += extra_cost->vect.alu;
9485 else if (GET_MODE_CLASS (mode) == MODE_INT)
9486 /* There is no integer SQRT, so only DIV and UDIV can get
9487 here. */
9488 *cost += (extra_cost->mult[mode == DImode].idiv
9489 /* Slightly prefer UDIV over SDIV. */
9490 + (code == DIV ? 1 : 0));
9491 else
9492 *cost += extra_cost->fp[mode == DFmode].div;
9494 return false; /* All arguments need to be in registers. */
9496 case IF_THEN_ELSE:
9497 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9498 XEXP (x, 2), cost, speed);
9500 case EQ:
9501 case NE:
9502 case GT:
9503 case GTU:
9504 case LT:
9505 case LTU:
9506 case GE:
9507 case GEU:
9508 case LE:
9509 case LEU:
9511 return false; /* All arguments must be in registers. */
9513 case FMA:
9514 op0 = XEXP (x, 0);
9515 op1 = XEXP (x, 1);
9516 op2 = XEXP (x, 2);
9518 if (speed)
9520 if (VECTOR_MODE_P (mode))
9521 *cost += extra_cost->vect.alu;
9522 else
9523 *cost += extra_cost->fp[mode == DFmode].fma;
9526 /* FMSUB, FNMADD, and FNMSUB are free. */
9527 if (GET_CODE (op0) == NEG)
9528 op0 = XEXP (op0, 0);
9530 if (GET_CODE (op2) == NEG)
9531 op2 = XEXP (op2, 0);
9533 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9534 and the by-element operand as operand 0. */
9535 if (GET_CODE (op1) == NEG)
9536 op1 = XEXP (op1, 0);
9538 /* Catch vector-by-element operations. The by-element operand can
9539 either be (vec_duplicate (vec_select (x))) or just
9540 (vec_select (x)), depending on whether we are multiplying by
9541 a vector or a scalar.
9543 Canonicalization is not very good in these cases: FMA4 will put the
9544 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9545 if (GET_CODE (op0) == VEC_DUPLICATE)
9546 op0 = XEXP (op0, 0);
9547 else if (GET_CODE (op1) == VEC_DUPLICATE)
9548 op1 = XEXP (op1, 0);
9550 if (GET_CODE (op0) == VEC_SELECT)
9551 op0 = XEXP (op0, 0);
9552 else if (GET_CODE (op1) == VEC_SELECT)
9553 op1 = XEXP (op1, 0);
9555 /* If the remaining parameters are not registers,
9556 get the cost to put them into registers. */
9557 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9558 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9559 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9560 return true;
9562 case FLOAT:
9563 case UNSIGNED_FLOAT:
9564 if (speed)
9565 *cost += extra_cost->fp[mode == DFmode].fromint;
9566 return false;
9568 case FLOAT_EXTEND:
9569 if (speed)
9571 if (VECTOR_MODE_P (mode))
9573 /* Vector extend. */
9574 *cost += extra_cost->vect.alu;
9576 else
9577 *cost += extra_cost->fp[mode == DFmode].widen;
9579 return false;
9581 case FLOAT_TRUNCATE:
9582 if (speed)
9584 if (VECTOR_MODE_P (mode))
9586 /* Vector truncate. */
9587 *cost += extra_cost->vect.alu;
9589 else
9590 *cost += extra_cost->fp[mode == DFmode].narrow;
9592 return false;
9594 case FIX:
9595 case UNSIGNED_FIX:
9596 x = XEXP (x, 0);
9597 /* Strip the rounding part. They will all be implemented
9598 by the fcvt* family of instructions anyway. */
9599 if (GET_CODE (x) == UNSPEC)
9601 unsigned int uns_code = XINT (x, 1);
9603 if (uns_code == UNSPEC_FRINTA
9604 || uns_code == UNSPEC_FRINTM
9605 || uns_code == UNSPEC_FRINTN
9606 || uns_code == UNSPEC_FRINTP
9607 || uns_code == UNSPEC_FRINTZ)
9608 x = XVECEXP (x, 0, 0);
9611 if (speed)
9613 if (VECTOR_MODE_P (mode))
9614 *cost += extra_cost->vect.alu;
9615 else
9616 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9619 /* We can combine an fmul by a power of 2 followed by an fcvt into a single
9620 fixed-point fcvt. */
9621 if (GET_CODE (x) == MULT
9622 && ((VECTOR_MODE_P (mode)
9623 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9624 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9626 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9627 0, speed);
9628 return true;
9631 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9632 return true;
9634 case ABS:
9635 if (VECTOR_MODE_P (mode))
9637 /* ABS (vector). */
9638 if (speed)
9639 *cost += extra_cost->vect.alu;
9641 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9643 op0 = XEXP (x, 0);
9645 /* FABD, which is analogous to FADD. */
9646 if (GET_CODE (op0) == MINUS)
9648 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9649 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9650 if (speed)
9651 *cost += extra_cost->fp[mode == DFmode].addsub;
9653 return true;
9655 /* Simple FABS is analogous to FNEG. */
9656 if (speed)
9657 *cost += extra_cost->fp[mode == DFmode].neg;
9659 else
9661 /* Integer ABS will either be split into
9662 two arithmetic instructions, or will be an ABS
9663 (scalar), which we don't model. */
9664 *cost = COSTS_N_INSNS (2);
9665 if (speed)
9666 *cost += 2 * extra_cost->alu.arith;
9668 return false;
9670 case SMAX:
9671 case SMIN:
9672 if (speed)
9674 if (VECTOR_MODE_P (mode))
9675 *cost += extra_cost->vect.alu;
9676 else
9678 /* FMAXNM/FMINNM/FMAX/FMIN.
9679 TODO: This may not be accurate for all implementations, but
9680 we do not model this in the cost tables. */
9681 *cost += extra_cost->fp[mode == DFmode].addsub;
9684 return false;
9686 case UNSPEC:
9687 /* The floating point round to integer frint* instructions. */
9688 if (aarch64_frint_unspec_p (XINT (x, 1)))
9690 if (speed)
9691 *cost += extra_cost->fp[mode == DFmode].roundint;
9693 return false;
9696 if (XINT (x, 1) == UNSPEC_RBIT)
9698 if (speed)
9699 *cost += extra_cost->alu.rev;
9701 return false;
9703 break;
9705 case TRUNCATE:
9707 /* Decompose <su>muldi3_highpart. */
9708 if (/* (truncate:DI */
9709 mode == DImode
9710 /* (lshiftrt:TI */
9711 && GET_MODE (XEXP (x, 0)) == TImode
9712 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9713 /* (mult:TI */
9714 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9715 /* (ANY_EXTEND:TI (reg:DI))
9716 (ANY_EXTEND:TI (reg:DI))) */
9717 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9718 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9719 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9720 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9721 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9722 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9723 /* (const_int 64) */
9724 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9725 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9727 /* UMULH/SMULH. */
9728 if (speed)
9729 *cost += extra_cost->mult[mode == DImode].extend;
9730 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9731 mode, MULT, 0, speed);
9732 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9733 mode, MULT, 1, speed);
9734 return true;
9737 /* Fall through. */
9738 default:
9739 break;
9742 if (dump_file
9743 && flag_aarch64_verbose_cost)
9744 fprintf (dump_file,
9745 "\nFailed to cost RTX. Assuming default cost.\n");
9747 return true;
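/* [Editor's illustrative sketch, not part of the original source.] The
   baseline-plus-delta scheme the function above uses: every expression
   starts at COSTS_N_INSNS (1), the individual cases add deltas from the
   tuning tables when optimizing for speed (or reset the baseline for
   multi-instruction expansions), and the boolean result says whether the
   operands have already been costed.  The numbers below are invented
   placeholders.  Guarded with #if 0 so it never enters the build.  */
#if 0
static int
example_cost_of_add_immediate (int speed)
{
  int cost = 4;              /* stand-in for COSTS_N_INSNS (1) */
  int alu_arith_delta = 1;   /* hypothetical extra_cost->alu.arith */
  if (speed)
    cost += alu_arith_delta; /* the ADD (immediate) path above */
  return cost;
}
#endif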
9750 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9751 calculated for X. This cost is stored in *COST. Returns true
9752 if the total cost of X was calculated. */
9753 static bool
9754 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9755 int param, int *cost, bool speed)
9757 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9759 if (dump_file
9760 && flag_aarch64_verbose_cost)
9762 print_rtl_single (dump_file, x);
9763 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9764 speed ? "Hot" : "Cold",
9765 *cost, result ? "final" : "partial");
9768 return result;
9771 static int
9772 aarch64_register_move_cost (machine_mode mode,
9773 reg_class_t from_i, reg_class_t to_i)
9775 enum reg_class from = (enum reg_class) from_i;
9776 enum reg_class to = (enum reg_class) to_i;
9777 const struct cpu_regmove_cost *regmove_cost
9778 = aarch64_tune_params.regmove_cost;
9780 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9781 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9782 to = GENERAL_REGS;
9784 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9785 from = GENERAL_REGS;
9787 /* Moving between a GPR and the stack register costs the same as GP2GP. */
9788 if ((from == GENERAL_REGS && to == STACK_REG)
9789 || (to == GENERAL_REGS && from == STACK_REG))
9790 return regmove_cost->GP2GP;
9792 /* To/From the stack register, we move via the gprs. */
9793 if (to == STACK_REG || from == STACK_REG)
9794 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9795 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9797 if (known_eq (GET_MODE_SIZE (mode), 16))
9799 /* 128-bit operations on general registers require 2 instructions. */
9800 if (from == GENERAL_REGS && to == GENERAL_REGS)
9801 return regmove_cost->GP2GP * 2;
9802 else if (from == GENERAL_REGS)
9803 return regmove_cost->GP2FP * 2;
9804 else if (to == GENERAL_REGS)
9805 return regmove_cost->FP2GP * 2;
9807 /* When AdvSIMD instructions are disabled it is not possible to move
9808 a 128-bit value directly between Q registers. This is handled in
9809 secondary reload. A general register is used as a scratch to move
9810 the upper DI value and the lower DI value is moved directly,
9811 hence the cost is the sum of three moves. */
9812 if (! TARGET_SIMD)
9813 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9815 return regmove_cost->FP2FP;
9818 if (from == GENERAL_REGS && to == GENERAL_REGS)
9819 return regmove_cost->GP2GP;
9820 else if (from == GENERAL_REGS)
9821 return regmove_cost->GP2FP;
9822 else if (to == GENERAL_REGS)
9823 return regmove_cost->FP2GP;
9825 return regmove_cost->FP2FP;
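/* Illustrative sketch (not part of the original source): how the rules
   above combine for a 128-bit FP-to-FP move.  With AdvSIMD it is a single
   Q-register move; without AdvSIMD the value goes through a general
   register scratch, so the cost is the sum of three moves.  The helper
   name and parameters are hypothetical.  */
static inline int
regmove_cost_q_fp2fp_example (int gp2fp, int fp2gp, int fp2fp, int have_simd)
{
  return have_simd ? fp2fp : gp2fp + fp2gp + fp2fp;
}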
9828 static int
9829 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9830 reg_class_t rclass ATTRIBUTE_UNUSED,
9831 bool in ATTRIBUTE_UNUSED)
9833 return aarch64_tune_params.memmov_cost;
9836 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9837 to optimize 1.0/sqrt. */
9839 static bool
9840 use_rsqrt_p (machine_mode mode)
9842 return (!flag_trapping_math
9843 && flag_unsafe_math_optimizations
9844 && ((aarch64_tune_params.approx_modes->recip_sqrt
9845 & AARCH64_APPROX_MODE (mode))
9846 || flag_mrecip_low_precision_sqrt));
9849 /* Function to decide when to use the approximate reciprocal square root
9850 builtin. */
9852 static tree
9853 aarch64_builtin_reciprocal (tree fndecl)
9855 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9857 if (!use_rsqrt_p (mode))
9858 return NULL_TREE;
9859 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
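/* Illustrative sketch (not part of the original source): the kind of source
   pattern that use_rsqrt_p / aarch64_builtin_reciprocal can rewrite.  With
   -funsafe-math-optimizations and either the CPU's approximation mode or
   -mlow-precision-recip-sqrt enabled, 1.0f / sqrtf (x) may be expanded via
   the FRSQRTE/FRSQRTS estimate sequence instead of FSQRT + FDIV.  */
static inline float
rsqrt_source_example (float x)
{
  extern float sqrtf (float);   /* normally obtained from <math.h> */
  return 1.0f / sqrtf (x);
}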
9862 typedef rtx (*rsqrte_type) (rtx, rtx);
9864 /* Select reciprocal square root initial estimate insn depending on machine
9865 mode. */
9867 static rsqrte_type
9868 get_rsqrte_type (machine_mode mode)
9870 switch (mode)
9872 case E_DFmode: return gen_aarch64_rsqrtedf;
9873 case E_SFmode: return gen_aarch64_rsqrtesf;
9874 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9875 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9876 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9877 default: gcc_unreachable ();
9881 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9883 /* Select reciprocal square root series step insn depending on machine mode. */
9885 static rsqrts_type
9886 get_rsqrts_type (machine_mode mode)
9888 switch (mode)
9890 case E_DFmode: return gen_aarch64_rsqrtsdf;
9891 case E_SFmode: return gen_aarch64_rsqrtssf;
9892 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9893 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9894 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9895 default: gcc_unreachable ();
9899 /* Emit instruction sequence to compute either the approximate square root
9900 or its approximate reciprocal, depending on the flag RECP, and return
9901 whether the sequence was emitted or not. */
9903 bool
9904 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9906 machine_mode mode = GET_MODE (dst);
9908 if (GET_MODE_INNER (mode) == HFmode)
9910 gcc_assert (!recp);
9911 return false;
9914 if (!recp)
9916 if (!(flag_mlow_precision_sqrt
9917 || (aarch64_tune_params.approx_modes->sqrt
9918 & AARCH64_APPROX_MODE (mode))))
9919 return false;
9921 if (flag_finite_math_only
9922 || flag_trapping_math
9923 || !flag_unsafe_math_optimizations
9924 || optimize_function_for_size_p (cfun))
9925 return false;
9927 else
9928 /* Caller assumes we cannot fail. */
9929 gcc_assert (use_rsqrt_p (mode));
9931 machine_mode mmsk = mode_for_int_vector (mode).require ();
9932 rtx xmsk = gen_reg_rtx (mmsk);
9933 if (!recp)
9934 /* When calculating the approximate square root, compare the
9935 argument with 0.0 and create a mask. */
9936 emit_insn (gen_rtx_SET (xmsk,
9937 gen_rtx_NEG (mmsk,
9938 gen_rtx_EQ (mmsk, src,
9939 CONST0_RTX (mode)))));
9941 /* Estimate the approximate reciprocal square root. */
9942 rtx xdst = gen_reg_rtx (mode);
9943 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9945 /* Iterate over the series twice for SF and thrice for DF. */
9946 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9948 /* Optionally run one fewer iteration for better performance while
9949 sacrificing some accuracy. */
9950 if ((recp && flag_mrecip_low_precision_sqrt)
9951 || (!recp && flag_mlow_precision_sqrt))
9952 iterations--;
9954 /* Iterate over the series to calculate the approximate reciprocal square
9955 root. */
9956 rtx x1 = gen_reg_rtx (mode);
9957 while (iterations--)
9959 rtx x2 = gen_reg_rtx (mode);
9960 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9962 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9964 if (iterations > 0)
9965 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9968 if (!recp)
9970 /* Qualify the approximate reciprocal square root when the argument is
9971 0.0 by squashing the intermediary result to 0.0. */
9972 rtx xtmp = gen_reg_rtx (mmsk);
9973 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9974 gen_rtx_SUBREG (mmsk, xdst, 0)));
9975 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9977 /* Calculate the approximate square root. */
9978 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9981 /* Finalize the approximation. */
9982 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9984 return true;
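/* Illustrative scalar model (not part of the original source) of the RTL
   sequence emitted above.  FRSQRTS computes (3 - a*b) / 2, so each loop
   iteration mirrors one FMUL + FRSQRTS (+ FMUL) group; the trailing
   multiply by SRC turns the reciprocal square root into the square root.
   The 0.0 masking done above is omitted here and the names are
   hypothetical.  */
static inline double
approx_sqrt_model (double src, double estimate, int iterations, int recp)
{
  double x = estimate;                       /* FRSQRTE initial estimate.  */
  double step = 1.0;
  while (iterations--)
    {
      step = (3.0 - src * (x * x)) / 2.0;    /* FRSQRTS (src, x * x).  */
      if (iterations > 0)
        x = x * step;
    }
  if (!recp)
    x = x * src;                             /* rsqrt(src) -> sqrt(src).  */
  return x * step;                           /* Finalize the approximation.  */
}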
9987 typedef rtx (*recpe_type) (rtx, rtx);
9989 /* Select reciprocal initial estimate insn depending on machine mode. */
9991 static recpe_type
9992 get_recpe_type (machine_mode mode)
9994 switch (mode)
9996 case E_SFmode: return (gen_aarch64_frecpesf);
9997 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9998 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9999 case E_DFmode: return (gen_aarch64_frecpedf);
10000 case E_V2DFmode: return (gen_aarch64_frecpev2df);
10001 default: gcc_unreachable ();
10005 typedef rtx (*recps_type) (rtx, rtx, rtx);
10007 /* Select reciprocal series step insn depending on machine mode. */
10009 static recps_type
10010 get_recps_type (machine_mode mode)
10012 switch (mode)
10014 case E_SFmode: return (gen_aarch64_frecpssf);
10015 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
10016 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
10017 case E_DFmode: return (gen_aarch64_frecpsdf);
10018 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
10019 default: gcc_unreachable ();
10023 /* Emit the instruction sequence to compute the approximation for the division
10024 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10026 bool
10027 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10029 machine_mode mode = GET_MODE (quo);
10031 if (GET_MODE_INNER (mode) == HFmode)
10032 return false;
10034 bool use_approx_division_p = (flag_mlow_precision_div
10035 || (aarch64_tune_params.approx_modes->division
10036 & AARCH64_APPROX_MODE (mode)));
10038 if (!flag_finite_math_only
10039 || flag_trapping_math
10040 || !flag_unsafe_math_optimizations
10041 || optimize_function_for_size_p (cfun)
10042 || !use_approx_division_p)
10043 return false;
10045 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10046 return false;
10048 /* Estimate the approximate reciprocal. */
10049 rtx xrcp = gen_reg_rtx (mode);
10050 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10052 /* Iterate over the series twice for SF and thrice for DF. */
10053 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10055 /* Optionally run one fewer iteration for better performance, while
10056 sacrificing some accuracy. */
10057 if (flag_mlow_precision_div)
10058 iterations--;
10060 /* Iterate over the series to calculate the approximate reciprocal. */
10061 rtx xtmp = gen_reg_rtx (mode);
10062 while (iterations--)
10064 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10066 if (iterations > 0)
10067 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10070 if (num != CONST1_RTX (mode))
10072 /* As the approximate reciprocal of DEN is already calculated, only
10073 calculate the approximate division when NUM is not 1.0. */
10074 rtx xnum = force_reg (mode, num);
10075 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10078 /* Finalize the approximation. */
10079 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10080 return true;
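/* Illustrative scalar model (not part of the original source) of the
   division approximation above.  FRECPS computes 2 - a*b, so each loop
   iteration refines the reciprocal estimate X of DEN as x * (2 - den*x);
   the quotient is then num * (1/den).  Names are hypothetical.  */
static inline double
approx_div_model (double num, double den, double estimate, int iterations)
{
  double x = estimate;                /* FRECPE initial estimate of 1/den.  */
  double step = 1.0;
  while (iterations--)
    {
      step = 2.0 - den * x;           /* FRECPS (x, den).  */
      if (iterations > 0)
        x = x * step;
    }
  if (num != 1.0)
    x = x * num;                      /* Skip the multiply when NUM is 1.0.  */
  return x * step;                    /* Finalize the approximation.  */
}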
10083 /* Return the number of instructions that can be issued per cycle. */
10084 static int
10085 aarch64_sched_issue_rate (void)
10087 return aarch64_tune_params.issue_rate;
10090 static int
10091 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10093 int issue_rate = aarch64_sched_issue_rate ();
10095 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10099 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10100 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10101 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10103 static int
10104 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10105 int ready_index)
10107 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10111 /* Vectorizer cost model target hooks. */
10113 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10114 static int
10115 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10116 tree vectype,
10117 int misalign ATTRIBUTE_UNUSED)
10119 unsigned elements;
10120 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10121 bool fp = false;
10123 if (vectype != NULL)
10124 fp = FLOAT_TYPE_P (vectype);
10126 switch (type_of_cost)
10128 case scalar_stmt:
10129 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10131 case scalar_load:
10132 return costs->scalar_load_cost;
10134 case scalar_store:
10135 return costs->scalar_store_cost;
10137 case vector_stmt:
10138 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10140 case vector_load:
10141 return costs->vec_align_load_cost;
10143 case vector_store:
10144 return costs->vec_store_cost;
10146 case vec_to_scalar:
10147 return costs->vec_to_scalar_cost;
10149 case scalar_to_vec:
10150 return costs->scalar_to_vec_cost;
10152 case unaligned_load:
10153 case vector_gather_load:
10154 return costs->vec_unalign_load_cost;
10156 case unaligned_store:
10157 case vector_scatter_store:
10158 return costs->vec_unalign_store_cost;
10160 case cond_branch_taken:
10161 return costs->cond_taken_branch_cost;
10163 case cond_branch_not_taken:
10164 return costs->cond_not_taken_branch_cost;
10166 case vec_perm:
10167 return costs->vec_permute_cost;
10169 case vec_promote_demote:
10170 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10172 case vec_construct:
10173 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10174 return elements / 2 + 1;
10176 default:
10177 gcc_unreachable ();
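/* Illustrative sketch (not part of the original source): the vec_construct
   formula above for a few common fixed-width vector types.  The helper name
   is hypothetical.  */
static inline unsigned
vec_construct_cost_example (unsigned elements)
{
  /* V2DI/V2DF: 2/2 + 1 = 2;  V4SI/V4SF: 4/2 + 1 = 3;  V16QI: 16/2 + 1 = 9.  */
  return elements / 2 + 1;
}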
10181 /* Implement targetm.vectorize.add_stmt_cost. */
10182 static unsigned
10183 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10184 struct _stmt_vec_info *stmt_info, int misalign,
10185 enum vect_cost_model_location where)
10187 unsigned *cost = (unsigned *) data;
10188 unsigned retval = 0;
10190 if (flag_vect_cost_model)
10192 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10193 int stmt_cost =
10194 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10196 /* Statements in an inner loop relative to the loop being
10197 vectorized are weighted more heavily. The value here is
10198 arbitrary and could potentially be improved with analysis. */
10199 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10200 count *= 50; /* FIXME */
10202 retval = (unsigned) (count * stmt_cost);
10203 cost[where] += retval;
10206 return retval;
10209 static void initialize_aarch64_code_model (struct gcc_options *);
10211 /* Parse the TO_PARSE string and put the architecture struct that it
10212 selects into RES and the architectural features into ISA_FLAGS.
10213 Return an aarch64_parse_opt_result describing the parse result.
10214 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10216 static enum aarch64_parse_opt_result
10217 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10218 unsigned long *isa_flags)
10220 char *ext;
10221 const struct processor *arch;
10222 char *str = (char *) alloca (strlen (to_parse) + 1);
10223 size_t len;
10225 strcpy (str, to_parse);
10227 ext = strchr (str, '+');
10229 if (ext != NULL)
10230 len = ext - str;
10231 else
10232 len = strlen (str);
10234 if (len == 0)
10235 return AARCH64_PARSE_MISSING_ARG;
10238 /* Loop through the list of supported ARCHes to find a match. */
10239 for (arch = all_architectures; arch->name != NULL; arch++)
10241 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10243 unsigned long isa_temp = arch->flags;
10245 if (ext != NULL)
10247 /* TO_PARSE string contains at least one extension. */
10248 enum aarch64_parse_opt_result ext_res
10249 = aarch64_parse_extension (ext, &isa_temp);
10251 if (ext_res != AARCH64_PARSE_OK)
10252 return ext_res;
10254 /* Extension parsing was successful. Confirm the result
10255 arch and ISA flags. */
10256 *res = arch;
10257 *isa_flags = isa_temp;
10258 return AARCH64_PARSE_OK;
10262 /* ARCH name not found in list. */
10263 return AARCH64_PARSE_INVALID_ARG;
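/* Illustrative sketch (not part of the original source) of the
   name/extension split used above: everything before the first '+' names
   the base architecture (or CPU), and the remainder is handed to
   aarch64_parse_extension.  The helper name is hypothetical.  */
static inline size_t
arch_name_length_example (const char *to_parse)
{
  /* "armv8.2-a+fp16" -> 9 (the length of "armv8.2-a");
     "armv8-a"        -> strlen ("armv8-a").  */
  const char *ext = strchr (to_parse, '+');
  return ext != NULL ? (size_t) (ext - to_parse) : strlen (to_parse);
}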
10266 /* Parse the TO_PARSE string and put the result tuning in RES and the
10267 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10268 describing the parse result. If there is an error parsing, RES and
10269 ISA_FLAGS are left unchanged. */
10271 static enum aarch64_parse_opt_result
10272 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10273 unsigned long *isa_flags)
10275 char *ext;
10276 const struct processor *cpu;
10277 char *str = (char *) alloca (strlen (to_parse) + 1);
10278 size_t len;
10280 strcpy (str, to_parse);
10282 ext = strchr (str, '+');
10284 if (ext != NULL)
10285 len = ext - str;
10286 else
10287 len = strlen (str);
10289 if (len == 0)
10290 return AARCH64_PARSE_MISSING_ARG;
10293 /* Loop through the list of supported CPUs to find a match. */
10294 for (cpu = all_cores; cpu->name != NULL; cpu++)
10296 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10298 unsigned long isa_temp = cpu->flags;
10301 if (ext != NULL)
10303 /* TO_PARSE string contains at least one extension. */
10304 enum aarch64_parse_opt_result ext_res
10305 = aarch64_parse_extension (ext, &isa_temp);
10307 if (ext_res != AARCH64_PARSE_OK)
10308 return ext_res;
10310 /* Extension parsing was successful. Confirm the result
10311 cpu and ISA flags. */
10312 *res = cpu;
10313 *isa_flags = isa_temp;
10314 return AARCH64_PARSE_OK;
10318 /* CPU name not found in list. */
10319 return AARCH64_PARSE_INVALID_ARG;
10322 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10323 Return an aarch64_parse_opt_result describing the parse result.
10324 If the parsing fails, RES does not change. */
10326 static enum aarch64_parse_opt_result
10327 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10329 const struct processor *cpu;
10330 char *str = (char *) alloca (strlen (to_parse) + 1);
10332 strcpy (str, to_parse);
10334 /* Loop through the list of supported CPUs to find a match. */
10335 for (cpu = all_cores; cpu->name != NULL; cpu++)
10337 if (strcmp (cpu->name, str) == 0)
10339 *res = cpu;
10340 return AARCH64_PARSE_OK;
10344 /* CPU name not found in list. */
10345 return AARCH64_PARSE_INVALID_ARG;
10348 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10349 described in FLAG. If it is, return the index bit for that fusion type.
10350 If not, error (printing OPTION_NAME) and return zero. */
10352 static unsigned int
10353 aarch64_parse_one_option_token (const char *token,
10354 size_t length,
10355 const struct aarch64_flag_desc *flag,
10356 const char *option_name)
10358 for (; flag->name != NULL; flag++)
10360 if (length == strlen (flag->name)
10361 && !strncmp (flag->name, token, length))
10362 return flag->flag;
10365 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10366 return 0;
10369 /* Parse OPTION which is a comma-separated list of flags to enable.
10370 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10371 default state we inherit from the CPU tuning structures. OPTION_NAME
10372 gives the top-level option we are parsing in the -moverride string,
10373 for use in error messages. */
10375 static unsigned int
10376 aarch64_parse_boolean_options (const char *option,
10377 const struct aarch64_flag_desc *flags,
10378 unsigned int initial_state,
10379 const char *option_name)
10381 const char separator = '.';
10382 const char* specs = option;
10383 const char* ntoken = option;
10384 unsigned int found_flags = initial_state;
10386 while ((ntoken = strchr (specs, separator)))
10388 size_t token_length = ntoken - specs;
10389 unsigned token_ops = aarch64_parse_one_option_token (specs,
10390 token_length,
10391 flags,
10392 option_name);
10393 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10394 in the token stream, reset the supported operations. So:
10396 adrp+add.cmp+branch.none.adrp+add
10398 would have the result of turning on only adrp+add fusion. */
10399 if (!token_ops)
10400 found_flags = 0;
10402 found_flags |= token_ops;
10403 specs = ++ntoken;
10406 /* The string ended with a trailing separator, leaving no final token; report an error. */
10407 if (!(*specs))
10409 error ("%s string ill-formed\n", option_name);
10410 return 0;
10413 /* We still have one more token to parse. */
10414 size_t token_length = strlen (specs);
10415 unsigned token_ops = aarch64_parse_one_option_token (specs,
10416 token_length,
10417 flags,
10418 option_name);
10419 if (!token_ops)
10420 found_flags = 0;
10422 found_flags |= token_ops;
10423 return found_flags;
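/* Illustrative model (not part of the original source) of the semantics
   above: flags accumulate left to right across '.'-separated tokens, and a
   token that yields no bits (e.g. "none", or an unrecognised name) clears
   everything accumulated so far.  The bit values are hypothetical.  */
static inline unsigned int
boolean_options_model_example (void)
{
  const unsigned int adrp_add = 1u << 0, cmp_branch = 1u << 1;
  unsigned int flags = 0;
  flags |= adrp_add;     /* "adrp+add"   -> ADRP_ADD                */
  flags |= cmp_branch;   /* "cmp+branch" -> ADRP_ADD | CMP_BRANCH   */
  flags = 0;             /* "none"       -> everything reset        */
  flags |= adrp_add;     /* "adrp+add"   -> ADRP_ADD only           */
  return flags;
}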
10426 /* Support for overriding instruction fusion. */
10428 static void
10429 aarch64_parse_fuse_string (const char *fuse_string,
10430 struct tune_params *tune)
10432 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10433 aarch64_fusible_pairs,
10434 tune->fusible_ops,
10435 "fuse=");
10438 /* Support for overriding other tuning flags. */
10440 static void
10441 aarch64_parse_tune_string (const char *tune_string,
10442 struct tune_params *tune)
10444 tune->extra_tuning_flags
10445 = aarch64_parse_boolean_options (tune_string,
10446 aarch64_tuning_flags,
10447 tune->extra_tuning_flags,
10448 "tune=");
10451 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10452 we understand. If it is, extract the option string and hand it off to
10453 the appropriate function. */
10455 void
10456 aarch64_parse_one_override_token (const char* token,
10457 size_t length,
10458 struct tune_params *tune)
10460 const struct aarch64_tuning_override_function *fn
10461 = aarch64_tuning_override_functions;
10463 const char *option_part = strchr (token, '=');
10464 if (!option_part)
10466 error ("tuning string missing in option (%s)", token);
10467 return;
10470 /* Get the length of the option name. */
10471 length = option_part - token;
10472 /* Skip the '=' to get to the option string. */
10473 option_part++;
10475 for (; fn->name != NULL; fn++)
10477 if (!strncmp (fn->name, token, length))
10479 fn->parse_override (option_part, tune);
10480 return;
10484 error ("unknown tuning option (%s)", token);
10485 return;
10488 /* Validate and clamp the TLS size against the selected code model. */
10490 static void
10491 initialize_aarch64_tls_size (struct gcc_options *opts)
10493 if (aarch64_tls_size == 0)
10494 aarch64_tls_size = 24;
10496 switch (opts->x_aarch64_cmodel_var)
10498 case AARCH64_CMODEL_TINY:
10499 /* Both the default and the maximum TLS size allowed under tiny are 1M,
10500 which needs two instructions to address, so we clamp the size to 24. */
10501 if (aarch64_tls_size > 24)
10502 aarch64_tls_size = 24;
10503 break;
10504 case AARCH64_CMODEL_SMALL:
10505 /* The maximum TLS size allowed under small is 4G. */
10506 if (aarch64_tls_size > 32)
10507 aarch64_tls_size = 32;
10508 break;
10509 case AARCH64_CMODEL_LARGE:
10510 /* The maximum TLS size allowed under large is 16E.
10511 FIXME: 16E would need a 64-bit offset, but we only support a 48-bit offset now. */
10512 if (aarch64_tls_size > 48)
10513 aarch64_tls_size = 48;
10514 break;
10515 default:
10516 gcc_unreachable ();
10519 return;
10522 /* Parse STRING looking for options in the format:
10523 string :: option:string
10524 option :: name=substring
10525 name :: {a-z}
10526 substring :: defined by option. */
10528 static void
10529 aarch64_parse_override_string (const char* input_string,
10530 struct tune_params* tune)
10532 const char separator = ':';
10533 size_t string_length = strlen (input_string) + 1;
10534 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10535 char *string = string_root;
10536 strncpy (string, input_string, string_length);
10537 string[string_length - 1] = '\0';
10539 char* ntoken = string;
10541 while ((ntoken = strchr (string, separator)))
10543 size_t token_length = ntoken - string;
10544 /* Make this substring look like a string. */
10545 *ntoken = '\0';
10546 aarch64_parse_one_override_token (string, token_length, tune);
10547 string = ++ntoken;
10550 /* One last option to parse. */
10551 aarch64_parse_one_override_token (string, strlen (string), tune);
10552 free (string_root);
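/* Illustrative sketch (not part of the original source): an -moverride
   string such as "fuse=adrp+add.cmp+branch:tune=<flags>" is split at ':'
   into per-option tokens, each of which is then split at '=' by
   aarch64_parse_one_override_token.  The counting helper is hypothetical.  */
static inline int
override_token_count_example (const char *str)
{
  int count = 1;
  for (const char *p = strchr (str, ':'); p != NULL; p = strchr (p + 1, ':'))
    count++;
  return count;   /* 2 for the example string above.  */
}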
10556 static void
10557 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10559 /* PR 70044: We have to be careful about being called multiple times for the
10560 same function. This means all changes should be repeatable. */
10562 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10563 Disable the frame pointer flag so the mid-end will not use a frame
10564 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10565 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10566 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10567 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10568 if (opts->x_flag_omit_frame_pointer == 0)
10569 opts->x_flag_omit_frame_pointer = 2;
10571 /* If not optimizing for size, set the default
10572 alignment to what the target wants. */
10573 if (!opts->x_optimize_size)
10575 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10576 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10577 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10578 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10579 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10580 opts->x_str_align_functions = aarch64_tune_params.function_align;
10583 /* We default to no pc-relative literal loads. */
10585 aarch64_pcrelative_literal_loads = false;
10587 /* If -mpc-relative-literal-loads is set on the command line, this
10588 implies that the user asked for PC relative literal loads. */
10589 if (opts->x_pcrelative_literal_loads == 1)
10590 aarch64_pcrelative_literal_loads = true;
10592 /* In the tiny memory model it makes no sense to disallow PC relative
10593 literal pool loads. */
10594 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10595 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10596 aarch64_pcrelative_literal_loads = true;
10598 /* When enabling the lower precision Newton series for the square root, also
10599 enable it for the reciprocal square root, since the latter is an
10600 intermediary step for the former. */
10601 if (flag_mlow_precision_sqrt)
10602 flag_mrecip_low_precision_sqrt = true;
10605 /* 'Unpack' the internal tuning structs and update the options
10606 in OPTS. The caller must have set up selected_tune and selected_arch
10607 as all the other target-specific codegen decisions are
10608 derived from them. */
10610 void
10611 aarch64_override_options_internal (struct gcc_options *opts)
10613 aarch64_tune_flags = selected_tune->flags;
10614 aarch64_tune = selected_tune->sched_core;
10615 /* Make a copy of the tuning parameters attached to the core, which
10616 we may later overwrite. */
10617 aarch64_tune_params = *(selected_tune->tune);
10618 aarch64_architecture_version = selected_arch->architecture_version;
10620 if (opts->x_aarch64_override_tune_string)
10621 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10622 &aarch64_tune_params);
10624 /* This target defaults to strict volatile bitfields. */
10625 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10626 opts->x_flag_strict_volatile_bitfields = 1;
10628 initialize_aarch64_code_model (opts);
10629 initialize_aarch64_tls_size (opts);
10631 int queue_depth = 0;
10632 switch (aarch64_tune_params.autoprefetcher_model)
10634 case tune_params::AUTOPREFETCHER_OFF:
10635 queue_depth = -1;
10636 break;
10637 case tune_params::AUTOPREFETCHER_WEAK:
10638 queue_depth = 0;
10639 break;
10640 case tune_params::AUTOPREFETCHER_STRONG:
10641 queue_depth = max_insn_queue_index + 1;
10642 break;
10643 default:
10644 gcc_unreachable ();
10647 /* We don't mind passing in global_options_set here as we don't use
10648 the *options_set structs anyway. */
10649 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10650 queue_depth,
10651 opts->x_param_values,
10652 global_options_set.x_param_values);
10654 /* Set up parameters to be used in prefetching algorithm. Do not
10655 override the defaults unless we are tuning for a core we have
10656 researched values for. */
10657 if (aarch64_tune_params.prefetch->num_slots > 0)
10658 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10659 aarch64_tune_params.prefetch->num_slots,
10660 opts->x_param_values,
10661 global_options_set.x_param_values);
10662 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10663 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10664 aarch64_tune_params.prefetch->l1_cache_size,
10665 opts->x_param_values,
10666 global_options_set.x_param_values);
10667 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10668 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10669 aarch64_tune_params.prefetch->l1_cache_line_size,
10670 opts->x_param_values,
10671 global_options_set.x_param_values);
10672 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10673 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10674 aarch64_tune_params.prefetch->l2_cache_size,
10675 opts->x_param_values,
10676 global_options_set.x_param_values);
10677 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10678 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10680 opts->x_param_values,
10681 global_options_set.x_param_values);
10682 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10683 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10684 aarch64_tune_params.prefetch->minimum_stride,
10685 opts->x_param_values,
10686 global_options_set.x_param_values);
10688 /* Use the alternative scheduling-pressure algorithm by default. */
10689 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10690 opts->x_param_values,
10691 global_options_set.x_param_values);
10693 /* Enable software prefetching at the specified optimization level for
10694 CPUs that have prefetch tuning parameters. Lower the optimization level
10695 threshold by 1 when profiling is enabled. */
10696 if (opts->x_flag_prefetch_loop_arrays < 0
10697 && !opts->x_optimize_size
10698 && aarch64_tune_params.prefetch->default_opt_level >= 0
10699 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10700 opts->x_flag_prefetch_loop_arrays = 1;
10702 aarch64_override_options_after_change_1 (opts);
10705 /* Print a hint with a suggestion for a core or architecture name that
10706 most closely resembles what the user passed in STR. ARCH is true if
10707 the user is asking for an architecture name. ARCH is false if the user
10708 is asking for a core name. */
10710 static void
10711 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10713 auto_vec<const char *> candidates;
10714 const struct processor *entry = arch ? all_architectures : all_cores;
10715 for (; entry->name != NULL; entry++)
10716 candidates.safe_push (entry->name);
10718 #ifdef HAVE_LOCAL_CPU_DETECT
10719 /* Add also "native" as possible value. */
10720 if (arch)
10721 candidates.safe_push ("native");
10722 #endif
10724 char *s;
10725 const char *hint = candidates_list_and_hint (str, s, candidates);
10726 if (hint)
10727 inform (input_location, "valid arguments are: %s;"
10728 " did you mean %qs?", s, hint);
10729 else
10730 inform (input_location, "valid arguments are: %s", s);
10732 XDELETEVEC (s);
10735 /* Print a hint with a suggestion for a core name that most closely resembles
10736 what the user passed in STR. */
10738 inline static void
10739 aarch64_print_hint_for_core (const char *str)
10741 aarch64_print_hint_for_core_or_arch (str, false);
10744 /* Print a hint with a suggestion for an architecture name that most closely
10745 resembles what the user passed in STR. */
10747 inline static void
10748 aarch64_print_hint_for_arch (const char *str)
10750 aarch64_print_hint_for_core_or_arch (str, true);
10753 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10754 specified in STR and throw errors if appropriate. Put the results if
10755 they are valid in RES and ISA_FLAGS. Return whether the option is
10756 valid. */
10758 static bool
10759 aarch64_validate_mcpu (const char *str, const struct processor **res,
10760 unsigned long *isa_flags)
10762 enum aarch64_parse_opt_result parse_res
10763 = aarch64_parse_cpu (str, res, isa_flags);
10765 if (parse_res == AARCH64_PARSE_OK)
10766 return true;
10768 switch (parse_res)
10770 case AARCH64_PARSE_MISSING_ARG:
10771 error ("missing cpu name in %<-mcpu=%s%>", str);
10772 break;
10773 case AARCH64_PARSE_INVALID_ARG:
10774 error ("unknown value %qs for -mcpu", str);
10775 aarch64_print_hint_for_core (str);
10776 break;
10777 case AARCH64_PARSE_INVALID_FEATURE:
10778 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10779 break;
10780 default:
10781 gcc_unreachable ();
10784 return false;
10787 /* Validate a command-line -march option. Parse the arch and extensions
10788 (if any) specified in STR and throw errors if appropriate. Put the
10789 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10790 option is valid. */
10792 static bool
10793 aarch64_validate_march (const char *str, const struct processor **res,
10794 unsigned long *isa_flags)
10796 enum aarch64_parse_opt_result parse_res
10797 = aarch64_parse_arch (str, res, isa_flags);
10799 if (parse_res == AARCH64_PARSE_OK)
10800 return true;
10802 switch (parse_res)
10804 case AARCH64_PARSE_MISSING_ARG:
10805 error ("missing arch name in %<-march=%s%>", str);
10806 break;
10807 case AARCH64_PARSE_INVALID_ARG:
10808 error ("unknown value %qs for -march", str);
10809 aarch64_print_hint_for_arch (str);
10810 break;
10811 case AARCH64_PARSE_INVALID_FEATURE:
10812 error ("invalid feature modifier in %<-march=%s%>", str);
10813 break;
10814 default:
10815 gcc_unreachable ();
10818 return false;
10821 /* Validate a command-line -mtune option. Parse the cpu
10822 specified in STR and throw errors if appropriate. Put the
10823 result, if it is valid, in RES. Return whether the option is
10824 valid. */
10826 static bool
10827 aarch64_validate_mtune (const char *str, const struct processor **res)
10829 enum aarch64_parse_opt_result parse_res
10830 = aarch64_parse_tune (str, res);
10832 if (parse_res == AARCH64_PARSE_OK)
10833 return true;
10835 switch (parse_res)
10837 case AARCH64_PARSE_MISSING_ARG:
10838 error ("missing cpu name in %<-mtune=%s%>", str);
10839 break;
10840 case AARCH64_PARSE_INVALID_ARG:
10841 error ("unknown value %qs for -mtune", str);
10842 aarch64_print_hint_for_core (str);
10843 break;
10844 default:
10845 gcc_unreachable ();
10847 return false;
10850 /* Return the CPU corresponding to the enum CPU.
10851 If it doesn't specify a cpu, return the default. */
10853 static const struct processor *
10854 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10856 if (cpu != aarch64_none)
10857 return &all_cores[cpu];
10859 /* The & 0x3f is to extract the bottom 6 bits that encode the
10860 default cpu as selected by the --with-cpu GCC configure option
10861 in config.gcc.
10862 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10863 flags mechanism should be reworked to make it more sane. */
10864 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10867 /* Return the architecture corresponding to the enum ARCH.
10868 If it doesn't specify a valid architecture, return the default. */
10870 static const struct processor *
10871 aarch64_get_arch (enum aarch64_arch arch)
10873 if (arch != aarch64_no_arch)
10874 return &all_architectures[arch];
10876 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10878 return &all_architectures[cpu->arch];
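/* Illustrative sketch (not part of the original source) of the packing that
   this code and aarch64_override_options rely on: the configure-time
   TARGET_CPU_DEFAULT keeps the CPU index in its bottom 6 bits and the
   default ISA flags in the remaining bits, unpacked as (x & 0x3f) and
   (x >> 6) respectively.  The helper name is hypothetical.  */
static inline unsigned long
cpu_default_pack_example (unsigned int cpu_index, unsigned long isa_flags)
{
  return (isa_flags << 6) | (cpu_index & 0x3f);
}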
10881 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10883 static poly_uint16
10884 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10886 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10887 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10888 deciding which .md file patterns to use and when deciding whether
10889 something is a legitimate address or constant. */
10890 if (value == SVE_SCALABLE || value == SVE_128)
10891 return poly_uint16 (2, 2);
10892 else
10893 return (int) value / 64;
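/* Illustrative sketch (not part of the original source): the fixed-width
   cases of the mapping above.  VG counts 64-bit granules, so
   -msve-vector-bits=256 gives VG = 4 and -msve-vector-bits=512 gives
   VG = 8, while 128 and "scalable" take the poly_uint16 (2, 2) path
   instead.  The helper name is hypothetical.  */
static inline int
sve_bits_to_vg_example (int bits)
{
  return bits / 64;   /* 256 -> 4, 512 -> 8, 1024 -> 16, 2048 -> 32.  */
}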
10896 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10897 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10898 tuning structs. In particular it must set selected_tune and
10899 aarch64_isa_flags that define the available ISA features and tuning
10900 decisions. It must also set selected_arch as this will be used to
10901 output the .arch asm tags for each function. */
10903 static void
10904 aarch64_override_options (void)
10906 unsigned long cpu_isa = 0;
10907 unsigned long arch_isa = 0;
10908 aarch64_isa_flags = 0;
10910 bool valid_cpu = true;
10911 bool valid_tune = true;
10912 bool valid_arch = true;
10914 selected_cpu = NULL;
10915 selected_arch = NULL;
10916 selected_tune = NULL;
10918 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10919 If either of -march or -mtune is given, they override their
10920 respective component of -mcpu. */
10921 if (aarch64_cpu_string)
10922 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10923 &cpu_isa);
10925 if (aarch64_arch_string)
10926 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10927 &arch_isa);
10929 if (aarch64_tune_string)
10930 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10932 /* If the user did not specify a processor, choose the default
10933 one for them. This will be the CPU set during configuration using
10934 --with-cpu, otherwise it is "generic". */
10935 if (!selected_cpu)
10937 if (selected_arch)
10939 selected_cpu = &all_cores[selected_arch->ident];
10940 aarch64_isa_flags = arch_isa;
10941 explicit_arch = selected_arch->arch;
10943 else
10945 /* Get default configure-time CPU. */
10946 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10947 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10950 if (selected_tune)
10951 explicit_tune_core = selected_tune->ident;
10953 /* If both -mcpu and -march are specified check that they are architecturally
10954 compatible, warn if they're not and prefer the -march ISA flags. */
10955 else if (selected_arch)
10957 if (selected_arch->arch != selected_cpu->arch)
10959 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10960 all_architectures[selected_cpu->arch].name,
10961 selected_arch->name);
10963 aarch64_isa_flags = arch_isa;
10964 explicit_arch = selected_arch->arch;
10965 explicit_tune_core = selected_tune ? selected_tune->ident
10966 : selected_cpu->ident;
10968 else
10970 /* -mcpu but no -march. */
10971 aarch64_isa_flags = cpu_isa;
10972 explicit_tune_core = selected_tune ? selected_tune->ident
10973 : selected_cpu->ident;
10974 gcc_assert (selected_cpu);
10975 selected_arch = &all_architectures[selected_cpu->arch];
10976 explicit_arch = selected_arch->arch;
10979 /* Set the arch as well, as we will need it when outputting
10980 the .arch directive in assembly. */
10981 if (!selected_arch)
10983 gcc_assert (selected_cpu);
10984 selected_arch = &all_architectures[selected_cpu->arch];
10987 if (!selected_tune)
10988 selected_tune = selected_cpu;
10990 #ifndef HAVE_AS_MABI_OPTION
10991 /* The compiler may have been configured with 2.23.* binutils, which does
10992 not have support for ILP32. */
10993 if (TARGET_ILP32)
10994 error ("assembler does not support -mabi=ilp32");
10995 #endif
10997 /* Convert -msve-vector-bits to a VG count. */
10998 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11000 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11001 sorry ("return address signing is only supported for -mabi=lp64");
11003 /* Make sure we properly set up the explicit options. */
11004 if ((aarch64_cpu_string && valid_cpu)
11005 || (aarch64_tune_string && valid_tune))
11006 gcc_assert (explicit_tune_core != aarch64_none);
11008 if ((aarch64_cpu_string && valid_cpu)
11009 || (aarch64_arch_string && valid_arch))
11010 gcc_assert (explicit_arch != aarch64_no_arch);
11012 aarch64_override_options_internal (&global_options);
11014 /* Save these options as the default ones in case we push and pop them later
11015 while processing functions with potential target attributes. */
11016 target_option_default_node = target_option_current_node
11017 = build_target_option_node (&global_options);
11020 /* Implement targetm.override_options_after_change. */
11022 static void
11023 aarch64_override_options_after_change (void)
11025 aarch64_override_options_after_change_1 (&global_options);
11028 static struct machine_function *
11029 aarch64_init_machine_status (void)
11031 struct machine_function *machine;
11032 machine = ggc_cleared_alloc<machine_function> ();
11033 return machine;
11036 void
11037 aarch64_init_expanders (void)
11039 init_machine_status = aarch64_init_machine_status;
11042 /* Resolve the code model to use, taking -fpic/-fPIC into account. */
11043 static void
11044 initialize_aarch64_code_model (struct gcc_options *opts)
11046 if (opts->x_flag_pic)
11048 switch (opts->x_aarch64_cmodel_var)
11050 case AARCH64_CMODEL_TINY:
11051 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11052 break;
11053 case AARCH64_CMODEL_SMALL:
11054 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11055 aarch64_cmodel = (flag_pic == 2
11056 ? AARCH64_CMODEL_SMALL_PIC
11057 : AARCH64_CMODEL_SMALL_SPIC);
11058 #else
11059 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11060 #endif
11061 break;
11062 case AARCH64_CMODEL_LARGE:
11063 sorry ("code model %qs with -f%s", "large",
11064 opts->x_flag_pic > 1 ? "PIC" : "pic");
11065 break;
11066 default:
11067 gcc_unreachable ();
11070 else
11071 aarch64_cmodel = opts->x_aarch64_cmodel_var;
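/* Illustrative sketch (not part of the original source), assuming the
   assembler supports the small-PIC relocations: with the small code model,
   -fPIC (flag_pic == 2) selects AARCH64_CMODEL_SMALL_PIC and -fpic
   (flag_pic == 1) selects AARCH64_CMODEL_SMALL_SPIC, mirroring the switch
   above.  The return values here are stand-in integers.  */
static inline int
small_code_model_pic_example (int flag_pic_value)
{
  return flag_pic_value == 2 ? /* SMALL_PIC */ 1 : /* SMALL_SPIC */ 2;
}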
11074 /* Implement TARGET_OPTION_SAVE. */
11076 static void
11077 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11079 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11082 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11083 using the information saved in PTR. */
11085 static void
11086 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11088 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11089 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11090 opts->x_explicit_arch = ptr->x_explicit_arch;
11091 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11092 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11094 aarch64_override_options_internal (opts);
11097 /* Implement TARGET_OPTION_PRINT. */
11099 static void
11100 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11102 const struct processor *cpu
11103 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11104 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11105 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11106 std::string extension
11107 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11109 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11110 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11111 arch->name, extension.c_str ());
11114 static GTY(()) tree aarch64_previous_fndecl;
11116 void
11117 aarch64_reset_previous_fndecl (void)
11119 aarch64_previous_fndecl = NULL;
11122 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11123 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11124 make sure optab availability predicates are recomputed when necessary. */
11126 void
11127 aarch64_save_restore_target_globals (tree new_tree)
11129 if (TREE_TARGET_GLOBALS (new_tree))
11130 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11131 else if (new_tree == target_option_default_node)
11132 restore_target_globals (&default_target_globals);
11133 else
11134 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11137 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11138 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11139 of the function, if such exists. This function may be called multiple
11140 times on a single function so use aarch64_previous_fndecl to avoid
11141 setting up identical state. */
11143 static void
11144 aarch64_set_current_function (tree fndecl)
11146 if (!fndecl || fndecl == aarch64_previous_fndecl)
11147 return;
11149 tree old_tree = (aarch64_previous_fndecl
11150 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11151 : NULL_TREE);
11153 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11155 /* If current function has no attributes but the previous one did,
11156 use the default node. */
11157 if (!new_tree && old_tree)
11158 new_tree = target_option_default_node;
11160 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11161 the default have been handled by aarch64_save_restore_target_globals from
11162 aarch64_pragma_target_parse. */
11163 if (old_tree == new_tree)
11164 return;
11166 aarch64_previous_fndecl = fndecl;
11168 /* First set the target options. */
11169 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11171 aarch64_save_restore_target_globals (new_tree);
11174 /* Enum describing the various ways we can handle attributes.
11175 In many cases we can reuse the generic option handling machinery. */
11177 enum aarch64_attr_opt_type
11179 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11180 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11181 aarch64_attr_enum, /* Attribute sets an enum variable. */
11182 aarch64_attr_custom /* Attribute requires a custom handling function. */
11185 /* All the information needed to handle a target attribute.
11186 NAME is the name of the attribute.
11187 ATTR_TYPE specifies the type of behavior of the attribute as described
11188 in the definition of enum aarch64_attr_opt_type.
11189 ALLOW_NEG is true if the attribute supports a "no-" form.
11190 HANDLER is the function that takes the attribute string as an argument.
11191 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11192 OPT_NUM is the enum specifying the option that the attribute modifies.
11193 This is needed for attributes that mirror the behavior of a command-line
11194 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11195 aarch64_attr_enum. */
11197 struct aarch64_attribute_info
11199 const char *name;
11200 enum aarch64_attr_opt_type attr_type;
11201 bool allow_neg;
11202 bool (*handler) (const char *);
11203 enum opt_code opt_num;
11206 /* Handle the ARCH_STR argument to the arch= target attribute. */
11208 static bool
11209 aarch64_handle_attr_arch (const char *str)
11211 const struct processor *tmp_arch = NULL;
11212 enum aarch64_parse_opt_result parse_res
11213 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11215 if (parse_res == AARCH64_PARSE_OK)
11217 gcc_assert (tmp_arch);
11218 selected_arch = tmp_arch;
11219 explicit_arch = selected_arch->arch;
11220 return true;
11223 switch (parse_res)
11225 case AARCH64_PARSE_MISSING_ARG:
11226 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11227 break;
11228 case AARCH64_PARSE_INVALID_ARG:
11229 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11230 aarch64_print_hint_for_arch (str);
11231 break;
11232 case AARCH64_PARSE_INVALID_FEATURE:
11233 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11234 break;
11235 default:
11236 gcc_unreachable ();
11239 return false;
11242 /* Handle the argument CPU_STR to the cpu= target attribute. */
11244 static bool
11245 aarch64_handle_attr_cpu (const char *str)
11247 const struct processor *tmp_cpu = NULL;
11248 enum aarch64_parse_opt_result parse_res
11249 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11251 if (parse_res == AARCH64_PARSE_OK)
11253 gcc_assert (tmp_cpu);
11254 selected_tune = tmp_cpu;
11255 explicit_tune_core = selected_tune->ident;
11257 selected_arch = &all_architectures[tmp_cpu->arch];
11258 explicit_arch = selected_arch->arch;
11259 return true;
11262 switch (parse_res)
11264 case AARCH64_PARSE_MISSING_ARG:
11265 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11266 break;
11267 case AARCH64_PARSE_INVALID_ARG:
11268 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11269 aarch64_print_hint_for_core (str);
11270 break;
11271 case AARCH64_PARSE_INVALID_FEATURE:
11272 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11273 break;
11274 default:
11275 gcc_unreachable ();
11278 return false;
11281 /* Handle the argument STR to the tune= target attribute. */
11283 static bool
11284 aarch64_handle_attr_tune (const char *str)
11286 const struct processor *tmp_tune = NULL;
11287 enum aarch64_parse_opt_result parse_res
11288 = aarch64_parse_tune (str, &tmp_tune);
11290 if (parse_res == AARCH64_PARSE_OK)
11292 gcc_assert (tmp_tune);
11293 selected_tune = tmp_tune;
11294 explicit_tune_core = selected_tune->ident;
11295 return true;
11298 switch (parse_res)
11300 case AARCH64_PARSE_INVALID_ARG:
11301 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11302 aarch64_print_hint_for_core (str);
11303 break;
11304 default:
11305 gcc_unreachable ();
11308 return false;
11311 /* Parse an architecture extensions target attribute string specified in STR.
11312 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11313 if successful. Update aarch64_isa_flags to reflect the ISA features
11314 modified. */
11316 static bool
11317 aarch64_handle_attr_isa_flags (char *str)
11319 enum aarch64_parse_opt_result parse_res;
11320 unsigned long isa_flags = aarch64_isa_flags;
11322 /* We allow "+nothing" in the beginning to clear out all architectural
11323 features if the user wants to handpick specific features. */
11324 if (strncmp ("+nothing", str, 8) == 0)
11326 isa_flags = 0;
11327 str += 8;
11330 parse_res = aarch64_parse_extension (str, &isa_flags);
11332 if (parse_res == AARCH64_PARSE_OK)
11334 aarch64_isa_flags = isa_flags;
11335 return true;
11338 switch (parse_res)
11340 case AARCH64_PARSE_MISSING_ARG:
11341 error ("missing value in %<target()%> pragma or attribute");
11342 break;
11344 case AARCH64_PARSE_INVALID_FEATURE:
11345 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11346 break;
11348 default:
11349 gcc_unreachable ();
11352 return false;
11355 /* The target attributes that we support. On top of these we also support just
11356 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11357 handled explicitly in aarch64_process_one_target_attr. */
11359 static const struct aarch64_attribute_info aarch64_attributes[] =
11361 { "general-regs-only", aarch64_attr_mask, false, NULL,
11362 OPT_mgeneral_regs_only },
11363 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11364 OPT_mfix_cortex_a53_835769 },
11365 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11366 OPT_mfix_cortex_a53_843419 },
11367 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11368 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11369 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11370 OPT_momit_leaf_frame_pointer },
11371 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11372 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11373 OPT_march_ },
11374 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11375 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11376 OPT_mtune_ },
11377 { "sign-return-address", aarch64_attr_enum, false, NULL,
11378 OPT_msign_return_address_ },
11379 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
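/* Illustrative usage sketch (not part of the original source): attribute
   forms accepted by the table above and by aarch64_process_one_target_attr,
   including a bare ISA-extension string and a "no-" boolean form.  The
   functions are placeholders and the block is disabled.  */
#if 0
__attribute__ ((target ("+crc")))
static int attr_isa_example (void) { return 0; }

__attribute__ ((target ("no-strict-align,cpu=cortex-a57")))
static int attr_list_example (void) { return 0; }
#endif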
11382 /* Parse ARG_STR which contains the definition of one target attribute.
11383 Show appropriate errors if any or return true if the attribute is valid. */
11385 static bool
11386 aarch64_process_one_target_attr (char *arg_str)
11388 bool invert = false;
11390 size_t len = strlen (arg_str);
11392 if (len == 0)
11394 error ("malformed %<target()%> pragma or attribute");
11395 return false;
11398 char *str_to_check = (char *) alloca (len + 1);
11399 strcpy (str_to_check, arg_str);
11401 /* Skip leading whitespace. */
11402 while (*str_to_check == ' ' || *str_to_check == '\t')
11403 str_to_check++;
11405 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11406 It is easier to detect and handle it explicitly here rather than going
11407 through the machinery for the rest of the target attributes in this
11408 function. */
11409 if (*str_to_check == '+')
11410 return aarch64_handle_attr_isa_flags (str_to_check);
11412 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11414 invert = true;
11415 str_to_check += 3;
11417 char *arg = strchr (str_to_check, '=');
11419 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11420 and point ARG to "foo". */
11421 if (arg)
11423 *arg = '\0';
11424 arg++;
11426 const struct aarch64_attribute_info *p_attr;
11427 bool found = false;
11428 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11430 /* If the names don't match up, or the user has given an argument
11431 to an attribute that doesn't accept one, or didn't give an argument
11432 to an attribute that expects one, fail to match. */
11433 if (strcmp (str_to_check, p_attr->name) != 0)
11434 continue;
11436 found = true;
11437 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11438 || p_attr->attr_type == aarch64_attr_enum;
11440 if (attr_need_arg_p ^ (arg != NULL))
11442 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11443 return false;
11446 /* If the name matches but the attribute does not allow "no-" versions
11447 then we can't match. */
11448 if (invert && !p_attr->allow_neg)
11450 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11451 return false;
11454 switch (p_attr->attr_type)
11456 /* Has a custom handler registered.
11457 For example, cpu=, arch=, tune=. */
11458 case aarch64_attr_custom:
11459 gcc_assert (p_attr->handler);
11460 if (!p_attr->handler (arg))
11461 return false;
11462 break;
11464 /* Either set or unset a boolean option. */
11465 case aarch64_attr_bool:
11467 struct cl_decoded_option decoded;
11469 generate_option (p_attr->opt_num, NULL, !invert,
11470 CL_TARGET, &decoded);
11471 aarch64_handle_option (&global_options, &global_options_set,
11472 &decoded, input_location);
11473 break;
11475 /* Set or unset a bit in the target_flags. aarch64_handle_option
11476 should know what mask to apply given the option number. */
11477 case aarch64_attr_mask:
11479 struct cl_decoded_option decoded;
11480 /* We only need to specify the option number.
11481 aarch64_handle_option will know which mask to apply. */
11482 decoded.opt_index = p_attr->opt_num;
11483 decoded.value = !invert;
11484 aarch64_handle_option (&global_options, &global_options_set,
11485 &decoded, input_location);
11486 break;
11488 /* Use the option setting machinery to set an option to an enum. */
11489 case aarch64_attr_enum:
11491 gcc_assert (arg);
11492 bool valid;
11493 int value;
11494 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11495 &value, CL_TARGET);
11496 if (valid)
11498 set_option (&global_options, NULL, p_attr->opt_num, value,
11499 NULL, DK_UNSPECIFIED, input_location,
11500 global_dc);
11502 else
11504 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11506 break;
11508 default:
11509 gcc_unreachable ();
11513 /* If we reached here we either have found an attribute and validated
11514 it or didn't match any. If we matched an attribute but its arguments
11515 were malformed we will have returned false already. */
11516 return found;
11519 /* Count how many times the character C appears in
11520 NULL-terminated string STR. */
11522 static unsigned int
11523 num_occurences_in_str (char c, char *str)
11525 unsigned int res = 0;
11526 while (*str != '\0')
11528 if (*str == c)
11529 res++;
11531 str++;
11534 return res;
11537 /* Parse the tree in ARGS that contains the target attribute information
11538 and update the global target options space. */
11540 bool
11541 aarch64_process_target_attr (tree args)
11543 if (TREE_CODE (args) == TREE_LIST)
11547 tree head = TREE_VALUE (args);
11548 if (head)
11550 if (!aarch64_process_target_attr (head))
11551 return false;
11553 args = TREE_CHAIN (args);
11554 } while (args);
11556 return true;
11559 if (TREE_CODE (args) != STRING_CST)
11561 error ("attribute %<target%> argument not a string");
11562 return false;
11565 size_t len = strlen (TREE_STRING_POINTER (args));
11566 char *str_to_check = (char *) alloca (len + 1);
11567 strcpy (str_to_check, TREE_STRING_POINTER (args));
11569 if (len == 0)
11571 error ("malformed %<target()%> pragma or attribute");
11572 return false;
11575 /* Used to catch empty strings between commas, i.e.
11576 attribute ((target ("attr1,,attr2"))). */
11577 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11579 /* Handle multiple target attributes separated by ','. */
11580 char *token = strtok (str_to_check, ",");
11582 unsigned int num_attrs = 0;
11583 while (token)
11585 num_attrs++;
11586 if (!aarch64_process_one_target_attr (token))
11588 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11589 return false;
11592 token = strtok (NULL, ",");
11595 if (num_attrs != num_commas + 1)
11597 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11598 return false;
11601 return true;
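/* Illustrative sketch (not part of the original source) of the comma check
   above: "a,b" has one comma and two tokens and is accepted, whereas
   "a,,b" has two commas but strtok still yields only two tokens, so
   num_attrs != num_commas + 1 and the attribute is rejected as malformed.
   The helper name is hypothetical.  */
static inline int
target_attr_well_formed_example (unsigned int num_commas,
                                 unsigned int num_attrs)
{
  return num_attrs == num_commas + 1;
}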
11604 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11605 process attribute ((target ("..."))). */
11607 static bool
11608 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11610 struct cl_target_option cur_target;
11611 bool ret;
11612 tree old_optimize;
11613 tree new_target, new_optimize;
11614 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11616 /* If what we're processing is the current pragma string then the
11617 target option node is already stored in target_option_current_node
11618 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11619 having to re-parse the string. This is especially useful to keep
11620 arm_neon.h compile times down since that header contains a lot
11621 of intrinsics enclosed in pragmas. */
11622 if (!existing_target && args == current_target_pragma)
11624 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11625 return true;
11627 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11629 old_optimize = build_optimization_node (&global_options);
11630 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11632 /* If the function changed the optimization levels as well as setting
11633 target options, start with the optimizations specified. */
11634 if (func_optimize && func_optimize != old_optimize)
11635 cl_optimization_restore (&global_options,
11636 TREE_OPTIMIZATION (func_optimize));
11638 /* Save the current target options to restore at the end. */
11639 cl_target_option_save (&cur_target, &global_options);
11641 /* If fndecl already has some target attributes applied to it, unpack
11642 them so that we add this attribute on top of them, rather than
11643 overwriting them. */
11644 if (existing_target)
11646 struct cl_target_option *existing_options
11647 = TREE_TARGET_OPTION (existing_target);
11649 if (existing_options)
11650 cl_target_option_restore (&global_options, existing_options);
11652 else
11653 cl_target_option_restore (&global_options,
11654 TREE_TARGET_OPTION (target_option_current_node));
11656 ret = aarch64_process_target_attr (args);
11658 /* Set up any additional state. */
11659 if (ret)
11661 aarch64_override_options_internal (&global_options);
11662 /* Initialize SIMD builtins if we haven't already.
11663 Set current_target_pragma to NULL for the duration so that
11664 the builtin initialization code doesn't try to tag the functions
11665 being built with the attributes specified by any current pragma, thus
11666 going into an infinite recursion. */
11667 if (TARGET_SIMD)
11669 tree saved_current_target_pragma = current_target_pragma;
11670 current_target_pragma = NULL;
11671 aarch64_init_simd_builtins ();
11672 current_target_pragma = saved_current_target_pragma;
11674 new_target = build_target_option_node (&global_options);
11676 else
11677 new_target = NULL;
11679 new_optimize = build_optimization_node (&global_options);
11681 if (fndecl && ret)
11683 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11685 if (old_optimize != new_optimize)
11686 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11689 cl_target_option_restore (&global_options, &cur_target);
11691 if (old_optimize != new_optimize)
11692 cl_optimization_restore (&global_options,
11693 TREE_OPTIMIZATION (old_optimize));
11694 return ret;
11697 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11698 tri-bool options (yes, no, don't care) and the default value is
11699 DEF, determine whether to reject inlining. */
11701 static bool
11702 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11703 int dont_care, int def)
11705 /* If the callee doesn't care, always allow inlining. */
11706 if (callee == dont_care)
11707 return true;
11709 /* If the caller doesn't care, always allow inlining. */
11710 if (caller == dont_care)
11711 return true;
11713 /* Otherwise, allow inlining if either the callee and caller values
11714 agree, or if the callee is using the default value. */
11715 return (callee == caller || callee == def);
11718 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11719 to inline CALLEE into CALLER based on target-specific info.
11720 Make sure that the caller and callee have compatible architectural
11721 features. Then go through the other possible target attributes
11722 and see if they can block inlining. Try not to reject always_inline
11723 callees unless they are incompatible architecturally. */
11725 static bool
11726 aarch64_can_inline_p (tree caller, tree callee)
11728 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11729 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11731 struct cl_target_option *caller_opts
11732 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11733 : target_option_default_node);
11735 struct cl_target_option *callee_opts
11736 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11737 : target_option_default_node);
11739 /* Callee's ISA flags should be a subset of the caller's. */
11740 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11741 != callee_opts->x_aarch64_isa_flags)
11742 return false;
11744 /* Allow non-strict-align functions to be inlined into
11745 strict-align ones. */
11746 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11747 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11748 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11749 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11750 return false;
11752 bool always_inline = lookup_attribute ("always_inline",
11753 DECL_ATTRIBUTES (callee));
11755 /* If the architectural features match up and the callee is always_inline
11756 then the other attributes don't matter. */
11757 if (always_inline)
11758 return true;
11760 if (caller_opts->x_aarch64_cmodel_var
11761 != callee_opts->x_aarch64_cmodel_var)
11762 return false;
11764 if (caller_opts->x_aarch64_tls_dialect
11765 != callee_opts->x_aarch64_tls_dialect)
11766 return false;
11768 /* Honour explicit requests to workaround errata. */
11769 if (!aarch64_tribools_ok_for_inlining_p (
11770 caller_opts->x_aarch64_fix_a53_err835769,
11771 callee_opts->x_aarch64_fix_a53_err835769,
11772 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11773 return false;
11775 if (!aarch64_tribools_ok_for_inlining_p (
11776 caller_opts->x_aarch64_fix_a53_err843419,
11777 callee_opts->x_aarch64_fix_a53_err843419,
11778 2, TARGET_FIX_ERR_A53_843419))
11779 return false;
11781 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11782 caller and callee and they don't match up, reject inlining. */
11783 if (!aarch64_tribools_ok_for_inlining_p (
11784 caller_opts->x_flag_omit_leaf_frame_pointer,
11785 callee_opts->x_flag_omit_leaf_frame_pointer,
11786 2, 1))
11787 return false;
11789 /* If the callee has specific tuning overrides, respect them. */
11790 if (callee_opts->x_aarch64_override_tune_string != NULL
11791 && caller_opts->x_aarch64_override_tune_string == NULL)
11792 return false;
11794 /* If the user specified tuning override strings for the
11795 caller and callee and they don't match up, reject inlining.
11796 We just do a string compare here, we don't analyze the meaning
11797 of the string, as it would be too costly for little gain. */
11798 if (callee_opts->x_aarch64_override_tune_string
11799 && caller_opts->x_aarch64_override_tune_string
11800 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11801 caller_opts->x_aarch64_override_tune_string) != 0))
11802 return false;
11804 return true;
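/* Illustrative sketch (not part of GCC): how the checks above interact.
   The function names and attribute strings are made up for the example.

     __attribute__ ((target ("+crypto"), always_inline))
     static inline int callee (int x) { return x + 1; }

     int caller (int x) { return callee (x); }   // caller lacks +crypto

   The callee's ISA flags are not a subset of the caller's, so inlining
   is rejected even though the callee is always_inline; always_inline
   only overrides the later, non-architectural checks (code model, TLS
   dialect, errata workarounds, -momit-leaf-frame-pointer and tuning
   strings).  */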
11807 /* Return true if SYMBOL_REF X binds locally. */
11809 static bool
11810 aarch64_symbol_binds_local_p (const_rtx x)
11812 return (SYMBOL_REF_DECL (x)
11813 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11814 : SYMBOL_REF_LOCAL_P (x));
11817 /* Return true if SYMBOL_REF X is thread local */
11818 static bool
11819 aarch64_tls_symbol_p (rtx x)
11821 if (! TARGET_HAVE_TLS)
11822 return false;
11824 if (GET_CODE (x) != SYMBOL_REF)
11825 return false;
11827 return SYMBOL_REF_TLS_MODEL (x) != 0;
11830 /* Classify a TLS symbol into one of the TLS kinds. */
11831 enum aarch64_symbol_type
11832 aarch64_classify_tls_symbol (rtx x)
11834 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11836 switch (tls_kind)
11838 case TLS_MODEL_GLOBAL_DYNAMIC:
11839 case TLS_MODEL_LOCAL_DYNAMIC:
11840 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11842 case TLS_MODEL_INITIAL_EXEC:
11843 switch (aarch64_cmodel)
11845 case AARCH64_CMODEL_TINY:
11846 case AARCH64_CMODEL_TINY_PIC:
11847 return SYMBOL_TINY_TLSIE;
11848 default:
11849 return SYMBOL_SMALL_TLSIE;
11852 case TLS_MODEL_LOCAL_EXEC:
11853 if (aarch64_tls_size == 12)
11854 return SYMBOL_TLSLE12;
11855 else if (aarch64_tls_size == 24)
11856 return SYMBOL_TLSLE24;
11857 else if (aarch64_tls_size == 32)
11858 return SYMBOL_TLSLE32;
11859 else if (aarch64_tls_size == 48)
11860 return SYMBOL_TLSLE48;
11861 else
11862 gcc_unreachable ();
11864 case TLS_MODEL_EMULATED:
11865 case TLS_MODEL_NONE:
11866 return SYMBOL_FORCE_TO_MEM;
11868 default:
11869 gcc_unreachable ();
11873 /* Return the correct method for accessing X + OFFSET, where X is either
11874 a SYMBOL_REF or LABEL_REF. */
11876 enum aarch64_symbol_type
11877 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11879 if (GET_CODE (x) == LABEL_REF)
11881 switch (aarch64_cmodel)
11883 case AARCH64_CMODEL_LARGE:
11884 return SYMBOL_FORCE_TO_MEM;
11886 case AARCH64_CMODEL_TINY_PIC:
11887 case AARCH64_CMODEL_TINY:
11888 return SYMBOL_TINY_ABSOLUTE;
11890 case AARCH64_CMODEL_SMALL_SPIC:
11891 case AARCH64_CMODEL_SMALL_PIC:
11892 case AARCH64_CMODEL_SMALL:
11893 return SYMBOL_SMALL_ABSOLUTE;
11895 default:
11896 gcc_unreachable ();
11900 if (GET_CODE (x) == SYMBOL_REF)
11902 if (aarch64_tls_symbol_p (x))
11903 return aarch64_classify_tls_symbol (x);
11905 switch (aarch64_cmodel)
11907 case AARCH64_CMODEL_TINY:
11908 /* When we retrieve symbol + offset address, we have to make sure
11909 the offset does not cause overflow of the final address. But
11910 we have no way of knowing the address of symbol at compile time
11911 so we can't accurately say if the distance between the PC and
11912 symbol + offset is outside the addressable range of +/-1M in the
11913 TINY code model. So we rely on images not being greater than
11914 1M, cap the offset at 1M, and require anything beyond 1M to
11915 be loaded using an alternative mechanism. Furthermore, if the
11916 symbol is a weak reference to something that isn't known to
11917 resolve to a symbol in this module, then force to memory. */
11918 if ((SYMBOL_REF_WEAK (x)
11919 && !aarch64_symbol_binds_local_p (x))
11920 || !IN_RANGE (offset, -1048575, 1048575))
11921 return SYMBOL_FORCE_TO_MEM;
11922 return SYMBOL_TINY_ABSOLUTE;
11924 case AARCH64_CMODEL_SMALL:
11925 /* Same reasoning as the tiny code model, but the offset cap here is
11926 4G. */
11927 if ((SYMBOL_REF_WEAK (x)
11928 && !aarch64_symbol_binds_local_p (x))
11929 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11930 HOST_WIDE_INT_C (4294967264)))
11931 return SYMBOL_FORCE_TO_MEM;
11932 return SYMBOL_SMALL_ABSOLUTE;
11934 case AARCH64_CMODEL_TINY_PIC:
11935 if (!aarch64_symbol_binds_local_p (x))
11936 return SYMBOL_TINY_GOT;
11937 return SYMBOL_TINY_ABSOLUTE;
11939 case AARCH64_CMODEL_SMALL_SPIC:
11940 case AARCH64_CMODEL_SMALL_PIC:
11941 if (!aarch64_symbol_binds_local_p (x))
11942 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11943 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11944 return SYMBOL_SMALL_ABSOLUTE;
11946 case AARCH64_CMODEL_LARGE:
11947 /* This is alright even in PIC code as the constant
11948 pool reference is always PC relative and within
11949 the same translation unit. */
11950 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11951 return SYMBOL_SMALL_ABSOLUTE;
11952 else
11953 return SYMBOL_FORCE_TO_MEM;
11955 default:
11956 gcc_unreachable ();
11960 /* By default push everything into the constant pool. */
11961 return SYMBOL_FORCE_TO_MEM;
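/* Illustrative sketch (not part of GCC) of the offset capping above.
   The array and the offset are made up for the example.

     extern char image[];
     char *p = &image[3 << 20];      // symbol + 3MiB offset

   Under -mcmodel=tiny the 3MiB offset falls outside the +/-1M range
   checked above, so the address is classified as SYMBOL_FORCE_TO_MEM
   and materialised through memory (a literal-pool style load) instead
   of a single pc-relative ADR.  Weak symbols that may resolve outside
   the module are handled the same way.  */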
11964 bool
11965 aarch64_constant_address_p (rtx x)
11967 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11970 bool
11971 aarch64_legitimate_pic_operand_p (rtx x)
11973 if (GET_CODE (x) == SYMBOL_REF
11974 || (GET_CODE (x) == CONST
11975 && GET_CODE (XEXP (x, 0)) == PLUS
11976 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11977 return false;
11979 return true;
11982 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11983 that should be rematerialized rather than spilled. */
11985 static bool
11986 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11988 /* Support CSE and rematerialization of common constants. */
11989 if (CONST_INT_P (x)
11990 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11991 || GET_CODE (x) == CONST_VECTOR)
11992 return true;
11994 /* Do not allow vector struct mode constants for Advanced SIMD.
11995 We could support 0 and -1 easily, but they need support in
11996 aarch64-simd.md. */
11997 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11998 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11999 return false;
12001 /* Only accept variable-length vector constants if they can be
12002 handled directly.
12004 ??? It would be possible to handle rematerialization of other
12005 constants via secondary reloads. */
12006 if (vec_flags & VEC_ANY_SVE)
12007 return aarch64_simd_valid_immediate (x, NULL);
12009 if (GET_CODE (x) == HIGH)
12010 x = XEXP (x, 0);
12012 /* Accept polynomial constants that can be calculated by using the
12013 destination of a move as the sole temporary. Constants that
12014 require a second temporary cannot be rematerialized (they can't be
12015 forced to memory and also aren't legitimate constants). */
12016 poly_int64 offset;
12017 if (poly_int_rtx_p (x, &offset))
12018 return aarch64_offset_temporaries (false, offset) <= 1;
12020 /* If an offset is being added to something else, we need to allow the
12021 base to be moved into the destination register, meaning that there
12022 are no free temporaries for the offset. */
12023 x = strip_offset (x, &offset);
12024 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12025 return false;
12027 /* Do not allow const (plus (anchor_symbol, const_int)). */
12028 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12029 return false;
12031 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12032 so spilling them is better than rematerialization. */
12033 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12034 return true;
12036 /* Label references are always constant. */
12037 if (GET_CODE (x) == LABEL_REF)
12038 return true;
12040 return false;
12044 aarch64_load_tp (rtx target)
12046 if (!target
12047 || GET_MODE (target) != Pmode
12048 || !register_operand (target, Pmode))
12049 target = gen_reg_rtx (Pmode);
12051 /* Can return in any reg. */
12052 emit_insn (gen_aarch64_load_tp_hard (target));
12053 return target;
12056 /* On AAPCS systems, this is the "struct __va_list". */
12057 static GTY(()) tree va_list_type;
12059 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12060 Return the type to use as __builtin_va_list.
12062 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12064 struct __va_list
12066 void *__stack;
12067 void *__gr_top;
12068 void *__vr_top;
12069 int __gr_offs;
12070 int __vr_offs;
12071 }; */
12073 static tree
12074 aarch64_build_builtin_va_list (void)
12076 tree va_list_name;
12077 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12079 /* Create the type. */
12080 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12081 /* Give it the required name. */
12082 va_list_name = build_decl (BUILTINS_LOCATION,
12083 TYPE_DECL,
12084 get_identifier ("__va_list"),
12085 va_list_type);
12086 DECL_ARTIFICIAL (va_list_name) = 1;
12087 TYPE_NAME (va_list_type) = va_list_name;
12088 TYPE_STUB_DECL (va_list_type) = va_list_name;
12090 /* Create the fields. */
12091 f_stack = build_decl (BUILTINS_LOCATION,
12092 FIELD_DECL, get_identifier ("__stack"),
12093 ptr_type_node);
12094 f_grtop = build_decl (BUILTINS_LOCATION,
12095 FIELD_DECL, get_identifier ("__gr_top"),
12096 ptr_type_node);
12097 f_vrtop = build_decl (BUILTINS_LOCATION,
12098 FIELD_DECL, get_identifier ("__vr_top"),
12099 ptr_type_node);
12100 f_groff = build_decl (BUILTINS_LOCATION,
12101 FIELD_DECL, get_identifier ("__gr_offs"),
12102 integer_type_node);
12103 f_vroff = build_decl (BUILTINS_LOCATION,
12104 FIELD_DECL, get_identifier ("__vr_offs"),
12105 integer_type_node);
12107 /* Tell tree-stdarg pass about our internal offset fields.
12108 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12109 purposes, to identify whether the code is updating va_list internal
12110 offset fields in an irregular way. */
12111 va_list_gpr_counter_field = f_groff;
12112 va_list_fpr_counter_field = f_vroff;
12114 DECL_ARTIFICIAL (f_stack) = 1;
12115 DECL_ARTIFICIAL (f_grtop) = 1;
12116 DECL_ARTIFICIAL (f_vrtop) = 1;
12117 DECL_ARTIFICIAL (f_groff) = 1;
12118 DECL_ARTIFICIAL (f_vroff) = 1;
12120 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12121 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12122 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12123 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12124 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12126 TYPE_FIELDS (va_list_type) = f_stack;
12127 DECL_CHAIN (f_stack) = f_grtop;
12128 DECL_CHAIN (f_grtop) = f_vrtop;
12129 DECL_CHAIN (f_vrtop) = f_groff;
12130 DECL_CHAIN (f_groff) = f_vroff;
12132 /* Compute its layout. */
12133 layout_type (va_list_type);
12135 return va_list_type;
12138 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12139 static void
12140 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12142 const CUMULATIVE_ARGS *cum;
12143 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12144 tree stack, grtop, vrtop, groff, vroff;
12145 tree t;
12146 int gr_save_area_size = cfun->va_list_gpr_size;
12147 int vr_save_area_size = cfun->va_list_fpr_size;
12148 int vr_offset;
12150 cum = &crtl->args.info;
12151 if (cfun->va_list_gpr_size)
12152 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12153 cfun->va_list_gpr_size);
12154 if (cfun->va_list_fpr_size)
12155 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12156 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12158 if (!TARGET_FLOAT)
12160 gcc_assert (cum->aapcs_nvrn == 0);
12161 vr_save_area_size = 0;
12164 f_stack = TYPE_FIELDS (va_list_type_node);
12165 f_grtop = DECL_CHAIN (f_stack);
12166 f_vrtop = DECL_CHAIN (f_grtop);
12167 f_groff = DECL_CHAIN (f_vrtop);
12168 f_vroff = DECL_CHAIN (f_groff);
12170 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12171 NULL_TREE);
12172 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12173 NULL_TREE);
12174 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12175 NULL_TREE);
12176 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12177 NULL_TREE);
12178 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12179 NULL_TREE);
12181 /* Emit code to initialize STACK, which points to the next varargs stack
12182 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12183 by named arguments. STACK is 8-byte aligned. */
12184 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12185 if (cum->aapcs_stack_size > 0)
12186 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12187 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12188 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12190 /* Emit code to initialize GRTOP, the top of the GR save area.
12191 virtual_incoming_args_rtx should have been 16 byte aligned. */
12192 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12193 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12194 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12196 /* Emit code to initialize VRTOP, the top of the VR save area.
12197 This address is gr_save_area_bytes below GRTOP, rounded
12198 down to the next 16-byte boundary. */
12199 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12200 vr_offset = ROUND_UP (gr_save_area_size,
12201 STACK_BOUNDARY / BITS_PER_UNIT);
12203 if (vr_offset)
12204 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12205 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12206 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12208 /* Emit code to initialize GROFF, the offset from GRTOP of the
12209 next GPR argument. */
12210 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12211 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12212 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12214 /* Likewise emit code to initialize VROFF, the offset from FTOP
12215 of the next VR argument. */
12216 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12217 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12218 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
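/* Illustrative sketch (not part of GCC): what va_start leaves in the
   AAPCS64 __va_list for a simple variadic callee.  The numbers assume
   two named GP arguments, no named FP/SIMD arguments, and that the
   tree-stdarg pass has not shrunk the save areas.

     #include <stdarg.h>

     int
     first_vararg (int a, int b, ...)
     {
       va_list ap;
       va_start (ap, b);
       // ap.__stack   -> first stack-passed variadic argument
       // ap.__gr_top  -> just above the GP save area (x2..x7 saved)
       // ap.__vr_top  -> just above the FP/SIMD save area (q0..q7 saved)
       // ap.__gr_offs == -(6 * 8)  == -48
       // ap.__vr_offs == -(8 * 16) == -128
       int r = va_arg (ap, int);
       va_end (ap);
       return r;
     }
*/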
12221 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12223 static tree
12224 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12225 gimple_seq *post_p ATTRIBUTE_UNUSED)
12227 tree addr;
12228 bool indirect_p;
12229 bool is_ha; /* is HFA or HVA. */
12230 bool dw_align; /* double-word align. */
12231 machine_mode ag_mode = VOIDmode;
12232 int nregs;
12233 machine_mode mode;
12235 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12236 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12237 HOST_WIDE_INT size, rsize, adjust, align;
12238 tree t, u, cond1, cond2;
12240 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12241 if (indirect_p)
12242 type = build_pointer_type (type);
12244 mode = TYPE_MODE (type);
12246 f_stack = TYPE_FIELDS (va_list_type_node);
12247 f_grtop = DECL_CHAIN (f_stack);
12248 f_vrtop = DECL_CHAIN (f_grtop);
12249 f_groff = DECL_CHAIN (f_vrtop);
12250 f_vroff = DECL_CHAIN (f_groff);
12252 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12253 f_stack, NULL_TREE);
12254 size = int_size_in_bytes (type);
12255 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12257 dw_align = false;
12258 adjust = 0;
12259 if (aarch64_vfp_is_call_or_return_candidate (mode,
12260 type,
12261 &ag_mode,
12262 &nregs,
12263 &is_ha))
12265 /* No frontends can create types with variable-sized modes, so we
12266 shouldn't be asked to pass or return them. */
12267 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12269 /* TYPE passed in fp/simd registers. */
12270 if (!TARGET_FLOAT)
12271 aarch64_err_no_fpadvsimd (mode);
12273 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12274 unshare_expr (valist), f_vrtop, NULL_TREE);
12275 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12276 unshare_expr (valist), f_vroff, NULL_TREE);
12278 rsize = nregs * UNITS_PER_VREG;
12280 if (is_ha)
12282 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12283 adjust = UNITS_PER_VREG - ag_size;
12285 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12286 && size < UNITS_PER_VREG)
12288 adjust = UNITS_PER_VREG - size;
12291 else
12293 /* TYPE passed in general registers. */
12294 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12295 unshare_expr (valist), f_grtop, NULL_TREE);
12296 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12297 unshare_expr (valist), f_groff, NULL_TREE);
12298 rsize = ROUND_UP (size, UNITS_PER_WORD);
12299 nregs = rsize / UNITS_PER_WORD;
12301 if (align > 8)
12302 dw_align = true;
12304 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12305 && size < UNITS_PER_WORD)
12307 adjust = UNITS_PER_WORD - size;
12311 /* Get a local temporary for the field value. */
12312 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12314 /* Emit code to branch if off >= 0. */
12315 t = build2 (GE_EXPR, boolean_type_node, off,
12316 build_int_cst (TREE_TYPE (off), 0));
12317 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12319 if (dw_align)
12321 /* Emit: offs = (offs + 15) & -16. */
12322 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12323 build_int_cst (TREE_TYPE (off), 15));
12324 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12325 build_int_cst (TREE_TYPE (off), -16));
12326 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12328 else
12329 roundup = NULL;
12331 /* Update ap.__[g|v]r_offs */
12332 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12333 build_int_cst (TREE_TYPE (off), rsize));
12334 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12336 /* String up. */
12337 if (roundup)
12338 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12340 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12341 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12342 build_int_cst (TREE_TYPE (f_off), 0));
12343 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12345 /* String up: make sure the assignment happens before the use. */
12346 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12347 COND_EXPR_ELSE (cond1) = t;
12349 /* Prepare the trees handling the argument that is passed on the stack;
12350 the top-level node will be stored in ON_STACK. */
12351 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12352 if (align > 8)
12354 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12355 t = fold_build_pointer_plus_hwi (arg, 15);
12356 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12357 build_int_cst (TREE_TYPE (t), -16));
12358 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12360 else
12361 roundup = NULL;
12362 /* Advance ap.__stack */
12363 t = fold_build_pointer_plus_hwi (arg, size + 7);
12364 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12365 build_int_cst (TREE_TYPE (t), -8));
12366 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12367 /* String up roundup and advance. */
12368 if (roundup)
12369 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12370 /* String up with arg */
12371 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12372 /* Big-endianness related address adjustment. */
12373 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12374 && size < UNITS_PER_WORD)
12376 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12377 size_int (UNITS_PER_WORD - size));
12378 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12381 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12382 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12384 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12385 t = off;
12386 if (adjust)
12387 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12388 build_int_cst (TREE_TYPE (off), adjust));
12390 t = fold_convert (sizetype, t);
12391 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12393 if (is_ha)
12395 /* type ha; // treat as "struct {ftype field[n];}"
12396 ... [computing offs]
12397 for (i = 0; i < nregs; ++i, offs += 16)
12398 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12399 return ha; */
12400 int i;
12401 tree tmp_ha, field_t, field_ptr_t;
12403 /* Declare a local variable. */
12404 tmp_ha = create_tmp_var_raw (type, "ha");
12405 gimple_add_tmp_var (tmp_ha);
12407 /* Establish the base type. */
12408 switch (ag_mode)
12410 case E_SFmode:
12411 field_t = float_type_node;
12412 field_ptr_t = float_ptr_type_node;
12413 break;
12414 case E_DFmode:
12415 field_t = double_type_node;
12416 field_ptr_t = double_ptr_type_node;
12417 break;
12418 case E_TFmode:
12419 field_t = long_double_type_node;
12420 field_ptr_t = long_double_ptr_type_node;
12421 break;
12422 case E_HFmode:
12423 field_t = aarch64_fp16_type_node;
12424 field_ptr_t = aarch64_fp16_ptr_type_node;
12425 break;
12426 case E_V2SImode:
12427 case E_V4SImode:
12429 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12430 field_t = build_vector_type_for_mode (innertype, ag_mode);
12431 field_ptr_t = build_pointer_type (field_t);
12433 break;
12434 default:
12435 gcc_assert (0);
12438 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12439 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12440 addr = t;
12441 t = fold_convert (field_ptr_t, addr);
12442 t = build2 (MODIFY_EXPR, field_t,
12443 build1 (INDIRECT_REF, field_t, tmp_ha),
12444 build1 (INDIRECT_REF, field_t, t));
12446 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12447 for (i = 1; i < nregs; ++i)
12449 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12450 u = fold_convert (field_ptr_t, addr);
12451 u = build2 (MODIFY_EXPR, field_t,
12452 build2 (MEM_REF, field_t, tmp_ha,
12453 build_int_cst (field_ptr_t,
12454 (i *
12455 int_size_in_bytes (field_t)))),
12456 build1 (INDIRECT_REF, field_t, u));
12457 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12460 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12461 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12464 COND_EXPR_ELSE (cond2) = t;
12465 addr = fold_convert (build_pointer_type (type), cond1);
12466 addr = build_va_arg_indirect_ref (addr);
12468 if (indirect_p)
12469 addr = build_va_arg_indirect_ref (addr);
12471 return addr;
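/* Illustrative sketch (not part of GCC): the rough shape of the code
   the gimplification above produces for a plain integer va_arg.  The
   C below is only a paraphrase of the trees built above.

     int off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                        // register area already used up
     ap.__gr_offs = off + 8;                 // rsize for an int
     if (ap.__gr_offs > 0)
       goto on_stack;                        // argument straddles the end
     result = *(int *) ((char *) ap.__gr_top + off);
     goto done;

   on_stack:
     arg = ap.__stack;
     ap.__stack = (void *) (((uintptr_t) arg + size + 7) & -8);
     result = *(int *) arg;
   done: ;

   The HFA/HVA path instead copies NREGS elements from the VR save
   area into a local aggregate, as sketched in the comment above.  */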
12474 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12476 static void
12477 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12478 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12479 int no_rtl)
12481 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12482 CUMULATIVE_ARGS local_cum;
12483 int gr_saved = cfun->va_list_gpr_size;
12484 int vr_saved = cfun->va_list_fpr_size;
12486 /* The caller has advanced CUM up to, but not beyond, the last named
12487 argument. Advance a local copy of CUM past the last "real" named
12488 argument, to find out how many registers are left over. */
12489 local_cum = *cum;
12490 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12492 /* Find out how many registers we need to save.
12493 Honor the tree-stdarg analysis results. */
12494 if (cfun->va_list_gpr_size)
12495 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12496 cfun->va_list_gpr_size / UNITS_PER_WORD);
12497 if (cfun->va_list_fpr_size)
12498 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12499 cfun->va_list_fpr_size / UNITS_PER_VREG);
12501 if (!TARGET_FLOAT)
12503 gcc_assert (local_cum.aapcs_nvrn == 0);
12504 vr_saved = 0;
12507 if (!no_rtl)
12509 if (gr_saved > 0)
12511 rtx ptr, mem;
12513 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12514 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12515 - gr_saved * UNITS_PER_WORD);
12516 mem = gen_frame_mem (BLKmode, ptr);
12517 set_mem_alias_set (mem, get_varargs_alias_set ());
12519 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12520 mem, gr_saved);
12522 if (vr_saved > 0)
12524 /* We can't use move_block_from_reg, because it will use
12525 the wrong mode, storing D regs only. */
12526 machine_mode mode = TImode;
12527 int off, i, vr_start;
12529 /* Set OFF to the offset from virtual_incoming_args_rtx of
12530 the first vector register. The VR save area lies below
12531 the GR one, and is aligned to 16 bytes. */
12532 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12533 STACK_BOUNDARY / BITS_PER_UNIT);
12534 off -= vr_saved * UNITS_PER_VREG;
12536 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12537 for (i = 0; i < vr_saved; ++i)
12539 rtx ptr, mem;
12541 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12542 mem = gen_frame_mem (mode, ptr);
12543 set_mem_alias_set (mem, get_varargs_alias_set ());
12544 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12545 off += UNITS_PER_VREG;
12550 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12551 any complication of having crtl->args.pretend_args_size changed. */
12552 cfun->machine->frame.saved_varargs_size
12553 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12554 STACK_BOUNDARY / BITS_PER_UNIT)
12555 + vr_saved * UNITS_PER_VREG);
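/* Illustrative worked example (not part of GCC) of the
   saved_varargs_size computation above, assuming gr_saved == 3,
   vr_saved == 2 and UNITS_PER_VREG == 16:

     ROUND_UP (3 * 8, 16) + 2 * 16  ==  32 + 32  ==  64 bytes

   i.e. the GP save area is padded up to the 16-byte stack boundary and
   the FP/SIMD save area sits immediately below it.  */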
12558 static void
12559 aarch64_conditional_register_usage (void)
12561 int i;
12562 if (!TARGET_FLOAT)
12564 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12566 fixed_regs[i] = 1;
12567 call_used_regs[i] = 1;
12570 if (!TARGET_SVE)
12571 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12573 fixed_regs[i] = 1;
12574 call_used_regs[i] = 1;
12578 /* Walk down the type tree of TYPE counting consecutive base elements.
12579 If *MODEP is VOIDmode, then set it to the first valid floating point
12580 type. If a non-floating point type is found, or if a floating point
12581 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12582 otherwise return the count in the sub-tree. */
12583 static int
12584 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12586 machine_mode mode;
12587 HOST_WIDE_INT size;
12589 switch (TREE_CODE (type))
12591 case REAL_TYPE:
12592 mode = TYPE_MODE (type);
12593 if (mode != DFmode && mode != SFmode
12594 && mode != TFmode && mode != HFmode)
12595 return -1;
12597 if (*modep == VOIDmode)
12598 *modep = mode;
12600 if (*modep == mode)
12601 return 1;
12603 break;
12605 case COMPLEX_TYPE:
12606 mode = TYPE_MODE (TREE_TYPE (type));
12607 if (mode != DFmode && mode != SFmode
12608 && mode != TFmode && mode != HFmode)
12609 return -1;
12611 if (*modep == VOIDmode)
12612 *modep = mode;
12614 if (*modep == mode)
12615 return 2;
12617 break;
12619 case VECTOR_TYPE:
12620 /* Use V2SImode and V4SImode as representatives of all 64-bit
12621 and 128-bit vector types. */
12622 size = int_size_in_bytes (type);
12623 switch (size)
12625 case 8:
12626 mode = V2SImode;
12627 break;
12628 case 16:
12629 mode = V4SImode;
12630 break;
12631 default:
12632 return -1;
12635 if (*modep == VOIDmode)
12636 *modep = mode;
12638 /* Vector modes are considered to be opaque: two vectors are
12639 equivalent for the purposes of being homogeneous aggregates
12640 if they are the same size. */
12641 if (*modep == mode)
12642 return 1;
12644 break;
12646 case ARRAY_TYPE:
12648 int count;
12649 tree index = TYPE_DOMAIN (type);
12651 /* Can't handle incomplete types or sizes that are not
12652 fixed. */
12653 if (!COMPLETE_TYPE_P (type)
12654 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12655 return -1;
12657 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12658 if (count == -1
12659 || !index
12660 || !TYPE_MAX_VALUE (index)
12661 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12662 || !TYPE_MIN_VALUE (index)
12663 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12664 || count < 0)
12665 return -1;
12667 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12668 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12670 /* There must be no padding. */
12671 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12672 count * GET_MODE_BITSIZE (*modep)))
12673 return -1;
12675 return count;
12678 case RECORD_TYPE:
12680 int count = 0;
12681 int sub_count;
12682 tree field;
12684 /* Can't handle incomplete types or sizes that are not
12685 fixed. */
12686 if (!COMPLETE_TYPE_P (type)
12687 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12688 return -1;
12690 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12692 if (TREE_CODE (field) != FIELD_DECL)
12693 continue;
12695 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12696 if (sub_count < 0)
12697 return -1;
12698 count += sub_count;
12701 /* There must be no padding. */
12702 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12703 count * GET_MODE_BITSIZE (*modep)))
12704 return -1;
12706 return count;
12709 case UNION_TYPE:
12710 case QUAL_UNION_TYPE:
12712 /* These aren't very interesting except in a degenerate case. */
12713 int count = 0;
12714 int sub_count;
12715 tree field;
12717 /* Can't handle incomplete types or sizes that are not
12718 fixed. */
12719 if (!COMPLETE_TYPE_P (type)
12720 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12721 return -1;
12723 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12725 if (TREE_CODE (field) != FIELD_DECL)
12726 continue;
12728 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12729 if (sub_count < 0)
12730 return -1;
12731 count = count > sub_count ? count : sub_count;
12734 /* There must be no padding. */
12735 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12736 count * GET_MODE_BITSIZE (*modep)))
12737 return -1;
12739 return count;
12742 default:
12743 break;
12746 return -1;
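/* Illustrative sketch (not part of GCC) of what the walk above reports
   for a few user-level types; the struct names are invented and
   int32x4_t is assumed to come from arm_neon.h.

     struct hfa3   { float a, b, c; };           // 3, *modep == SFmode
     struct hva2   { int32x4_t a, b; };          // 2, *modep == V4SImode
     struct mixed  { float a; double b; };       // -1: element modes differ
     struct padded { float a, b[2]; char pad; }; // -1: trailing padding

   Only counts between 1 and HA_MAX_NUM_FLDS are then accepted by
   aarch64_vfp_is_call_or_return_candidate below.  */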
12749 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12750 type as described in AAPCS64 \S 4.1.2.
12752 See the comment above aarch64_composite_type_p for the notes on MODE. */
12754 static bool
12755 aarch64_short_vector_p (const_tree type,
12756 machine_mode mode)
12758 poly_int64 size = -1;
12760 if (type && TREE_CODE (type) == VECTOR_TYPE)
12761 size = int_size_in_bytes (type);
12762 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12763 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12764 size = GET_MODE_SIZE (mode);
12766 return known_eq (size, 8) || known_eq (size, 16);
12769 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12770 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12771 array types. The C99 floating-point complex types are also considered
12772 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12773 types, which are GCC extensions and out of the scope of AAPCS64, are
12774 treated as composite types here as well.
12776 Note that MODE itself is not sufficient in determining whether a type
12777 is such a composite type or not. This is because
12778 stor-layout.c:compute_record_mode may have already changed the MODE
12779 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12780 structure with only one field may have its MODE set to the mode of the
12781 field. Also an integer mode whose size matches the size of the
12782 RECORD_TYPE type may be used to substitute the original mode
12783 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12784 solely relied on. */
12786 static bool
12787 aarch64_composite_type_p (const_tree type,
12788 machine_mode mode)
12790 if (aarch64_short_vector_p (type, mode))
12791 return false;
12793 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12794 return true;
12796 if (mode == BLKmode
12797 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12798 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12799 return true;
12801 return false;
12804 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12805 shall be passed or returned in simd/fp register(s) (providing these
12806 parameter passing registers are available).
12808 Upon successful return, *COUNT returns the number of needed registers,
12809 *BASE_MODE returns the mode of the individual register and when IS_HA
12810 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12811 floating-point aggregate or a homogeneous short-vector aggregate. */
12813 static bool
12814 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12815 const_tree type,
12816 machine_mode *base_mode,
12817 int *count,
12818 bool *is_ha)
12820 machine_mode new_mode = VOIDmode;
12821 bool composite_p = aarch64_composite_type_p (type, mode);
12823 if (is_ha != NULL) *is_ha = false;
12825 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12826 || aarch64_short_vector_p (type, mode))
12828 *count = 1;
12829 new_mode = mode;
12831 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12833 if (is_ha != NULL) *is_ha = true;
12834 *count = 2;
12835 new_mode = GET_MODE_INNER (mode);
12837 else if (type && composite_p)
12839 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12841 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12843 if (is_ha != NULL) *is_ha = true;
12844 *count = ag_count;
12846 else
12847 return false;
12849 else
12850 return false;
12852 *base_mode = new_mode;
12853 return true;
12856 /* Implement TARGET_STRUCT_VALUE_RTX. */
12858 static rtx
12859 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12860 int incoming ATTRIBUTE_UNUSED)
12862 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12865 /* Implements target hook vector_mode_supported_p. */
12866 static bool
12867 aarch64_vector_mode_supported_p (machine_mode mode)
12869 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12870 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12873 /* Return appropriate SIMD container
12874 for MODE within a vector of WIDTH bits. */
12875 static machine_mode
12876 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12878 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12879 switch (mode)
12881 case E_DFmode:
12882 return VNx2DFmode;
12883 case E_SFmode:
12884 return VNx4SFmode;
12885 case E_HFmode:
12886 return VNx8HFmode;
12887 case E_DImode:
12888 return VNx2DImode;
12889 case E_SImode:
12890 return VNx4SImode;
12891 case E_HImode:
12892 return VNx8HImode;
12893 case E_QImode:
12894 return VNx16QImode;
12895 default:
12896 return word_mode;
12899 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12900 if (TARGET_SIMD)
12902 if (known_eq (width, 128))
12903 switch (mode)
12905 case E_DFmode:
12906 return V2DFmode;
12907 case E_SFmode:
12908 return V4SFmode;
12909 case E_HFmode:
12910 return V8HFmode;
12911 case E_SImode:
12912 return V4SImode;
12913 case E_HImode:
12914 return V8HImode;
12915 case E_QImode:
12916 return V16QImode;
12917 case E_DImode:
12918 return V2DImode;
12919 default:
12920 break;
12922 else
12923 switch (mode)
12925 case E_SFmode:
12926 return V2SFmode;
12927 case E_HFmode:
12928 return V4HFmode;
12929 case E_SImode:
12930 return V2SImode;
12931 case E_HImode:
12932 return V4HImode;
12933 case E_QImode:
12934 return V8QImode;
12935 default:
12936 break;
12939 return word_mode;
12942 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12943 static machine_mode
12944 aarch64_preferred_simd_mode (scalar_mode mode)
12946 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12947 return aarch64_simd_container_mode (mode, bits);
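/* Illustrative examples (not part of GCC) of the container selection
   above, assuming the corresponding ISA support is enabled:

     aarch64_simd_container_mode (SFmode, 128)                 -> V4SFmode
     aarch64_simd_container_mode (HImode, 64)                  -> V4HImode
     aarch64_simd_container_mode (SFmode, BITS_PER_SVE_VECTOR) -> VNx4SFmode

   so aarch64_preferred_simd_mode hands the vectorizer a
   variable-length SVE mode when SVE is enabled and a 128-bit Advanced
   SIMD mode otherwise.  */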
12950 /* Return a list of possible vector sizes for the vectorizer
12951 to iterate over. */
12952 static void
12953 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12955 if (TARGET_SVE)
12956 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12957 sizes->safe_push (16);
12958 sizes->safe_push (8);
12961 /* Implement TARGET_MANGLE_TYPE. */
12963 static const char *
12964 aarch64_mangle_type (const_tree type)
12966 /* The AArch64 ABI documents say that "__va_list" has to be
12967 mangled as if it were in the "std" namespace. */
12968 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12969 return "St9__va_list";
12971 /* Half-precision float. */
12972 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12973 return "Dh";
12975 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12976 builtin types. */
12977 if (TYPE_NAME (type) != NULL)
12978 return aarch64_mangle_builtin_type (type);
12980 /* Use the default mangling. */
12981 return NULL;
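/* Illustrative examples (not part of GCC) of the manglings chosen
   above, as they would appear in C++ symbol names:

     void f (__builtin_va_list);   // _Z1fSt9__va_list
     void g (__fp16);              // _Z1gDh

   i.e. va_list mangles as if it were std::__va_list and __fp16 uses
   the generic half-precision encoding "Dh".  */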
12984 /* Find the first rtx_insn before insn that will generate an assembly
12985 instruction. */
12987 static rtx_insn *
12988 aarch64_prev_real_insn (rtx_insn *insn)
12990 if (!insn)
12991 return NULL;
12995 insn = prev_real_insn (insn);
12997 while (insn && recog_memoized (insn) < 0);
12999 return insn;
13002 static bool
13003 is_madd_op (enum attr_type t1)
13005 unsigned int i;
13006 /* A number of these may be AArch32 only. */
13007 enum attr_type mlatypes[] = {
13008 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13009 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13010 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13013 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13015 if (t1 == mlatypes[i])
13016 return true;
13019 return false;
13022 /* Check if there is a register dependency between a load and the insn
13023 for which we hold recog_data. */
13025 static bool
13026 dep_between_memop_and_curr (rtx memop)
13028 rtx load_reg;
13029 int opno;
13031 gcc_assert (GET_CODE (memop) == SET);
13033 if (!REG_P (SET_DEST (memop)))
13034 return false;
13036 load_reg = SET_DEST (memop);
13037 for (opno = 1; opno < recog_data.n_operands; opno++)
13039 rtx operand = recog_data.operand[opno];
13040 if (REG_P (operand)
13041 && reg_overlap_mentioned_p (load_reg, operand))
13042 return true;
13045 return false;
13049 /* When working around the Cortex-A53 erratum 835769,
13050 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13051 instruction and has a preceding memory instruction such that a NOP
13052 should be inserted between them. */
13054 bool
13055 aarch64_madd_needs_nop (rtx_insn* insn)
13057 enum attr_type attr_type;
13058 rtx_insn *prev;
13059 rtx body;
13061 if (!TARGET_FIX_ERR_A53_835769)
13062 return false;
13064 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13065 return false;
13067 attr_type = get_attr_type (insn);
13068 if (!is_madd_op (attr_type))
13069 return false;
13071 prev = aarch64_prev_real_insn (insn);
13072 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13073 Restore recog state to INSN to avoid state corruption. */
13074 extract_constrain_insn_cached (insn);
13076 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13077 return false;
13079 body = single_set (prev);
13081 /* If the previous insn is a memory op and there is no dependency between
13082 it and the DImode madd, emit a NOP between them. If body is NULL then we
13083 have a complex memory operation, probably a load/store pair.
13084 Be conservative for now and emit a NOP. */
13085 if (GET_MODE (recog_data.operand[0]) == DImode
13086 && (!body || !dep_between_memop_and_curr (body)))
13087 return true;
13089 return false;
13094 /* Implement FINAL_PRESCAN_INSN. */
13096 void
13097 aarch64_final_prescan_insn (rtx_insn *insn)
13099 if (aarch64_madd_needs_nop (insn))
13100 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
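/* Illustrative sketch (not part of GCC) of the erratum 835769
   workaround above; the registers are arbitrary.  With
   -mfix-cortex-a53-835769, a sequence such as

       ldr   x3, [x2]
       madd  x0, x1, x1, x4

   is emitted with a separating

       nop   // between mem op and mult-accumulate

   because the 64-bit multiply-accumulate has no register dependency on
   the preceding load.  If the madd instead consumed x3, no NOP would
   be needed.  */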
13104 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13105 instruction. */
13107 bool
13108 aarch64_sve_index_immediate_p (rtx base_or_step)
13110 return (CONST_INT_P (base_or_step)
13111 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13114 /* Return true if X is a valid immediate for the SVE ADD and SUB
13115 instructions. Negate X first if NEGATE_P is true. */
13117 bool
13118 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13120 rtx elt;
13122 if (!const_vec_duplicate_p (x, &elt)
13123 || !CONST_INT_P (elt))
13124 return false;
13126 HOST_WIDE_INT val = INTVAL (elt);
13127 if (negate_p)
13128 val = -val;
13129 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13131 if (val & 0xff)
13132 return IN_RANGE (val, 0, 0xff);
13133 return IN_RANGE (val, 0, 0xff00);
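/* Illustrative worked examples (not part of GCC) of the range check
   above, for a duplicated 16-bit element:

     0x003f : low byte nonzero, 0x3f   <= 0xff   -> valid (ADD #63)
     0x1100 : low byte zero,    0x1100 <= 0xff00 -> valid (ADD #17, LSL #8)
     0x0101 : low byte nonzero, 0x101  >  0xff   -> rejected

   i.e. SVE ADD/SUB take an unsigned 8-bit immediate, optionally
   shifted left by 8.  */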
13136 /* Return true if X is a valid immediate operand for an SVE logical
13137 instruction such as AND. */
13139 bool
13140 aarch64_sve_bitmask_immediate_p (rtx x)
13142 rtx elt;
13144 return (const_vec_duplicate_p (x, &elt)
13145 && CONST_INT_P (elt)
13146 && aarch64_bitmask_imm (INTVAL (elt),
13147 GET_MODE_INNER (GET_MODE (x))));
13150 /* Return true if X is a valid immediate for the SVE DUP and CPY
13151 instructions. */
13153 bool
13154 aarch64_sve_dup_immediate_p (rtx x)
13156 rtx elt;
13158 if (!const_vec_duplicate_p (x, &elt)
13159 || !CONST_INT_P (elt))
13160 return false;
13162 HOST_WIDE_INT val = INTVAL (elt);
13163 if (val & 0xff)
13164 return IN_RANGE (val, -0x80, 0x7f);
13165 return IN_RANGE (val, -0x8000, 0x7f00);
13168 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13169 SIGNED_P says whether the operand is signed rather than unsigned. */
13171 bool
13172 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13174 rtx elt;
13176 return (const_vec_duplicate_p (x, &elt)
13177 && CONST_INT_P (elt)
13178 && (signed_p
13179 ? IN_RANGE (INTVAL (elt), -16, 15)
13180 : IN_RANGE (INTVAL (elt), 0, 127)));
13183 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13184 instruction. Negate X first if NEGATE_P is true. */
13186 bool
13187 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13189 rtx elt;
13190 REAL_VALUE_TYPE r;
13192 if (!const_vec_duplicate_p (x, &elt)
13193 || GET_CODE (elt) != CONST_DOUBLE)
13194 return false;
13196 r = *CONST_DOUBLE_REAL_VALUE (elt);
13198 if (negate_p)
13199 r = real_value_negate (&r);
13201 if (real_equal (&r, &dconst1))
13202 return true;
13203 if (real_equal (&r, &dconsthalf))
13204 return true;
13205 return false;
13208 /* Return true if X is a valid immediate operand for an SVE FMUL
13209 instruction. */
13211 bool
13212 aarch64_sve_float_mul_immediate_p (rtx x)
13214 rtx elt;
13216 /* GCC will never generate a multiply with an immediate of 2, so there is no
13217 point testing for it (even though it is a valid constant). */
13218 return (const_vec_duplicate_p (x, &elt)
13219 && GET_CODE (elt) == CONST_DOUBLE
13220 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13223 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13224 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13225 is nonnull, use it to describe valid immediates. */
13226 static bool
13227 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13228 simd_immediate_info *info,
13229 enum simd_immediate_check which,
13230 simd_immediate_info::insn_type insn)
13232 /* Try a 4-byte immediate with LSL. */
13233 for (unsigned int shift = 0; shift < 32; shift += 8)
13234 if ((val32 & (0xff << shift)) == val32)
13236 if (info)
13237 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13238 simd_immediate_info::LSL, shift);
13239 return true;
13242 /* Try a 2-byte immediate with LSL. */
13243 unsigned int imm16 = val32 & 0xffff;
13244 if (imm16 == (val32 >> 16))
13245 for (unsigned int shift = 0; shift < 16; shift += 8)
13246 if ((imm16 & (0xff << shift)) == imm16)
13248 if (info)
13249 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13250 simd_immediate_info::LSL, shift);
13251 return true;
13254 /* Try a 4-byte immediate with MSL, except for cases that MVN
13255 can handle. */
13256 if (which == AARCH64_CHECK_MOV)
13257 for (unsigned int shift = 8; shift < 24; shift += 8)
13259 unsigned int low = (1 << shift) - 1;
13260 if (((val32 & (0xff << shift)) | low) == val32)
13262 if (info)
13263 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13264 simd_immediate_info::MSL, shift);
13265 return true;
13269 return false;
13272 /* Return true if replicating VAL64 is a valid immediate for the
13273 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13274 use it to describe valid immediates. */
13275 static bool
13276 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13277 simd_immediate_info *info,
13278 enum simd_immediate_check which)
13280 unsigned int val32 = val64 & 0xffffffff;
13281 unsigned int val16 = val64 & 0xffff;
13282 unsigned int val8 = val64 & 0xff;
13284 if (val32 == (val64 >> 32))
13286 if ((which & AARCH64_CHECK_ORR) != 0
13287 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13288 simd_immediate_info::MOV))
13289 return true;
13291 if ((which & AARCH64_CHECK_BIC) != 0
13292 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13293 simd_immediate_info::MVN))
13294 return true;
13296 /* Try using a replicated byte. */
13297 if (which == AARCH64_CHECK_MOV
13298 && val16 == (val32 >> 16)
13299 && val8 == (val16 >> 8))
13301 if (info)
13302 *info = simd_immediate_info (QImode, val8);
13303 return true;
13307 /* Try using a bit-to-bytemask. */
13308 if (which == AARCH64_CHECK_MOV)
13310 unsigned int i;
13311 for (i = 0; i < 64; i += 8)
13313 unsigned char byte = (val64 >> i) & 0xff;
13314 if (byte != 0 && byte != 0xff)
13315 break;
13317 if (i == 64)
13319 if (info)
13320 *info = simd_immediate_info (DImode, val64);
13321 return true;
13324 return false;
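/* Illustrative worked examples (not part of GCC) of 64-bit replicated
   values the checks above accept, with a rough idea of the encoding:

     0xab00ab00ab00ab00  2-byte pattern   -> MOVI Vd.8h, #0xab, LSL #8
     0x0003ffff0003ffff  shifting ones    -> MOVI Vd.4s, #0x3, MSL #16
     0x4242424242424242  replicated byte  -> MOVI Vd.16b, #0x42
     0x00ff0000ff00ff00  bit-to-bytemask  -> 64-bit MOVI form

   A value such as 0x0102030405060708 matches none of the patterns and
   is not a valid Advanced SIMD move immediate.  */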
13327 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13328 instruction. If INFO is nonnull, use it to describe valid immediates. */
13330 static bool
13331 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13332 simd_immediate_info *info)
13334 scalar_int_mode mode = DImode;
13335 unsigned int val32 = val64 & 0xffffffff;
13336 if (val32 == (val64 >> 32))
13338 mode = SImode;
13339 unsigned int val16 = val32 & 0xffff;
13340 if (val16 == (val32 >> 16))
13342 mode = HImode;
13343 unsigned int val8 = val16 & 0xff;
13344 if (val8 == (val16 >> 8))
13345 mode = QImode;
13348 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13349 if (IN_RANGE (val, -0x80, 0x7f))
13351 /* DUP with no shift. */
13352 if (info)
13353 *info = simd_immediate_info (mode, val);
13354 return true;
13356 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13358 /* DUP with LSL #8. */
13359 if (info)
13360 *info = simd_immediate_info (mode, val);
13361 return true;
13363 if (aarch64_bitmask_imm (val64, mode))
13365 /* DUPM. */
13366 if (info)
13367 *info = simd_immediate_info (mode, val);
13368 return true;
13370 return false;
13373 /* Return true if OP is a valid SIMD immediate for the operation
13374 described by WHICH. If INFO is nonnull, use it to describe valid
13375 immediates. */
13376 bool
13377 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13378 enum simd_immediate_check which)
13380 machine_mode mode = GET_MODE (op);
13381 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13382 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13383 return false;
13385 scalar_mode elt_mode = GET_MODE_INNER (mode);
13386 rtx base, step;
13387 unsigned int n_elts;
13388 if (GET_CODE (op) == CONST_VECTOR
13389 && CONST_VECTOR_DUPLICATE_P (op))
13390 n_elts = CONST_VECTOR_NPATTERNS (op);
13391 else if ((vec_flags & VEC_SVE_DATA)
13392 && const_vec_series_p (op, &base, &step))
13394 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13395 if (!aarch64_sve_index_immediate_p (base)
13396 || !aarch64_sve_index_immediate_p (step))
13397 return false;
13399 if (info)
13400 *info = simd_immediate_info (elt_mode, base, step);
13401 return true;
13403 else if (GET_CODE (op) == CONST_VECTOR
13404 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13405 /* N_ELTS set above. */;
13406 else
13407 return false;
13409 /* Handle PFALSE and PTRUE. */
13410 if (vec_flags & VEC_SVE_PRED)
13411 return (op == CONST0_RTX (mode)
13412 || op == CONSTM1_RTX (mode));
13414 scalar_float_mode elt_float_mode;
13415 if (n_elts == 1
13416 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13418 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13419 if (aarch64_float_const_zero_rtx_p (elt)
13420 || aarch64_float_const_representable_p (elt))
13422 if (info)
13423 *info = simd_immediate_info (elt_float_mode, elt);
13424 return true;
13428 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13429 if (elt_size > 8)
13430 return false;
13432 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13434 /* Expand the vector constant out into a byte vector, with the least
13435 significant byte of the register first. */
13436 auto_vec<unsigned char, 16> bytes;
13437 bytes.reserve (n_elts * elt_size);
13438 for (unsigned int i = 0; i < n_elts; i++)
13440 /* The vector is provided in gcc endian-neutral fashion.
13441 For aarch64_be Advanced SIMD, it must be laid out in the vector
13442 register in reverse order. */
13443 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13444 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13446 if (elt_mode != elt_int_mode)
13447 elt = gen_lowpart (elt_int_mode, elt);
13449 if (!CONST_INT_P (elt))
13450 return false;
13452 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13453 for (unsigned int byte = 0; byte < elt_size; byte++)
13455 bytes.quick_push (elt_val & 0xff);
13456 elt_val >>= BITS_PER_UNIT;
13460 /* The immediate must repeat every eight bytes. */
13461 unsigned int nbytes = bytes.length ();
13462 for (unsigned i = 8; i < nbytes; ++i)
13463 if (bytes[i] != bytes[i - 8])
13464 return false;
13466 /* Get the repeating 8-byte value as an integer. No endian correction
13467 is needed here because bytes is already in lsb-first order. */
13468 unsigned HOST_WIDE_INT val64 = 0;
13469 for (unsigned int i = 0; i < 8; i++)
13470 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13471 << (i * BITS_PER_UNIT));
13473 if (vec_flags & VEC_SVE_DATA)
13474 return aarch64_sve_valid_immediate (val64, info);
13475 else
13476 return aarch64_advsimd_valid_immediate (val64, info, which);
13479 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13480 has a step in the range of INDEX. Return the index expression if so,
13481 otherwise return null. */
13483 aarch64_check_zero_based_sve_index_immediate (rtx x)
13485 rtx base, step;
13486 if (const_vec_series_p (x, &base, &step)
13487 && base == const0_rtx
13488 && aarch64_sve_index_immediate_p (step))
13489 return step;
13490 return NULL_RTX;
13493 /* Check if immediate shift constants are within range. */
13494 bool
13495 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13497 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13498 if (left)
13499 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13500 else
13501 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13504 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13505 operation of width WIDTH at bit position POS. */
13508 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13510 gcc_assert (CONST_INT_P (width));
13511 gcc_assert (CONST_INT_P (pos));
13513 unsigned HOST_WIDE_INT mask
13514 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13515 return GEN_INT (mask << UINTVAL (pos));
13518 bool
13519 aarch64_mov_operand_p (rtx x, machine_mode mode)
13521 if (GET_CODE (x) == HIGH
13522 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13523 return true;
13525 if (CONST_INT_P (x))
13526 return true;
13528 if (VECTOR_MODE_P (GET_MODE (x)))
13529 return aarch64_simd_valid_immediate (x, NULL);
13531 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13532 return true;
13534 if (aarch64_sve_cnt_immediate_p (x))
13535 return true;
13537 return aarch64_classify_symbolic_expression (x)
13538 == SYMBOL_TINY_ABSOLUTE;
13541 /* Return a const_int vector of VAL. */
13542 rtx
13543 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13545 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13546 return gen_const_vec_duplicate (mode, c);
13549 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13551 bool
13552 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13554 machine_mode vmode;
13556 vmode = aarch64_simd_container_mode (mode, 64);
13557 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13558 return aarch64_simd_valid_immediate (op_v, NULL);
13561 /* Construct and return a PARALLEL RTX vector with elements numbering the
13562 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13563 the vector - from the perspective of the architecture. This does not
13564 line up with GCC's perspective on lane numbers, so we end up with
13565 different masks depending on our target endian-ness. The diagram
13566 below may help. We must draw the distinction when building masks
13567 which select one half of the vector. An instruction selecting
13568 architectural low-lanes for a big-endian target, must be described using
13569 a mask selecting GCC high-lanes.
13571 Big-Endian Little-Endian
13573 GCC 0 1 2 3 3 2 1 0
13574 | x | x | x | x | | x | x | x | x |
13575 Architecture 3 2 1 0 3 2 1 0
13577 Low Mask: { 2, 3 } { 0, 1 }
13578 High Mask: { 0, 1 } { 2, 3 }
13580 MODE is the mode of the vector and NUNITS is the number of units in it.  */
13582 rtx
13583 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13585 rtvec v = rtvec_alloc (nunits / 2);
13586 int high_base = nunits / 2;
13587 int low_base = 0;
13588 int base;
13589 rtx t1;
13590 int i;
13592 if (BYTES_BIG_ENDIAN)
13593 base = high ? low_base : high_base;
13594 else
13595 base = high ? high_base : low_base;
13597 for (i = 0; i < nunits / 2; i++)
13598 RTVEC_ELT (v, i) = GEN_INT (base + i);
13600 t1 = gen_rtx_PARALLEL (mode, v);
13601 return t1;
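/* A concrete example of the above: for V4SImode (NUNITS == 4) and
   HIGH == true this returns (parallel [2 3]) on little-endian but
   (parallel [0 1]) on big-endian, matching the "High Mask" row of the
   diagram.  */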
13604 /* Check OP for validity as a PARALLEL RTX vector with elements
13605 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13606 from the perspective of the architecture. See the diagram above
13607 aarch64_simd_vect_par_cnst_half for more details. */
13609 bool
13610 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13611 bool high)
13613 int nelts;
13614 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13615 return false;
13617 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13618 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13619 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13620 int i = 0;
13622 if (count_op != count_ideal)
13623 return false;
13625 for (i = 0; i < count_ideal; i++)
13627 rtx elt_op = XVECEXP (op, 0, i);
13628 rtx elt_ideal = XVECEXP (ideal, 0, i);
13630 if (!CONST_INT_P (elt_op)
13631 || INTVAL (elt_ideal) != INTVAL (elt_op))
13632 return false;
13634 return true;
13637 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13638 HIGH (exclusive). */
13639 void
13640 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13641 const_tree exp)
13643 HOST_WIDE_INT lane;
13644 gcc_assert (CONST_INT_P (operand));
13645 lane = INTVAL (operand);
13647 if (lane < low || lane >= high)
13649 if (exp)
13650 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13651 else
13652 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13656 /* Perform endian correction on lane number N, which indexes a vector
13657 of mode MODE, and return the result as an SImode rtx. */
13659 rtx
13660 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13662 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13665 /* Return TRUE if OP is a valid vector addressing mode. */
13667 bool
13668 aarch64_simd_mem_operand_p (rtx op)
13670 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13671 || REG_P (XEXP (op, 0)));
13674 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13676 bool
13677 aarch64_sve_ld1r_operand_p (rtx op)
13679 struct aarch64_address_info addr;
13680 scalar_mode mode;
13682 return (MEM_P (op)
13683 && is_a <scalar_mode> (GET_MODE (op), &mode)
13684 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13685 && addr.type == ADDRESS_REG_IMM
13686 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
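/* In other words, LD1R here accepts a base register plus an unsigned
   immediate of up to 63 element sizes; e.g. for SImode the valid constant
   offsets are 0, 4, 8, ..., 252.  */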
13689 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13690 The conditions for STR are the same. */
13691 bool
13692 aarch64_sve_ldr_operand_p (rtx op)
13694 struct aarch64_address_info addr;
13696 return (MEM_P (op)
13697 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13698 false, ADDR_QUERY_ANY)
13699 && addr.type == ADDRESS_REG_IMM);
13702 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13703 We need to be able to access the individual pieces, so the range
13704 is different from LD[234] and ST[234]. */
13705 bool
13706 aarch64_sve_struct_memory_operand_p (rtx op)
13708 if (!MEM_P (op))
13709 return false;
13711 machine_mode mode = GET_MODE (op);
13712 struct aarch64_address_info addr;
13713 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13714 ADDR_QUERY_ANY)
13715 || addr.type != ADDRESS_REG_IMM)
13716 return false;
13718 poly_int64 first = addr.const_offset;
13719 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13720 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13721 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
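/* A sketch of the resulting range: for a two-vector tuple the mode size is
   2 * BYTES_PER_SVE_VECTOR, so LAST == FIRST + BYTES_PER_SVE_VECTOR and
   both offsets must be whole vectors in [-8, 7] vectors, leaving FIRST in
   the range [-8, 6] vectors.  */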
13724 /* Emit a register copy from operand to operand, taking care not to
13725 early-clobber source registers in the process.
13727 COUNT is the number of components into which the copy needs to be
13728 decomposed. */
13729 void
13730 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13731 unsigned int count)
13733 unsigned int i;
13734 int rdest = REGNO (operands[0]);
13735 int rsrc = REGNO (operands[1]);
13737 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13738 || rdest < rsrc)
13739 for (i = 0; i < count; i++)
13740 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13741 gen_rtx_REG (mode, rsrc + i));
13742 else
13743 for (i = 0; i < count; i++)
13744 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13745 gen_rtx_REG (mode, rsrc + count - i - 1));
13748 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13749 one of VSTRUCT modes: OI, CI, or XI. */
13750 int
13751 aarch64_simd_attr_length_rglist (machine_mode mode)
13753 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13754 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
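/* For example, an OImode register list covers two vector registers and so
   needs two instructions (8 bytes); XImode covers four and needs 16.  */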
13757 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13758 alignment of a vector to 128 bits. SVE predicates have an alignment of
13759 16 bits. */
13760 static HOST_WIDE_INT
13761 aarch64_simd_vector_alignment (const_tree type)
13763 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13764 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13765 be set for non-predicate vectors of booleans. Modes are the most
13766 direct way we have of identifying real SVE predicate types. */
13767 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13768 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13769 return MIN (align, 128);
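/* For instance, a 64-bit V2SI type gets 64-bit alignment, a 256-bit GNU
   vector type is capped at 128 bits, and a variable-length SVE predicate
   type gets 16-bit alignment via the MODE_VECTOR_BOOL test above.  */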
13772 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13773 static HOST_WIDE_INT
13774 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13776 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13778 /* If the length of the vector is fixed, try to align to that length,
13779 otherwise don't try to align at all. */
13780 HOST_WIDE_INT result;
13781 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13782 result = TYPE_ALIGN (TREE_TYPE (type));
13783 return result;
13785 return TYPE_ALIGN (type);
13788 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13789 static bool
13790 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13792 if (is_packed)
13793 return false;
13795 /* For fixed-length vectors, check that the vectorizer will aim for
13796 full-vector alignment. This isn't true for generic GCC vectors
13797 that are wider than the ABI maximum of 128 bits. */
13798 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13799 && (wi::to_widest (TYPE_SIZE (type))
13800 != aarch64_vectorize_preferred_vector_alignment (type)))
13801 return false;
13803 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13804 return true;
13807 /* Return true if the vector misalignment factor is supported by the
13808 target. */
13809 static bool
13810 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13811 const_tree type, int misalignment,
13812 bool is_packed)
13814 if (TARGET_SIMD && STRICT_ALIGNMENT)
13816 /* Return if movmisalign pattern is not supported for this mode. */
13817 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13818 return false;
13820 /* Misalignment factor is unknown at compile time. */
13821 if (misalignment == -1)
13822 return false;
13824 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13825 is_packed);
13828 /* If VALS is a vector constant that can be loaded into a register
13829 using DUP, generate instructions to do so and return an RTX to
13830 assign to the register. Otherwise return NULL_RTX. */
13831 static rtx
13832 aarch64_simd_dup_constant (rtx vals)
13834 machine_mode mode = GET_MODE (vals);
13835 machine_mode inner_mode = GET_MODE_INNER (mode);
13836 rtx x;
13838 if (!const_vec_duplicate_p (vals, &x))
13839 return NULL_RTX;
13841 /* We can load this constant by using DUP and a constant in a
13842 single ARM register. This will be cheaper than a vector
13843 load. */
13844 x = copy_to_mode_reg (inner_mode, x);
13845 return gen_vec_duplicate (mode, x);
13849 /* Generate code to load VALS, which is a PARALLEL containing only
13850 constants (for vec_init) or CONST_VECTOR, efficiently into a
13851 register. Returns an RTX to copy into the register, or NULL_RTX
13852 for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
13853 static rtx
13854 aarch64_simd_make_constant (rtx vals)
13856 machine_mode mode = GET_MODE (vals);
13857 rtx const_dup;
13858 rtx const_vec = NULL_RTX;
13859 int n_const = 0;
13860 int i;
13862 if (GET_CODE (vals) == CONST_VECTOR)
13863 const_vec = vals;
13864 else if (GET_CODE (vals) == PARALLEL)
13866 /* A CONST_VECTOR must contain only CONST_INTs and
13867 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13868 Only store valid constants in a CONST_VECTOR. */
13869 int n_elts = XVECLEN (vals, 0);
13870 for (i = 0; i < n_elts; ++i)
13872 rtx x = XVECEXP (vals, 0, i);
13873 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13874 n_const++;
13876 if (n_const == n_elts)
13877 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13879 else
13880 gcc_unreachable ();
13882 if (const_vec != NULL_RTX
13883 && aarch64_simd_valid_immediate (const_vec, NULL))
13884 /* Load using MOVI/MVNI. */
13885 return const_vec;
13886 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13887 /* Loaded using DUP. */
13888 return const_dup;
13889 else if (const_vec != NULL_RTX)
13890 /* Load from constant pool.  We cannot take advantage of single-cycle
13891 LD1 because we need a PC-relative addressing mode. */
13892 return const_vec;
13893 else
13894 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13895 We cannot construct an initializer.  */
13896 return NULL_RTX;
13899 /* Expand a vector initialisation sequence, such that TARGET is
13900 initialised to contain VALS. */
13902 void
13903 aarch64_expand_vector_init (rtx target, rtx vals)
13905 machine_mode mode = GET_MODE (target);
13906 scalar_mode inner_mode = GET_MODE_INNER (mode);
13907 /* The number of vector elements. */
13908 int n_elts = XVECLEN (vals, 0);
13909 /* The number of vector elements which are not constant. */
13910 int n_var = 0;
13911 rtx any_const = NULL_RTX;
13912 /* The first element of vals. */
13913 rtx v0 = XVECEXP (vals, 0, 0);
13914 bool all_same = true;
13916 /* Count the number of variable elements to initialise. */
13917 for (int i = 0; i < n_elts; ++i)
13919 rtx x = XVECEXP (vals, 0, i);
13920 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13921 ++n_var;
13922 else
13923 any_const = x;
13925 all_same &= rtx_equal_p (x, v0);
13928 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13929 how best to handle this. */
13930 if (n_var == 0)
13932 rtx constant = aarch64_simd_make_constant (vals);
13933 if (constant != NULL_RTX)
13935 emit_move_insn (target, constant);
13936 return;
13940 /* Splat a single non-constant element if we can. */
13941 if (all_same)
13943 rtx x = copy_to_mode_reg (inner_mode, v0);
13944 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13945 return;
13948 enum insn_code icode = optab_handler (vec_set_optab, mode);
13949 gcc_assert (icode != CODE_FOR_nothing);
13951 /* If there are only variable elements, try to optimize
13952 the insertion using dup for the most common element
13953 followed by insertions. */
13955 /* The algorithm will fill matches[*][0] with the earliest matching element,
13956 and matches[X][1] with the count of duplicate elements (if X is the
13957 earliest element which has duplicates). */
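/* For example, with VALS == { a, b, a, a } the loop below records
   matches[2][0] == matches[3][0] == 0 and matches[0][1] == 3, so element 0
   is broadcast with DUP and only the lane holding B needs a separate
   insert.  */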
13959 if (n_var == n_elts && n_elts <= 16)
13961 int matches[16][2] = {0};
13962 for (int i = 0; i < n_elts; i++)
13964 for (int j = 0; j <= i; j++)
13966 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13968 matches[i][0] = j;
13969 matches[j][1]++;
13970 break;
13974 int maxelement = 0;
13975 int maxv = 0;
13976 for (int i = 0; i < n_elts; i++)
13977 if (matches[i][1] > maxv)
13979 maxelement = i;
13980 maxv = matches[i][1];
13983 /* Create a duplicate of the most common element, unless all elements
13984 are equally useless to us, in which case just immediately set the
13985 vector register using the first element. */
13987 if (maxv == 1)
13989 /* For vectors of two 64-bit elements, we can do even better. */
13990 if (n_elts == 2
13991 && (inner_mode == E_DImode
13992 || inner_mode == E_DFmode))
13995 rtx x0 = XVECEXP (vals, 0, 0);
13996 rtx x1 = XVECEXP (vals, 0, 1);
13997 /* Combine can pick up this case, but handling it directly
13998 here leaves clearer RTL.
14000 This is load_pair_lanes<mode>, and also gives us a clean-up
14001 for store_pair_lanes<mode>. */
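/* For example, initialising a V2DF vector from *P and *(P + 1) can use a
   single load-pair into the vector register rather than a load plus a
   lane insert.  */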
14002 if (memory_operand (x0, inner_mode)
14003 && memory_operand (x1, inner_mode)
14004 && !STRICT_ALIGNMENT
14005 && rtx_equal_p (XEXP (x1, 0),
14006 plus_constant (Pmode,
14007 XEXP (x0, 0),
14008 GET_MODE_SIZE (inner_mode))))
14010 rtx t;
14011 if (inner_mode == DFmode)
14012 t = gen_load_pair_lanesdf (target, x0, x1);
14013 else
14014 t = gen_load_pair_lanesdi (target, x0, x1);
14015 emit_insn (t);
14016 return;
14019 /* The subreg-move sequence below will move into lane zero of the
14020 vector register. For big-endian we want that position to hold
14021 the last element of VALS. */
14022 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14023 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14024 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14026 else
14028 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14029 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14032 /* Insert the rest. */
14033 for (int i = 0; i < n_elts; i++)
14035 rtx x = XVECEXP (vals, 0, i);
14036 if (matches[i][0] == maxelement)
14037 continue;
14038 x = copy_to_mode_reg (inner_mode, x);
14039 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14041 return;
14044 /* Initialise a vector which is part-variable. We want to first try
14045 to build those lanes which are constant in the most efficient way we
14046 can. */
14047 if (n_var != n_elts)
14049 rtx copy = copy_rtx (vals);
14051 /* Load constant part of vector. We really don't care what goes into the
14052 parts we will overwrite, but we're more likely to be able to load the
14053 constant efficiently if it has fewer, larger, repeating parts
14054 (see aarch64_simd_valid_immediate). */
14055 for (int i = 0; i < n_elts; i++)
14057 rtx x = XVECEXP (vals, 0, i);
14058 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14059 continue;
14060 rtx subst = any_const;
14061 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14063 /* Look in the copied vector, as more elements are const. */
14064 rtx test = XVECEXP (copy, 0, i ^ bit);
14065 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14067 subst = test;
14068 break;
14071 XVECEXP (copy, 0, i) = subst;
14073 aarch64_expand_vector_init (target, copy);
14076 /* Insert the variable lanes directly. */
14077 for (int i = 0; i < n_elts; i++)
14079 rtx x = XVECEXP (vals, 0, i);
14080 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14081 continue;
14082 x = copy_to_mode_reg (inner_mode, x);
14083 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14087 static unsigned HOST_WIDE_INT
14088 aarch64_shift_truncation_mask (machine_mode mode)
14090 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14091 return 0;
14092 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14095 /* Select a format to encode pointers in exception handling data. */
14096 int
14097 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14099 int type;
14100 switch (aarch64_cmodel)
14102 case AARCH64_CMODEL_TINY:
14103 case AARCH64_CMODEL_TINY_PIC:
14104 case AARCH64_CMODEL_SMALL:
14105 case AARCH64_CMODEL_SMALL_PIC:
14106 case AARCH64_CMODEL_SMALL_SPIC:
14107 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14108 for everything. */
14109 type = DW_EH_PE_sdata4;
14110 break;
14111 default:
14112 /* No assumptions here. 8-byte relocs required. */
14113 type = DW_EH_PE_sdata8;
14114 break;
14116 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14119 /* The last .arch and .tune assembly strings that we printed. */
14120 static std::string aarch64_last_printed_arch_string;
14121 static std::string aarch64_last_printed_tune_string;
14123 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14124 by the function fndecl. */
14126 void
14127 aarch64_declare_function_name (FILE *stream, const char* name,
14128 tree fndecl)
14130 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14132 struct cl_target_option *targ_options;
14133 if (target_parts)
14134 targ_options = TREE_TARGET_OPTION (target_parts);
14135 else
14136 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14137 gcc_assert (targ_options);
14139 const struct processor *this_arch
14140 = aarch64_get_arch (targ_options->x_explicit_arch);
14142 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14143 std::string extension
14144 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14145 this_arch->flags);
14146 /* Only update the assembler .arch string if it is distinct from the last
14147 such string we printed. */
14148 std::string to_print = this_arch->name + extension;
14149 if (to_print != aarch64_last_printed_arch_string)
14151 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14152 aarch64_last_printed_arch_string = to_print;
14155 /* Print the cpu name we're tuning for in the comments; it might be
14156 useful to readers of the generated asm. Do it only when it changes
14157 from function to function and verbose assembly is requested. */
14158 const struct processor *this_tune
14159 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14161 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14163 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14164 this_tune->name);
14165 aarch64_last_printed_tune_string = this_tune->name;
14168 /* Don't forget the type directive for ELF. */
14169 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14170 ASM_OUTPUT_LABEL (stream, name);
14173 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14175 static void
14176 aarch64_start_file (void)
14178 struct cl_target_option *default_options
14179 = TREE_TARGET_OPTION (target_option_default_node);
14181 const struct processor *default_arch
14182 = aarch64_get_arch (default_options->x_explicit_arch);
14183 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14184 std::string extension
14185 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14186 default_arch->flags);
14188 aarch64_last_printed_arch_string = default_arch->name + extension;
14189 aarch64_last_printed_tune_string = "";
14190 asm_fprintf (asm_out_file, "\t.arch %s\n",
14191 aarch64_last_printed_arch_string.c_str ());
14193 default_file_start ();
14196 /* Emit load exclusive. */
14198 static void
14199 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14200 rtx mem, rtx model_rtx)
14202 rtx (*gen) (rtx, rtx, rtx);
14204 switch (mode)
14206 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14207 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14208 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14209 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14210 default:
14211 gcc_unreachable ();
14214 emit_insn (gen (rval, mem, model_rtx));
14217 /* Emit store exclusive. */
14219 static void
14220 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14221 rtx rval, rtx mem, rtx model_rtx)
14223 rtx (*gen) (rtx, rtx, rtx, rtx);
14225 switch (mode)
14227 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14228 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14229 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14230 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14231 default:
14232 gcc_unreachable ();
14235 emit_insn (gen (bval, rval, mem, model_rtx));
14238 /* Emit jump instruction INSN and mark it as unlikely to be taken.  */
14240 static void
14241 aarch64_emit_unlikely_jump (rtx insn)
14243 rtx_insn *jump = emit_jump_insn (insn);
14244 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14247 /* Expand a compare and swap pattern. */
14249 void
14250 aarch64_expand_compare_and_swap (rtx operands[])
14252 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14253 machine_mode mode, cmp_mode;
14254 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14255 int idx;
14256 gen_cas_fn gen;
14257 const gen_cas_fn split_cas[] =
14259 gen_aarch64_compare_and_swapqi,
14260 gen_aarch64_compare_and_swaphi,
14261 gen_aarch64_compare_and_swapsi,
14262 gen_aarch64_compare_and_swapdi
14264 const gen_cas_fn atomic_cas[] =
14266 gen_aarch64_compare_and_swapqi_lse,
14267 gen_aarch64_compare_and_swaphi_lse,
14268 gen_aarch64_compare_and_swapsi_lse,
14269 gen_aarch64_compare_and_swapdi_lse
14272 bval = operands[0];
14273 rval = operands[1];
14274 mem = operands[2];
14275 oldval = operands[3];
14276 newval = operands[4];
14277 is_weak = operands[5];
14278 mod_s = operands[6];
14279 mod_f = operands[7];
14280 mode = GET_MODE (mem);
14281 cmp_mode = mode;
14283 /* Normally the succ memory model must be stronger than fail, but in the
14284 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14285 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14287 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14288 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14289 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
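  /* For instance, a call such as
       __atomic_compare_exchange_n (ptr, &expected, desired, 0,
				    __ATOMIC_RELEASE, __ATOMIC_ACQUIRE);
     is expanded as though the success ordering were __ATOMIC_ACQ_REL.  */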
14291 switch (mode)
14293 case E_QImode:
14294 case E_HImode:
14295 /* For short modes, we're going to perform the comparison in SImode,
14296 so do the zero-extension now. */
14297 cmp_mode = SImode;
14298 rval = gen_reg_rtx (SImode);
14299 oldval = convert_modes (SImode, mode, oldval, true);
14300 /* Fall through. */
14302 case E_SImode:
14303 case E_DImode:
14304 /* Force the value into a register if needed. */
14305 if (!aarch64_plus_operand (oldval, mode))
14306 oldval = force_reg (cmp_mode, oldval);
14307 break;
14309 default:
14310 gcc_unreachable ();
14313 switch (mode)
14315 case E_QImode: idx = 0; break;
14316 case E_HImode: idx = 1; break;
14317 case E_SImode: idx = 2; break;
14318 case E_DImode: idx = 3; break;
14319 default:
14320 gcc_unreachable ();
14322 if (TARGET_LSE)
14323 gen = atomic_cas[idx];
14324 else
14325 gen = split_cas[idx];
14327 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14329 if (mode == QImode || mode == HImode)
14330 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14332 x = gen_rtx_REG (CCmode, CC_REGNUM);
14333 x = gen_rtx_EQ (SImode, x, const0_rtx);
14334 emit_insn (gen_rtx_SET (bval, x));
14337 /* Test whether the target supports using an atomic load-operate instruction
14338 for operation CODE.  Returns FALSE if the operation isn't supported by the
14339 architecture.  */
14343 bool
14344 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14346 if (!TARGET_LSE)
14347 return false;
14349 switch (code)
14351 case SET:
14352 case AND:
14353 case IOR:
14354 case XOR:
14355 case MINUS:
14356 case PLUS:
14357 return true;
14358 default:
14359 return false;
14363 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14364 sequence implementing an atomic operation. */
14366 static void
14367 aarch64_emit_post_barrier (enum memmodel model)
14369 const enum memmodel base_model = memmodel_base (model);
14371 if (is_mm_sync (model)
14372 && (base_model == MEMMODEL_ACQUIRE
14373 || base_model == MEMMODEL_ACQ_REL
14374 || base_model == MEMMODEL_SEQ_CST))
14376 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
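/* For example, a __sync_val_compare_and_swap uses MEMMODEL_SYNC_SEQ_CST and
   so ends with a full memory fence emitted here; a relaxed __atomic
   operation does not.  */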
14380 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14381 for the data in memory. EXPECTED is the value expected to be in memory.
14382 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14383 is the memory ordering to use. */
14385 void
14386 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14387 rtx expected, rtx desired,
14388 rtx model)
14390 rtx (*gen) (rtx, rtx, rtx, rtx);
14391 machine_mode mode;
14393 mode = GET_MODE (mem);
14395 switch (mode)
14397 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14398 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14399 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14400 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14401 default:
14402 gcc_unreachable ();
14405 /* Move the expected value into the CAS destination register. */
14406 emit_insn (gen_rtx_SET (rval, expected));
14408 /* Emit the CAS. */
14409 emit_insn (gen (rval, mem, desired, model));
14411 /* Compare the expected value with the value loaded by the CAS, to establish
14412 whether the swap was made. */
14413 aarch64_gen_compare_reg (EQ, rval, expected);
14416 /* Split a compare and swap pattern. */
14418 void
14419 aarch64_split_compare_and_swap (rtx operands[])
14421 rtx rval, mem, oldval, newval, scratch;
14422 machine_mode mode;
14423 bool is_weak;
14424 rtx_code_label *label1, *label2;
14425 rtx x, cond;
14426 enum memmodel model;
14427 rtx model_rtx;
14429 rval = operands[0];
14430 mem = operands[1];
14431 oldval = operands[2];
14432 newval = operands[3];
14433 is_weak = (operands[4] != const0_rtx);
14434 model_rtx = operands[5];
14435 scratch = operands[7];
14436 mode = GET_MODE (mem);
14437 model = memmodel_from_int (INTVAL (model_rtx));
14439 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14440 loop:
14441 .label1:
14442 LD[A]XR rval, [mem]
14443 CBNZ rval, .label2
14444 ST[L]XR scratch, newval, [mem]
14445 CBNZ scratch, .label1
14446 .label2:
14447 CMP rval, 0. */
14448 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14450 label1 = NULL;
14451 if (!is_weak)
14453 label1 = gen_label_rtx ();
14454 emit_label (label1);
14456 label2 = gen_label_rtx ();
14458 /* The initial load can be relaxed for a __sync operation since a final
14459 barrier will be emitted to stop code hoisting. */
14460 if (is_mm_sync (model))
14461 aarch64_emit_load_exclusive (mode, rval, mem,
14462 GEN_INT (MEMMODEL_RELAXED));
14463 else
14464 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14466 if (strong_zero_p)
14468 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14469 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14470 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14471 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14473 else
14475 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14476 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14477 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14478 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14479 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14482 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14484 if (!is_weak)
14486 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14487 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14488 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14489 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14491 else
14493 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14494 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14495 emit_insn (gen_rtx_SET (cond, x));
14498 emit_label (label2);
14499 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14500 to set the condition flags. If this is not used it will be removed by
14501 later passes. */
14502 if (strong_zero_p)
14504 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14505 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14506 emit_insn (gen_rtx_SET (cond, x));
14508 /* Emit any final barrier needed for a __sync operation. */
14509 if (is_mm_sync (model))
14510 aarch64_emit_post_barrier (model);
14513 /* Emit a BIC instruction. */
14515 static void
14516 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14518 rtx shift_rtx = GEN_INT (shift);
14519 rtx (*gen) (rtx, rtx, rtx, rtx);
14521 switch (mode)
14523 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14524 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14525 default:
14526 gcc_unreachable ();
14529 emit_insn (gen (dst, s2, shift_rtx, s1));
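/* For example, aarch64_emit_bic (SImode, d, a, b, 0) emits d = a & ~b,
   i.e. a BIC with the second source operand unshifted.  */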
14532 /* Emit an atomic swap. */
14534 static void
14535 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14536 rtx mem, rtx model)
14538 rtx (*gen) (rtx, rtx, rtx, rtx);
14540 switch (mode)
14542 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14543 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14544 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14545 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14546 default:
14547 gcc_unreachable ();
14550 emit_insn (gen (dst, mem, value, model));
14553 /* Operations supported by aarch64_emit_atomic_load_op. */
14555 enum aarch64_atomic_load_op_code
14557 AARCH64_LDOP_PLUS, /* A + B */
14558 AARCH64_LDOP_XOR, /* A ^ B */
14559 AARCH64_LDOP_OR, /* A | B */
14560 AARCH64_LDOP_BIC /* A & ~B */
14563 /* Emit an atomic load-operate. */
14565 static void
14566 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14567 machine_mode mode, rtx dst, rtx src,
14568 rtx mem, rtx model)
14570 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14571 const aarch64_atomic_load_op_fn plus[] =
14573 gen_aarch64_atomic_loadaddqi,
14574 gen_aarch64_atomic_loadaddhi,
14575 gen_aarch64_atomic_loadaddsi,
14576 gen_aarch64_atomic_loadadddi
14578 const aarch64_atomic_load_op_fn eor[] =
14580 gen_aarch64_atomic_loadeorqi,
14581 gen_aarch64_atomic_loadeorhi,
14582 gen_aarch64_atomic_loadeorsi,
14583 gen_aarch64_atomic_loadeordi
14585 const aarch64_atomic_load_op_fn ior[] =
14587 gen_aarch64_atomic_loadsetqi,
14588 gen_aarch64_atomic_loadsethi,
14589 gen_aarch64_atomic_loadsetsi,
14590 gen_aarch64_atomic_loadsetdi
14592 const aarch64_atomic_load_op_fn bic[] =
14594 gen_aarch64_atomic_loadclrqi,
14595 gen_aarch64_atomic_loadclrhi,
14596 gen_aarch64_atomic_loadclrsi,
14597 gen_aarch64_atomic_loadclrdi
14599 aarch64_atomic_load_op_fn gen;
14600 int idx = 0;
14602 switch (mode)
14604 case E_QImode: idx = 0; break;
14605 case E_HImode: idx = 1; break;
14606 case E_SImode: idx = 2; break;
14607 case E_DImode: idx = 3; break;
14608 default:
14609 gcc_unreachable ();
14612 switch (code)
14614 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14615 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14616 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14617 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14618 default:
14619 gcc_unreachable ();
14622 emit_insn (gen (dst, mem, src, model));
14625 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14626 location to store the data read from memory. OUT_RESULT is the location to
14627 store the result of the operation. MEM is the memory location to read and
14628 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14629 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14630 be NULL. */
14632 void
14633 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14634 rtx mem, rtx value, rtx model_rtx)
14636 machine_mode mode = GET_MODE (mem);
14637 machine_mode wmode = (mode == DImode ? DImode : SImode);
14638 const bool short_mode = (mode < SImode);
14639 aarch64_atomic_load_op_code ldop_code;
14640 rtx src;
14641 rtx x;
14643 if (out_data)
14644 out_data = gen_lowpart (mode, out_data);
14646 if (out_result)
14647 out_result = gen_lowpart (mode, out_result);
14649 /* Make sure the value is in a register, putting it into a destination
14650 register if it needs to be manipulated. */
14651 if (!register_operand (value, mode)
14652 || code == AND || code == MINUS)
14654 src = out_result ? out_result : out_data;
14655 emit_move_insn (src, gen_lowpart (mode, value));
14657 else
14658 src = value;
14659 gcc_assert (register_operand (src, mode));
14661 /* Preprocess the data for the operation as necessary. If the operation is
14662 a SET then emit a swap instruction and finish. */
14663 switch (code)
14665 case SET:
14666 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14667 return;
14669 case MINUS:
14670 /* Negate the value and treat it as a PLUS. */
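      /* There is no LDSUB instruction, so e.g. __atomic_fetch_sub is
	 handled here by negating the value and issuing an LDADD.  */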
14672 rtx neg_src;
14674 /* Resize the value if necessary. */
14675 if (short_mode)
14676 src = gen_lowpart (wmode, src);
14678 neg_src = gen_rtx_NEG (wmode, src);
14679 emit_insn (gen_rtx_SET (src, neg_src));
14681 if (short_mode)
14682 src = gen_lowpart (mode, src);
14684 /* Fall-through. */
14685 case PLUS:
14686 ldop_code = AARCH64_LDOP_PLUS;
14687 break;
14689 case IOR:
14690 ldop_code = AARCH64_LDOP_OR;
14691 break;
14693 case XOR:
14694 ldop_code = AARCH64_LDOP_XOR;
14695 break;
14697 case AND:
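      /* Similarly there is no LDAND; __atomic_fetch_and is handled by
	 inverting the value and using LDCLR, which has BIC (AND NOT)
	 semantics.  */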
14699 rtx not_src;
14701 /* Resize the value if necessary. */
14702 if (short_mode)
14703 src = gen_lowpart (wmode, src);
14705 not_src = gen_rtx_NOT (wmode, src);
14706 emit_insn (gen_rtx_SET (src, not_src));
14708 if (short_mode)
14709 src = gen_lowpart (mode, src);
14711 ldop_code = AARCH64_LDOP_BIC;
14712 break;
14714 default:
14715 /* The operation can't be done with atomic instructions. */
14716 gcc_unreachable ();
14719 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14721 /* If necessary, calculate the data in memory after the update by redoing the
14722 operation from values in registers. */
14723 if (!out_result)
14724 return;
14726 if (short_mode)
14728 src = gen_lowpart (wmode, src);
14729 out_data = gen_lowpart (wmode, out_data);
14730 out_result = gen_lowpart (wmode, out_result);
14733 x = NULL_RTX;
14735 switch (code)
14737 case MINUS:
14738 case PLUS:
14739 x = gen_rtx_PLUS (wmode, out_data, src);
14740 break;
14741 case IOR:
14742 x = gen_rtx_IOR (wmode, out_data, src);
14743 break;
14744 case XOR:
14745 x = gen_rtx_XOR (wmode, out_data, src);
14746 break;
14747 case AND:
14748 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14749 return;
14750 default:
14751 gcc_unreachable ();
14754 emit_set_insn (out_result, x);
14756 return;
14759 /* Split an atomic operation. */
14761 void
14762 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14763 rtx value, rtx model_rtx, rtx cond)
14765 machine_mode mode = GET_MODE (mem);
14766 machine_mode wmode = (mode == DImode ? DImode : SImode);
14767 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14768 const bool is_sync = is_mm_sync (model);
14769 rtx_code_label *label;
14770 rtx x;
14772 /* Split the atomic operation into a sequence. */
14773 label = gen_label_rtx ();
14774 emit_label (label);
14776 if (new_out)
14777 new_out = gen_lowpart (wmode, new_out);
14778 if (old_out)
14779 old_out = gen_lowpart (wmode, old_out);
14780 else
14781 old_out = new_out;
14782 value = simplify_gen_subreg (wmode, value, mode, 0);
14784 /* The initial load can be relaxed for a __sync operation since a final
14785 barrier will be emitted to stop code hoisting. */
14786 if (is_sync)
14787 aarch64_emit_load_exclusive (mode, old_out, mem,
14788 GEN_INT (MEMMODEL_RELAXED));
14789 else
14790 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14792 switch (code)
14794 case SET:
14795 new_out = value;
14796 break;
14798 case NOT:
14799 x = gen_rtx_AND (wmode, old_out, value);
14800 emit_insn (gen_rtx_SET (new_out, x));
14801 x = gen_rtx_NOT (wmode, new_out);
14802 emit_insn (gen_rtx_SET (new_out, x));
14803 break;
14805 case MINUS:
14806 if (CONST_INT_P (value))
14808 value = GEN_INT (-INTVAL (value));
14809 code = PLUS;
14811 /* Fall through. */
14813 default:
14814 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14815 emit_insn (gen_rtx_SET (new_out, x));
14816 break;
14819 aarch64_emit_store_exclusive (mode, cond, mem,
14820 gen_lowpart (mode, new_out), model_rtx);
14822 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14823 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14824 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14825 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14827 /* Emit any final barrier needed for a __sync operation. */
14828 if (is_sync)
14829 aarch64_emit_post_barrier (model);
14832 static void
14833 aarch64_init_libfuncs (void)
14835 /* Half-precision float operations. The compiler handles all operations
14836 with NULL libfuncs by converting to SFmode. */
14838 /* Conversions. */
14839 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14840 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14842 /* Arithmetic. */
14843 set_optab_libfunc (add_optab, HFmode, NULL);
14844 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14845 set_optab_libfunc (smul_optab, HFmode, NULL);
14846 set_optab_libfunc (neg_optab, HFmode, NULL);
14847 set_optab_libfunc (sub_optab, HFmode, NULL);
14849 /* Comparisons. */
14850 set_optab_libfunc (eq_optab, HFmode, NULL);
14851 set_optab_libfunc (ne_optab, HFmode, NULL);
14852 set_optab_libfunc (lt_optab, HFmode, NULL);
14853 set_optab_libfunc (le_optab, HFmode, NULL);
14854 set_optab_libfunc (ge_optab, HFmode, NULL);
14855 set_optab_libfunc (gt_optab, HFmode, NULL);
14856 set_optab_libfunc (unord_optab, HFmode, NULL);
14859 /* Target hook for c_mode_for_suffix. */
14860 static machine_mode
14861 aarch64_c_mode_for_suffix (char suffix)
14863 if (suffix == 'q')
14864 return TFmode;
14866 return VOIDmode;
14869 /* We can only represent floating point constants which will fit in
14870 "quarter-precision" values.  These values are characterised by
14871 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14874 (-1)^s * (n/16) * 2^r
14876 Where:
14877 's' is the sign bit.
14878 'n' is an integer in the range 16 <= n <= 31.
14879 'r' is an integer in the range -3 <= r <= 4. */
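/* Worked examples of this encoding (illustrative only): 1.0 is n == 16,
   r == 0; the smallest positive value is 0.125 (n == 16, r == -3); and
   the largest is 31.0 (n == 31, r == 4).  */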
14881 /* Return true iff X can be represented by a quarter-precision
14882 floating point immediate operand.  Note, we cannot represent 0.0.  */
14883 bool
14884 aarch64_float_const_representable_p (rtx x)
14886 /* This represents our current view of how many bits
14887 make up the mantissa. */
14888 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14889 int exponent;
14890 unsigned HOST_WIDE_INT mantissa, mask;
14891 REAL_VALUE_TYPE r, m;
14892 bool fail;
14894 if (!CONST_DOUBLE_P (x))
14895 return false;
14897 /* We don't support HFmode constants yet. */
14898 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14899 return false;
14901 r = *CONST_DOUBLE_REAL_VALUE (x);
14903 /* We cannot represent infinities, NaNs or +/-zero. We won't
14904 know if we have +zero until we analyse the mantissa, but we
14905 can reject the other invalid values. */
14906 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14907 || REAL_VALUE_MINUS_ZERO (r))
14908 return false;
14910 /* Extract exponent. */
14911 r = real_value_abs (&r);
14912 exponent = REAL_EXP (&r);
14914 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14915 highest (sign) bit, with a fixed binary point at bit point_pos.
14916 The low element of W below holds the low part of the mantissa, the high element the high part.
14917 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14918 bits for the mantissa, this can fail (low bits will be lost). */
14919 real_ldexp (&m, &r, point_pos - exponent);
14920 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14922 /* If the low part of the mantissa has bits set we cannot represent
14923 the value. */
14924 if (w.ulow () != 0)
14925 return false;
14926 /* We have rejected the lower HOST_WIDE_INT, so update our
14927 understanding of how many bits lie in the mantissa and
14928 look only at the high HOST_WIDE_INT. */
14929 mantissa = w.elt (1);
14930 point_pos -= HOST_BITS_PER_WIDE_INT;
14932 /* We can only represent values with a mantissa of the form 1.xxxx. */
14933 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14934 if ((mantissa & mask) != 0)
14935 return false;
14937 /* Having filtered unrepresentable values, we may now remove all
14938 but the highest 5 bits. */
14939 mantissa >>= point_pos - 5;
14941 /* We cannot represent the value 0.0, so reject it. This is handled
14942 elsewhere. */
14943 if (mantissa == 0)
14944 return false;
14946 /* Then, as bit 4 is always set, we can mask it off, leaving
14947 the mantissa in the range [0, 15]. */
14948 mantissa &= ~(1 << 4);
14949 gcc_assert (mantissa <= 15);
14951 /* GCC internally does not use IEEE754-like encoding (where normalized
14952 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14953 Our mantissa values are shifted 4 places to the left relative to
14954 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14955 by 5 places to correct for GCC's representation. */
14956 exponent = 5 - exponent;
14958 return (exponent >= 0 && exponent <= 7);
14961 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14962 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14963 output MOVI/MVNI, ORR or BIC immediate. */
14964 char*
14965 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14966 enum simd_immediate_check which)
14968 bool is_valid;
14969 static char templ[40];
14970 const char *mnemonic;
14971 const char *shift_op;
14972 unsigned int lane_count = 0;
14973 char element_char;
14975 struct simd_immediate_info info;
14977 /* This will return true to show const_vector is legal for use as either
14978 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14979 It will also update INFO to show how the immediate should be generated.
14980 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14981 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14982 gcc_assert (is_valid);
14984 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14985 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14987 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14989 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14990 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14991 move immediate path. */
14992 if (aarch64_float_const_zero_rtx_p (info.value))
14993 info.value = GEN_INT (0);
14994 else
14996 const unsigned int buf_size = 20;
14997 char float_buf[buf_size] = {'\0'};
14998 real_to_decimal_for_mode (float_buf,
14999 CONST_DOUBLE_REAL_VALUE (info.value),
15000 buf_size, buf_size, 1, info.elt_mode);
15002 if (lane_count == 1)
15003 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15004 else
15005 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15006 lane_count, element_char, float_buf);
15007 return templ;
15011 gcc_assert (CONST_INT_P (info.value));
15013 if (which == AARCH64_CHECK_MOV)
15015 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15016 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15017 if (lane_count == 1)
15018 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15019 mnemonic, UINTVAL (info.value));
15020 else if (info.shift)
15021 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15022 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15023 element_char, UINTVAL (info.value), shift_op, info.shift);
15024 else
15025 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15026 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15027 element_char, UINTVAL (info.value));
15029 else
15031 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15032 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15033 if (info.shift)
15034 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15035 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15036 element_char, UINTVAL (info.value), "lsl", info.shift);
15037 else
15038 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15039 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15040 element_char, UINTVAL (info.value));
15042 return templ;
15045 char*
15046 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15049 /* If a floating point number was passed and we desire to use it in an
15050 integer mode, do the conversion to integer.  */
15051 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15053 unsigned HOST_WIDE_INT ival;
15054 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15055 gcc_unreachable ();
15056 immediate = gen_int_mode (ival, mode);
15059 machine_mode vmode;
15060 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
15061 a 128-bit vector mode.  */
15062 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15064 vmode = aarch64_simd_container_mode (mode, width);
15065 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15066 return aarch64_output_simd_mov_immediate (v_op, width);
15069 /* Return the output string to use for moving immediate CONST_VECTOR
15070 into an SVE register. */
15072 char *
15073 aarch64_output_sve_mov_immediate (rtx const_vector)
15075 static char templ[40];
15076 struct simd_immediate_info info;
15077 char element_char;
15079 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15080 gcc_assert (is_valid);
15082 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15084 if (info.step)
15086 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15087 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15088 element_char, INTVAL (info.value), INTVAL (info.step));
15089 return templ;
15092 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15094 if (aarch64_float_const_zero_rtx_p (info.value))
15095 info.value = GEN_INT (0);
15096 else
15098 const int buf_size = 20;
15099 char float_buf[buf_size] = {};
15100 real_to_decimal_for_mode (float_buf,
15101 CONST_DOUBLE_REAL_VALUE (info.value),
15102 buf_size, buf_size, 1, info.elt_mode);
15104 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15105 element_char, float_buf);
15106 return templ;
15110 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15111 element_char, INTVAL (info.value));
15112 return templ;
15115 /* Return the asm format for a PTRUE instruction whose destination has
15116 mode MODE. SUFFIX is the element size suffix. */
15118 char *
15119 aarch64_output_ptrue (machine_mode mode, char suffix)
15121 unsigned int nunits;
15122 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15123 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15124 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15125 else
15126 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15127 return buf;
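/* For example, with -msve-vector-bits=256 and a .s suffix this produces
   "ptrue\t%0.s, vl8", while for a variable-length vector it falls back to
   "ptrue\t%0.s, all".  */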
15130 /* Split operands into moves from op[1] + op[2] into op[0]. */
15132 void
15133 aarch64_split_combinev16qi (rtx operands[3])
15135 unsigned int dest = REGNO (operands[0]);
15136 unsigned int src1 = REGNO (operands[1]);
15137 unsigned int src2 = REGNO (operands[2]);
15138 machine_mode halfmode = GET_MODE (operands[1]);
15139 unsigned int halfregs = REG_NREGS (operands[1]);
15140 rtx destlo, desthi;
15142 gcc_assert (halfmode == V16QImode);
15144 if (src1 == dest && src2 == dest + halfregs)
15146 /* No-op move. Can't split to nothing; emit something. */
15147 emit_note (NOTE_INSN_DELETED);
15148 return;
15151 /* Preserve register attributes for variable tracking. */
15152 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15153 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15154 GET_MODE_SIZE (halfmode));
15156 /* Special case of reversed high/low parts. */
15157 if (reg_overlap_mentioned_p (operands[2], destlo)
15158 && reg_overlap_mentioned_p (operands[1], desthi))
15160 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15161 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15162 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15164 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15166 /* Try to avoid unnecessary moves if part of the result
15167 is in the right place already. */
15168 if (src1 != dest)
15169 emit_move_insn (destlo, operands[1]);
15170 if (src2 != dest + halfregs)
15171 emit_move_insn (desthi, operands[2]);
15173 else
15175 if (src2 != dest + halfregs)
15176 emit_move_insn (desthi, operands[2]);
15177 if (src1 != dest)
15178 emit_move_insn (destlo, operands[1]);
15182 /* vec_perm support. */
15184 struct expand_vec_perm_d
15186 rtx target, op0, op1;
15187 vec_perm_indices perm;
15188 machine_mode vmode;
15189 unsigned int vec_flags;
15190 bool one_vector_p;
15191 bool testing_p;
15194 /* Generate a variable permutation. */
15196 static void
15197 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15199 machine_mode vmode = GET_MODE (target);
15200 bool one_vector_p = rtx_equal_p (op0, op1);
15202 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15203 gcc_checking_assert (GET_MODE (op0) == vmode);
15204 gcc_checking_assert (GET_MODE (op1) == vmode);
15205 gcc_checking_assert (GET_MODE (sel) == vmode);
15206 gcc_checking_assert (TARGET_SIMD);
15208 if (one_vector_p)
15210 if (vmode == V8QImode)
15212 /* Expand the argument to a V16QI mode by duplicating it. */
15213 rtx pair = gen_reg_rtx (V16QImode);
15214 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15215 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15217 else
15219 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15222 else
15224 rtx pair;
15226 if (vmode == V8QImode)
15228 pair = gen_reg_rtx (V16QImode);
15229 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15230 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15232 else
15234 pair = gen_reg_rtx (OImode);
15235 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15236 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15241 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15242 NELT is the number of elements in the vector. */
15244 void
15245 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15246 unsigned int nelt)
15248 machine_mode vmode = GET_MODE (target);
15249 bool one_vector_p = rtx_equal_p (op0, op1);
15250 rtx mask;
15252 /* The TBL instruction does not use a modulo index, so we must take care
15253 of that ourselves. */
15254 mask = aarch64_simd_gen_const_vector_dup (vmode,
15255 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15256 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
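  /* For instance, in a one-vector V16QI permute each selector byte is
     ANDed with 15 here, so an out-of-range index such as 23 selects
     lane 7, which is the modulo behaviour vec_perm expects.  */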
15258 /* For big-endian, we also need to reverse the index within the vector
15259 (but not which vector). */
15260 if (BYTES_BIG_ENDIAN)
15262 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15263 if (!one_vector_p)
15264 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15265 sel = expand_simple_binop (vmode, XOR, sel, mask,
15266 NULL, 0, OPTAB_LIB_WIDEN);
15268 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15271 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15273 static void
15274 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15276 emit_insn (gen_rtx_SET (target,
15277 gen_rtx_UNSPEC (GET_MODE (target),
15278 gen_rtvec (2, op0, op1), code)));
15281 /* Expand an SVE vec_perm with the given operands. */
15283 void
15284 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15286 machine_mode data_mode = GET_MODE (target);
15287 machine_mode sel_mode = GET_MODE (sel);
15288 /* Enforced by the pattern condition. */
15289 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15291 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15292 size of the two value vectors, i.e. the upper bits of the indices
15293 are effectively ignored. SVE TBL instead produces 0 for any
15294 out-of-range indices, so we need to modulo all the vec_perm indices
15295 to ensure they are all in range. */
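/* As a worked example, assume a fixed-length permute with four elements
   per vector and a selector index of 6 (element 2 of the second vector):
   the first TBL sees index 6, which is out of range and yields 0, while
   the second TBL sees 6 - 4 == 2 and picks the right element, so the
   final IOR produces the correct result.  */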
15296 rtx sel_reg = force_reg (sel_mode, sel);
15298 /* Check if the sel only references the first values vector. */
15299 if (GET_CODE (sel) == CONST_VECTOR
15300 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15302 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15303 return;
15306 /* Check if the two values vectors are the same. */
15307 if (rtx_equal_p (op0, op1))
15309 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15310 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15311 NULL, 0, OPTAB_DIRECT);
15312 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15313 return;
15316 /* Run TBL for each value vector and combine the results.  */
15318 rtx res0 = gen_reg_rtx (data_mode);
15319 rtx res1 = gen_reg_rtx (data_mode);
15320 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15321 if (GET_CODE (sel) != CONST_VECTOR
15322 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15324 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15325 2 * nunits - 1);
15326 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15327 NULL, 0, OPTAB_DIRECT);
15329 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15330 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15331 NULL, 0, OPTAB_DIRECT);
15332 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15333 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15334 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15335 else
15336 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15339 /* Recognize patterns suitable for the TRN instructions. */
15340 static bool
15341 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15343 HOST_WIDE_INT odd;
15344 poly_uint64 nelt = d->perm.length ();
15345 rtx out, in0, in1, x;
15346 machine_mode vmode = d->vmode;
15348 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15349 return false;
15351 /* Note that these are little-endian tests.
15352 We correct for big-endian later. */
15353 if (!d->perm[0].is_constant (&odd)
15354 || (odd != 0 && odd != 1)
15355 || !d->perm.series_p (0, 2, odd, 2)
15356 || !d->perm.series_p (1, 2, nelt + odd, 2))
15357 return false;
15359 /* Success! */
15360 if (d->testing_p)
15361 return true;
15363 in0 = d->op0;
15364 in1 = d->op1;
15365 /* We don't need a big-endian lane correction for SVE; see the comment
15366 at the head of aarch64-sve.md for details. */
15367 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15369 x = in0, in0 = in1, in1 = x;
15370 odd = !odd;
15372 out = d->target;
15374 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15375 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15376 return true;
15379 /* Recognize patterns suitable for the UZP instructions. */
15380 static bool
15381 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15383 HOST_WIDE_INT odd;
15384 rtx out, in0, in1, x;
15385 machine_mode vmode = d->vmode;
15387 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15388 return false;
15390 /* Note that these are little-endian tests.
15391 We correct for big-endian later. */
15392 if (!d->perm[0].is_constant (&odd)
15393 || (odd != 0 && odd != 1)
15394 || !d->perm.series_p (0, 1, odd, 2))
15395 return false;
15397 /* Success! */
15398 if (d->testing_p)
15399 return true;
15401 in0 = d->op0;
15402 in1 = d->op1;
15403 /* We don't need a big-endian lane correction for SVE; see the comment
15404 at the head of aarch64-sve.md for details. */
15405 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15407 x = in0, in0 = in1, in1 = x;
15408 odd = !odd;
15410 out = d->target;
15412 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15413 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15414 return true;
15417 /* Recognize patterns suitable for the ZIP instructions. */
15418 static bool
15419 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15421 unsigned int high;
15422 poly_uint64 nelt = d->perm.length ();
15423 rtx out, in0, in1, x;
15424 machine_mode vmode = d->vmode;
15426 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15427 return false;
15429 /* Note that these are little-endian tests.
15430 We correct for big-endian later. */
15431 poly_uint64 first = d->perm[0];
15432 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15433 || !d->perm.series_p (0, 2, first, 1)
15434 || !d->perm.series_p (1, 2, first + nelt, 1))
15435 return false;
15436 high = maybe_ne (first, 0U);
15438 /* Success! */
15439 if (d->testing_p)
15440 return true;
15442 in0 = d->op0;
15443 in1 = d->op1;
15444 /* We don't need a big-endian lane correction for SVE; see the comment
15445 at the head of aarch64-sve.md for details. */
15446 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15448 x = in0, in0 = in1, in1 = x;
15449 high = !high;
15451 out = d->target;
15453 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15454 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15455 return true;
15458 /* Recognize patterns for the EXT insn. */
15460 static bool
15461 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15463 HOST_WIDE_INT location;
15464 rtx offset;
15466 /* The first element always refers to the first vector.
15467 Check if the extracted indices are increasing by one. */
15468 if (d->vec_flags == VEC_SVE_PRED
15469 || !d->perm[0].is_constant (&location)
15470 || !d->perm.series_p (0, 1, location, 1))
15471 return false;
15473 /* Success! */
15474 if (d->testing_p)
15475 return true;
15477 /* The case where (location == 0) is a no-op for both big- and little-endian,
15478 and is removed by the mid-end at optimization levels -O1 and higher.
15480 We don't need a big-endian lane correction for SVE; see the comment
15481 at the head of aarch64-sve.md for details. */
15482 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15484 /* After setup, we want the high elements of the first vector (stored
15485 at the LSB end of the register), and the low elements of the second
15486 vector (stored at the MSB end of the register). So swap. */
15487 std::swap (d->op0, d->op1);
15488 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15489 to_constant () is safe since this is restricted to Advanced SIMD
15490 vectors. */
15491 location = d->perm.length ().to_constant () - location;
15494 offset = GEN_INT (location);
15495 emit_set_insn (d->target,
15496 gen_rtx_UNSPEC (d->vmode,
15497 gen_rtvec (3, d->op0, d->op1, offset),
15498 UNSPEC_EXT));
15499 return true;
15502 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15503 within each 64-bit, 32-bit or 16-bit granule. */
15505 static bool
15506 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15508 HOST_WIDE_INT diff;
15509 unsigned int i, size, unspec;
15510 machine_mode pred_mode;
15512 if (d->vec_flags == VEC_SVE_PRED
15513 || !d->one_vector_p
15514 || !d->perm[0].is_constant (&diff))
15515 return false;
15517 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15518 if (size == 8)
15520 unspec = UNSPEC_REV64;
15521 pred_mode = VNx2BImode;
15523 else if (size == 4)
15525 unspec = UNSPEC_REV32;
15526 pred_mode = VNx4BImode;
15528 else if (size == 2)
15530 unspec = UNSPEC_REV16;
15531 pred_mode = VNx8BImode;
15533 else
15534 return false;
15536 unsigned int step = diff + 1;
15537 for (i = 0; i < step; ++i)
15538 if (!d->perm.series_p (i, step, diff - i, step))
15539 return false;
15541 /* Success! */
15542 if (d->testing_p)
15543 return true;
15545 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15546 if (d->vec_flags == VEC_SVE_DATA)
15548 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15549 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15550 UNSPEC_MERGE_PTRUE);
15552 emit_set_insn (d->target, src);
15553 return true;
15556 /* Recognize patterns for the REV insn, which reverses elements within
15557 a full vector. */
15559 static bool
15560 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15562 poly_uint64 nelt = d->perm.length ();
15564 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15565 return false;
15567 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15568 return false;
15570 /* Success! */
15571 if (d->testing_p)
15572 return true;
15574 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15575 emit_set_insn (d->target, src);
15576 return true;
15579 static bool
15580 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15582 rtx out = d->target;
15583 rtx in0;
15584 HOST_WIDE_INT elt;
15585 machine_mode vmode = d->vmode;
15586 rtx lane;
15588 if (d->vec_flags == VEC_SVE_PRED
15589 || d->perm.encoding ().encoded_nelts () != 1
15590 || !d->perm[0].is_constant (&elt))
15591 return false;
15593 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15594 return false;
15596 /* Success! */
15597 if (d->testing_p)
15598 return true;
15600 /* The generic preparation in aarch64_expand_vec_perm_const_1
15601 swaps the operand order and the permute indices if it finds
15602 d->perm[0] to be in the second operand. Thus, we can always
15603 use d->op0 and need not do any extra arithmetic to get the
15604 correct lane number. */
15605 in0 = d->op0;
15606 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15608 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15609 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15610 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15611 return true;
15614 static bool
15615 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15617 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15618 machine_mode vmode = d->vmode;
15620 /* Make sure that the indices are constant. */
15621 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15622 for (unsigned int i = 0; i < encoded_nelts; ++i)
15623 if (!d->perm[i].is_constant ())
15624 return false;
15626 if (d->testing_p)
15627 return true;
15629   /* Generic code will try constant permutation twice: once with the
15630      original mode and again with the elements lowered to QImode.
15631      So wait and don't do the selector expansion ourselves.  */
15632 if (vmode != V8QImode && vmode != V16QImode)
15633 return false;
15635 /* to_constant is safe since this routine is specific to Advanced SIMD
15636 vectors. */
15637 unsigned int nelt = d->perm.length ().to_constant ();
15638 for (unsigned int i = 0; i < nelt; ++i)
15639 /* If big-endian and two vectors we end up with a weird mixed-endian
15640 mode on NEON. Reverse the index within each word but not the word
15641 itself. to_constant is safe because we checked is_constant above. */
15642 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15643 ? d->perm[i].to_constant () ^ (nelt - 1)
15644 : d->perm[i].to_constant ());
15646 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15647 sel = force_reg (vmode, sel);
15649 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15650 return true;
15653 /* Try to implement D using an SVE TBL instruction. */
15655 static bool
15656 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15658 unsigned HOST_WIDE_INT nelt;
15660 /* Permuting two variable-length vectors could overflow the
15661 index range. */
15662 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15663 return false;
15665 if (d->testing_p)
15666 return true;
15668 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15669 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15670 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15671 return true;
15674 static bool
15675 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15677 /* The pattern matching functions above are written to look for a small
15678 number to begin the sequence (0, 1, N/2). If we begin with an index
15679 from the second operand, we can swap the operands. */
15680 poly_int64 nelt = d->perm.length ();
15681 if (known_ge (d->perm[0], nelt))
15683 d->perm.rotate_inputs (1);
15684 std::swap (d->op0, d->op1);
15687 if ((d->vec_flags == VEC_ADVSIMD
15688 || d->vec_flags == VEC_SVE_DATA
15689 || d->vec_flags == VEC_SVE_PRED)
15690 && known_gt (nelt, 1))
15692 if (aarch64_evpc_rev_local (d))
15693 return true;
15694 else if (aarch64_evpc_rev_global (d))
15695 return true;
15696 else if (aarch64_evpc_ext (d))
15697 return true;
15698 else if (aarch64_evpc_dup (d))
15699 return true;
15700 else if (aarch64_evpc_zip (d))
15701 return true;
15702 else if (aarch64_evpc_uzp (d))
15703 return true;
15704 else if (aarch64_evpc_trn (d))
15705 return true;
15706 if (d->vec_flags == VEC_SVE_DATA)
15707 return aarch64_evpc_sve_tbl (d);
15708   else if (d->vec_flags == VEC_ADVSIMD)
15709 return aarch64_evpc_tbl (d);
15711 return false;
15714 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15716 static bool
15717 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15718 rtx op1, const vec_perm_indices &sel)
15720 struct expand_vec_perm_d d;
15722 /* Check whether the mask can be applied to a single vector. */
15723 if (op0 && rtx_equal_p (op0, op1))
15724 d.one_vector_p = true;
15725 else if (sel.all_from_input_p (0))
15727 d.one_vector_p = true;
15728 op1 = op0;
15730 else if (sel.all_from_input_p (1))
15732 d.one_vector_p = true;
15733 op0 = op1;
15735 else
15736 d.one_vector_p = false;
15738 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15739 sel.nelts_per_input ());
15740 d.vmode = vmode;
15741 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15742 d.target = target;
15743 d.op0 = op0;
15744 d.op1 = op1;
15745 d.testing_p = !target;
15747 if (!d.testing_p)
15748 return aarch64_expand_vec_perm_const_1 (&d);
15750 rtx_insn *last = get_last_insn ();
15751 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15752 gcc_assert (last == get_last_insn ());
15754 return ret;
15757 /* Generate a byte permute mask for a register of mode MODE,
15758 which has NUNITS units. */
15761 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15763   /* We have to reverse each vector because we don't have
15764 a permuted load that can reverse-load according to ABI rules. */
15765 rtx mask;
15766 rtvec v = rtvec_alloc (16);
15767 unsigned int i, j;
15768 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15770 gcc_assert (BYTES_BIG_ENDIAN);
15771 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15773 for (i = 0; i < nunits; i++)
15774 for (j = 0; j < usize; j++)
15775 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15776 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15777 return force_reg (V16QImode, mask);
15780 /* Return true if X is a valid second operand for the SVE instruction
15781 that implements integer comparison OP_CODE. */
15783 static bool
15784 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15786 if (register_operand (x, VOIDmode))
15787 return true;
15789 switch (op_code)
15791 case LTU:
15792 case LEU:
15793 case GEU:
15794 case GTU:
15795 return aarch64_sve_cmp_immediate_p (x, false);
15796 case LT:
15797 case LE:
15798 case GE:
15799 case GT:
15800 case NE:
15801 case EQ:
15802 return aarch64_sve_cmp_immediate_p (x, true);
15803 default:
15804 gcc_unreachable ();
15808 /* Use predicated SVE instructions to implement the equivalent of:
15810 (set TARGET OP)
15812 given that PTRUE is an all-true predicate of the appropriate mode. */
15814 static void
15815 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15817 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15818 gen_rtvec (2, ptrue, op),
15819 UNSPEC_MERGE_PTRUE);
15820 rtx_insn *insn = emit_set_insn (target, unspec);
15821 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15824 /* Likewise, but also clobber the condition codes. */
15826 static void
15827 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15829 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15830 gen_rtvec (2, ptrue, op),
15831 UNSPEC_MERGE_PTRUE);
15832 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15833 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15836 /* Return the UNSPEC_COND_* code for comparison CODE. */
15838 static unsigned int
15839 aarch64_unspec_cond_code (rtx_code code)
15841 switch (code)
15843 case NE:
15844 return UNSPEC_COND_NE;
15845 case EQ:
15846 return UNSPEC_COND_EQ;
15847 case LT:
15848 return UNSPEC_COND_LT;
15849 case GT:
15850 return UNSPEC_COND_GT;
15851 case LE:
15852 return UNSPEC_COND_LE;
15853 case GE:
15854 return UNSPEC_COND_GE;
15855 default:
15856 gcc_unreachable ();
15860 /* Emit:
15862 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15864 where <X> is the operation associated with comparison CODE. This form
15865 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15866 semantics, such as when PRED might not be all-true and when comparing
15867 inactive lanes could have side effects. */
15869 static void
15870 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15871 rtx pred, rtx op0, rtx op1)
15873 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15874 gen_rtvec (3, pred, op0, op1),
15875 aarch64_unspec_cond_code (code));
15876 emit_set_insn (target, unspec);
15879 /* Expand an SVE integer comparison using the SVE equivalent of:
15881 (set TARGET (CODE OP0 OP1)). */
15883 void
15884 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15886 machine_mode pred_mode = GET_MODE (target);
15887 machine_mode data_mode = GET_MODE (op0);
15889 if (!aarch64_sve_cmp_operand_p (code, op1))
15890 op1 = force_reg (data_mode, op1);
15892 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15893 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15894 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15897 /* Emit the SVE equivalent of:
15899 (set TMP1 (CODE1 OP0 OP1))
15900 (set TMP2 (CODE2 OP0 OP1))
15901 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15903 PTRUE is an all-true predicate with the same mode as TARGET. */
15905 static void
15906 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15907 rtx ptrue, rtx op0, rtx op1)
15909 machine_mode pred_mode = GET_MODE (ptrue);
15910 rtx tmp1 = gen_reg_rtx (pred_mode);
15911 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15912 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15913 rtx tmp2 = gen_reg_rtx (pred_mode);
15914 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15915 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15916 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15919 /* Emit the SVE equivalent of:
15921 (set TMP (CODE OP0 OP1))
15922 (set TARGET (not TMP))
15924 PTRUE is an all-true predicate with the same mode as TARGET. */
15926 static void
15927 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15928 rtx op0, rtx op1)
15930 machine_mode pred_mode = GET_MODE (ptrue);
15931 rtx tmp = gen_reg_rtx (pred_mode);
15932 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15933 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15934 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15937 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15939 (set TARGET (CODE OP0 OP1))
15941 If CAN_INVERT_P is true, the caller can also handle inverted results;
15942 return true if the result is in fact inverted. */
15944 bool
15945 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15946 rtx op0, rtx op1, bool can_invert_p)
15948 machine_mode pred_mode = GET_MODE (target);
15949 machine_mode data_mode = GET_MODE (op0);
15951 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15952 switch (code)
15954 case UNORDERED:
15955 /* UNORDERED has no immediate form. */
15956 op1 = force_reg (data_mode, op1);
15957 /* fall through */
15958 case LT:
15959 case LE:
15960 case GT:
15961 case GE:
15962 case EQ:
15963 case NE:
15965 /* There is native support for the comparison. */
15966 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15967 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15968 return false;
15971 case LTGT:
15972 /* This is a trapping operation (LT or GT). */
15973 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15974 return false;
15976 case UNEQ:
15977 if (!flag_trapping_math)
15979 /* This would trap for signaling NaNs. */
15980 op1 = force_reg (data_mode, op1);
15981 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15982 return false;
15984 /* fall through */
15985 case UNLT:
15986 case UNLE:
15987 case UNGT:
15988 case UNGE:
15989 if (flag_trapping_math)
15991 /* Work out which elements are ordered. */
15992 rtx ordered = gen_reg_rtx (pred_mode);
15993 op1 = force_reg (data_mode, op1);
15994 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15996 /* Test the opposite condition for the ordered elements,
15997 then invert the result. */
15998 if (code == UNEQ)
15999 code = NE;
16000 else
16001 code = reverse_condition_maybe_unordered (code);
16002 if (can_invert_p)
16004 aarch64_emit_sve_predicated_cond (target, code,
16005 ordered, op0, op1);
16006 return true;
16008 rtx tmp = gen_reg_rtx (pred_mode);
16009 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16010 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16011 return false;
16013 break;
16015 case ORDERED:
16016 /* ORDERED has no immediate form. */
16017 op1 = force_reg (data_mode, op1);
16018 break;
16020 default:
16021 gcc_unreachable ();
16024 /* There is native support for the inverse comparison. */
16025 code = reverse_condition_maybe_unordered (code);
16026 if (can_invert_p)
16028 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16029 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16030 return true;
16032 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16033 return false;
16036 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16037 of the data being selected and CMP_MODE is the mode of the values being
16038 compared. */
16040 void
16041 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16042 rtx *ops)
16044 machine_mode pred_mode
16045 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16046 GET_MODE_SIZE (cmp_mode)).require ();
16047 rtx pred = gen_reg_rtx (pred_mode);
16048 if (FLOAT_MODE_P (cmp_mode))
16050 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16051 ops[4], ops[5], true))
16052 std::swap (ops[1], ops[2]);
16054 else
16055 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16057 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16058 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16061 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16062    true.  However, due to issues with register allocation it is preferable
16063    to avoid tying integer scalar and FP scalar modes.  Executing integer
16064 operations in general registers is better than treating them as scalar
16065 vector operations. This reduces latency and avoids redundant int<->FP
16066 moves. So tie modes if they are either the same class, or vector modes
16067 with other vector modes, vector structs or any scalar mode. */
16069 static bool
16070 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16072 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16073 return true;
16075 /* We specifically want to allow elements of "structure" modes to
16076 be tieable to the structure. This more general condition allows
16077 other rarer situations too. The reason we don't extend this to
16078 predicate modes is that there are no predicate structure modes
16079 nor any specific instructions for extracting part of a predicate
16080 register. */
16081 if (aarch64_vector_data_mode_p (mode1)
16082 && aarch64_vector_data_mode_p (mode2))
16083 return true;
16085 /* Also allow any scalar modes with vectors. */
16086 if (aarch64_vector_mode_supported_p (mode1)
16087 || aarch64_vector_mode_supported_p (mode2))
16088 return true;
16090 return false;
16093 /* Return a new RTX holding the result of moving POINTER forward by
16094 AMOUNT bytes. */
16096 static rtx
16097 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16099 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16101 return adjust_automodify_address (pointer, GET_MODE (pointer),
16102 next, amount);
16105 /* Return a new RTX holding the result of moving POINTER forward by the
16106 size of the mode it points to. */
16108 static rtx
16109 aarch64_progress_pointer (rtx pointer)
16111 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16114 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16115 MODE bytes. */
16117 static void
16118 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16119 machine_mode mode)
16121 rtx reg = gen_reg_rtx (mode);
16123 /* "Cast" the pointers to the correct mode. */
16124 *src = adjust_address (*src, mode, 0);
16125 *dst = adjust_address (*dst, mode, 0);
16126 /* Emit the memcpy. */
16127 emit_move_insn (reg, *src);
16128 emit_move_insn (*dst, reg);
16129 /* Move the pointers forward. */
16130 *src = aarch64_progress_pointer (*src);
16131 *dst = aarch64_progress_pointer (*dst);
16134 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16135 we succeed, otherwise return false. */
16137 bool
16138 aarch64_expand_movmem (rtx *operands)
16140 unsigned int n;
16141 rtx dst = operands[0];
16142 rtx src = operands[1];
16143 rtx base;
16144 bool speed_p = !optimize_function_for_size_p (cfun);
16146 /* When optimizing for size, give a better estimate of the length of a
16147 memcpy call, but use the default otherwise. */
16148 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16150 /* We can't do anything smart if the amount to copy is not constant. */
16151 if (!CONST_INT_P (operands[2]))
16152 return false;
16154 n = UINTVAL (operands[2]);
16156 /* Try to keep the number of instructions low. For cases below 16 bytes we
16157 need to make at most two moves. For cases above 16 bytes it will be one
16158 move for each 16 byte chunk, then at most two additional moves. */
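  /* For instance, a 37-byte copy when optimizing for speed is estimated at
     37 / 16 == 2 moves for the 16-byte chunks plus at most 2 for the 5-byte
     tail; 4 <= max_instructions (15 / 2 == 7), so the copy is expanded
     inline.  The numbers here are purely illustrative.  */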
16159 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16160 return false;
16162 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16163 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16165 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16166 src = adjust_automodify_address (src, VOIDmode, base, 0);
16168   /* Simple cases.  Copy 0-3 bytes: first (if applicable) a 2-byte chunk,
16169      then a 1-byte chunk.  */
16170 if (n < 4)
16172 if (n >= 2)
16174 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16175 n -= 2;
16178 if (n == 1)
16179 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16181 return true;
16184 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16185 4-byte chunk, partially overlapping with the previously copied chunk. */
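  /* For example, for n == 7 the first chunk covers bytes 0-3; n drops to 3,
     so move == 3 - 4 == -1 steps both pointers back by one byte and the
     second chunk covers bytes 3-6.  Byte 3 is written twice, but nothing
     beyond the requested 7 bytes is touched.  */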
16186 if (n < 8)
16188 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16189 n -= 4;
16190 if (n > 0)
16192 int move = n - 4;
16194 src = aarch64_move_pointer (src, move);
16195 dst = aarch64_move_pointer (dst, move);
16196 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16198 return true;
16201 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16202 them, then (if applicable) an 8-byte chunk. */
16203 while (n >= 8)
16205 if (n / 16)
16207 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16208 n -= 16;
16210 else
16212 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16213 n -= 8;
16217   /* Finish the final bytes of the copy.  We can always do this in one
16218      instruction.  We either copy the exact amount we need, or partially
16219      overlap with the previous chunk we copied and copy 4 or 8 bytes.  */
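  /* For example, if 5 bytes remain we back up by 3 (move == 5 - 8 == -3) and
     issue one 8-byte copy that overlaps the previously copied chunk, instead
     of separate 4-byte and 1-byte copies.  */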
16220 if (n == 0)
16221 return true;
16222 else if (n == 1)
16223 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16224 else if (n == 2)
16225 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16226 else if (n == 4)
16227 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16228 else
16230 if (n == 3)
16232 src = aarch64_move_pointer (src, -1);
16233 dst = aarch64_move_pointer (dst, -1);
16234 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16236 else
16238 int move = n - 8;
16240 src = aarch64_move_pointer (src, move);
16241 dst = aarch64_move_pointer (dst, move);
16242 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16246 return true;
16249 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16250 SImode stores. Handle the case when the constant has identical
16251 bottom and top halves. This is beneficial when the two stores can be
16252 merged into an STP and we avoid synthesising potentially expensive
16253 immediates twice. Return true if such a split is possible. */
16255 bool
16256 aarch64_split_dimode_const_store (rtx dst, rtx src)
16258 rtx lo = gen_lowpart (SImode, src);
16259 rtx hi = gen_highpart_mode (SImode, DImode, src);
16261 bool size_p = optimize_function_for_size_p (cfun);
16263 if (!rtx_equal_p (lo, hi))
16264 return false;
16266 unsigned int orig_cost
16267 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16268 unsigned int lo_cost
16269 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16271 /* We want to transform:
16272 MOV x1, 49370
16273 MOVK x1, 0x140, lsl 16
16274 MOVK x1, 0xc0da, lsl 32
16275 MOVK x1, 0x140, lsl 48
16276 STR x1, [x0]
16277 into:
16278 MOV w1, 49370
16279 MOVK w1, 0x140, lsl 16
16280 STP w1, w1, [x0]
16281 So we want to perform this only when we save two instructions
16282 or more. When optimizing for size, however, accept any code size
16283 savings we can. */
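  /* In the sequence shown above, orig_cost == 4 (four MOV/MOVK instructions
     for the DImode immediate) and lo_cost == 2, so 4 > 2 + 1 and the split
     is performed; the instruction counts are illustrative of the typical
     case this check is aimed at.  */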
16284 if (size_p && orig_cost <= lo_cost)
16285 return false;
16287 if (!size_p
16288 && (orig_cost <= lo_cost + 1))
16289 return false;
16291 rtx mem_lo = adjust_address (dst, SImode, 0);
16292 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16293 return false;
16295 rtx tmp_reg = gen_reg_rtx (SImode);
16296 aarch64_expand_mov_immediate (tmp_reg, lo);
16297 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16298   /* Don't emit an explicit store pair as this may not always be profitable.
16299 Let the sched-fusion logic decide whether to merge them. */
16300 emit_move_insn (mem_lo, tmp_reg);
16301 emit_move_insn (mem_hi, tmp_reg);
16303 return true;
16306 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16308 static unsigned HOST_WIDE_INT
16309 aarch64_asan_shadow_offset (void)
16311 return (HOST_WIDE_INT_1 << 36);
16314 static rtx
16315 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16316 int code, tree treeop0, tree treeop1)
16318 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16319 rtx op0, op1;
16320 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16321 insn_code icode;
16322 struct expand_operand ops[4];
16324 start_sequence ();
16325 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16327 op_mode = GET_MODE (op0);
16328 if (op_mode == VOIDmode)
16329 op_mode = GET_MODE (op1);
16331 switch (op_mode)
16333 case E_QImode:
16334 case E_HImode:
16335 case E_SImode:
16336 cmp_mode = SImode;
16337 icode = CODE_FOR_cmpsi;
16338 break;
16340 case E_DImode:
16341 cmp_mode = DImode;
16342 icode = CODE_FOR_cmpdi;
16343 break;
16345 case E_SFmode:
16346 cmp_mode = SFmode;
16347 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16348 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16349 break;
16351 case E_DFmode:
16352 cmp_mode = DFmode;
16353 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16354 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16355 break;
16357 default:
16358 end_sequence ();
16359 return NULL_RTX;
16362 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16363 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16364 if (!op0 || !op1)
16366 end_sequence ();
16367 return NULL_RTX;
16369 *prep_seq = get_insns ();
16370 end_sequence ();
16372 create_fixed_operand (&ops[0], op0);
16373 create_fixed_operand (&ops[1], op1);
16375 start_sequence ();
16376 if (!maybe_expand_insn (icode, 2, ops))
16378 end_sequence ();
16379 return NULL_RTX;
16381 *gen_seq = get_insns ();
16382 end_sequence ();
16384 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16385 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16388 static rtx
16389 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16390 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16392 rtx op0, op1, target;
16393 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16394 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16395 insn_code icode;
16396 struct expand_operand ops[6];
16397 int aarch64_cond;
16399 push_to_sequence (*prep_seq);
16400 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16402 op_mode = GET_MODE (op0);
16403 if (op_mode == VOIDmode)
16404 op_mode = GET_MODE (op1);
16406 switch (op_mode)
16408 case E_QImode:
16409 case E_HImode:
16410 case E_SImode:
16411 cmp_mode = SImode;
16412 icode = CODE_FOR_ccmpsi;
16413 break;
16415 case E_DImode:
16416 cmp_mode = DImode;
16417 icode = CODE_FOR_ccmpdi;
16418 break;
16420 case E_SFmode:
16421 cmp_mode = SFmode;
16422 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16423 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16424 break;
16426 case E_DFmode:
16427 cmp_mode = DFmode;
16428 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16429 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16430 break;
16432 default:
16433 end_sequence ();
16434 return NULL_RTX;
16437 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16438 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16439 if (!op0 || !op1)
16441 end_sequence ();
16442 return NULL_RTX;
16444 *prep_seq = get_insns ();
16445 end_sequence ();
16447 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16448 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16450 if (bit_code != AND)
16452 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16453 GET_MODE (XEXP (prev, 0))),
16454 VOIDmode, XEXP (prev, 0), const0_rtx);
16455 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16458 create_fixed_operand (&ops[0], XEXP (prev, 0));
16459 create_fixed_operand (&ops[1], target);
16460 create_fixed_operand (&ops[2], op0);
16461 create_fixed_operand (&ops[3], op1);
16462 create_fixed_operand (&ops[4], prev);
16463 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16465 push_to_sequence (*gen_seq);
16466 if (!maybe_expand_insn (icode, 6, ops))
16468 end_sequence ();
16469 return NULL_RTX;
16472 *gen_seq = get_insns ();
16473 end_sequence ();
16475 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16478 #undef TARGET_GEN_CCMP_FIRST
16479 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16481 #undef TARGET_GEN_CCMP_NEXT
16482 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16484 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16485 instruction fusion of some sort. */
16487 static bool
16488 aarch64_macro_fusion_p (void)
16490 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16494 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16495 should be kept together during scheduling. */
16497 static bool
16498 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16500 rtx set_dest;
16501 rtx prev_set = single_set (prev);
16502 rtx curr_set = single_set (curr);
16503 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16504 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16506 if (!aarch64_macro_fusion_p ())
16507 return false;
16509 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16511 /* We are trying to match:
16512 prev (mov) == (set (reg r0) (const_int imm16))
16513 curr (movk) == (set (zero_extract (reg r0)
16514 (const_int 16)
16515 (const_int 16))
16516 (const_int imm16_1)) */
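      /* In assembly terms this is a pair such as:
	   mov  x0, 0x1234
	   movk x0, 0x5678, lsl 16
	 keeping the two adjacent lets cores that fuse MOV/MOVK benefit.  */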
16518 set_dest = SET_DEST (curr_set);
16520 if (GET_CODE (set_dest) == ZERO_EXTRACT
16521 && CONST_INT_P (SET_SRC (curr_set))
16522 && CONST_INT_P (SET_SRC (prev_set))
16523 && CONST_INT_P (XEXP (set_dest, 2))
16524 && INTVAL (XEXP (set_dest, 2)) == 16
16525 && REG_P (XEXP (set_dest, 0))
16526 && REG_P (SET_DEST (prev_set))
16527 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16529 return true;
16533 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16536 /* We're trying to match:
16537 prev (adrp) == (set (reg r1)
16538 (high (symbol_ref ("SYM"))))
16539 curr (add) == (set (reg r0)
16540 (lo_sum (reg r1)
16541 (symbol_ref ("SYM"))))
16542 Note that r0 need not necessarily be the same as r1, especially
16543 during pre-regalloc scheduling. */
16545 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16546 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16548 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16549 && REG_P (XEXP (SET_SRC (curr_set), 0))
16550 && REGNO (XEXP (SET_SRC (curr_set), 0))
16551 == REGNO (SET_DEST (prev_set))
16552 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16553 XEXP (SET_SRC (curr_set), 1)))
16554 return true;
16558 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16561 /* We're trying to match:
16562 prev (movk) == (set (zero_extract (reg r0)
16563 (const_int 16)
16564 (const_int 32))
16565 (const_int imm16_1))
16566 curr (movk) == (set (zero_extract (reg r0)
16567 (const_int 16)
16568 (const_int 48))
16569 (const_int imm16_2)) */
16571 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16572 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16573 && REG_P (XEXP (SET_DEST (prev_set), 0))
16574 && REG_P (XEXP (SET_DEST (curr_set), 0))
16575 && REGNO (XEXP (SET_DEST (prev_set), 0))
16576 == REGNO (XEXP (SET_DEST (curr_set), 0))
16577 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16578 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16579 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16580 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16581 && CONST_INT_P (SET_SRC (prev_set))
16582 && CONST_INT_P (SET_SRC (curr_set)))
16583 return true;
16586 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16588 /* We're trying to match:
16589 prev (adrp) == (set (reg r0)
16590 (high (symbol_ref ("SYM"))))
16591 curr (ldr) == (set (reg r1)
16592 (mem (lo_sum (reg r0)
16593 (symbol_ref ("SYM")))))
16595 curr (ldr) == (set (reg r1)
16596 (zero_extend (mem
16597 (lo_sum (reg r0)
16598 (symbol_ref ("SYM")))))) */
16599 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16600 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16602 rtx curr_src = SET_SRC (curr_set);
16604 if (GET_CODE (curr_src) == ZERO_EXTEND)
16605 curr_src = XEXP (curr_src, 0);
16607 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16608 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16609 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16610 == REGNO (SET_DEST (prev_set))
16611 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16612 XEXP (SET_SRC (prev_set), 0)))
16613 return true;
16617 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16618 && aarch_crypto_can_dual_issue (prev, curr))
16619 return true;
16621 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16622 && any_condjump_p (curr))
16624 enum attr_type prev_type = get_attr_type (prev);
16626 unsigned int condreg1, condreg2;
16627 rtx cc_reg_1;
16628 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16629 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16631 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16632 && prev
16633 && modified_in_p (cc_reg_1, prev))
16635       /* FIXME: this misses some cases that are considered simple arithmetic
16636          instructions for ThunderX.  Simple shifts are missed here.  */
16637 if (prev_type == TYPE_ALUS_SREG
16638 || prev_type == TYPE_ALUS_IMM
16639 || prev_type == TYPE_LOGICS_REG
16640 || prev_type == TYPE_LOGICS_IMM)
16641 return true;
16645 if (prev_set
16646 && curr_set
16647 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16648 && any_condjump_p (curr))
16650 /* We're trying to match:
16651 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16652 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16653 (const_int 0))
16654 (label_ref ("SYM"))
16655 (pc)) */
16656 if (SET_DEST (curr_set) == (pc_rtx)
16657 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16658 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16659 && REG_P (SET_DEST (prev_set))
16660 && REGNO (SET_DEST (prev_set))
16661 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16663 /* Fuse ALU operations followed by conditional branch instruction. */
16664 switch (get_attr_type (prev))
16666 case TYPE_ALU_IMM:
16667 case TYPE_ALU_SREG:
16668 case TYPE_ADC_REG:
16669 case TYPE_ADC_IMM:
16670 case TYPE_ADCS_REG:
16671 case TYPE_ADCS_IMM:
16672 case TYPE_LOGIC_REG:
16673 case TYPE_LOGIC_IMM:
16674 case TYPE_CSEL:
16675 case TYPE_ADR:
16676 case TYPE_MOV_IMM:
16677 case TYPE_SHIFT_REG:
16678 case TYPE_SHIFT_IMM:
16679 case TYPE_BFM:
16680 case TYPE_RBIT:
16681 case TYPE_REV:
16682 case TYPE_EXTEND:
16683 return true;
16685 default:;
16690 return false;
16693 /* Return true iff the instruction fusion described by OP is enabled. */
16695 bool
16696 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16698 return (aarch64_tune_params.fusible_ops & op) != 0;
16701 /* If MEM is in the form of [base+offset], extract the two parts
16702    of the address and store them in BASE and OFFSET; otherwise return false
16703 after clearing BASE and OFFSET. */
16705 bool
16706 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16708 rtx addr;
16710 gcc_assert (MEM_P (mem));
16712 addr = XEXP (mem, 0);
16714 if (REG_P (addr))
16716 *base = addr;
16717 *offset = const0_rtx;
16718 return true;
16721 if (GET_CODE (addr) == PLUS
16722 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16724 *base = XEXP (addr, 0);
16725 *offset = XEXP (addr, 1);
16726 return true;
16729 *base = NULL_RTX;
16730 *offset = NULL_RTX;
16732 return false;
16735 /* Types for scheduling fusion. */
16736 enum sched_fusion_type
16738 SCHED_FUSION_NONE = 0,
16739 SCHED_FUSION_LD_SIGN_EXTEND,
16740 SCHED_FUSION_LD_ZERO_EXTEND,
16741 SCHED_FUSION_LD,
16742 SCHED_FUSION_ST,
16743 SCHED_FUSION_NUM
16746 /* If INSN is a load or store with an address in the form of [base+offset],
16747    extract the two parts and store them in BASE and OFFSET.  Return the
16748    scheduling fusion type of this INSN.  */
16750 static enum sched_fusion_type
16751 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16753 rtx x, dest, src;
16754 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16756 gcc_assert (INSN_P (insn));
16757 x = PATTERN (insn);
16758 if (GET_CODE (x) != SET)
16759 return SCHED_FUSION_NONE;
16761 src = SET_SRC (x);
16762 dest = SET_DEST (x);
16764 machine_mode dest_mode = GET_MODE (dest);
16766 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16767 return SCHED_FUSION_NONE;
16769 if (GET_CODE (src) == SIGN_EXTEND)
16771 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16772 src = XEXP (src, 0);
16773 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16774 return SCHED_FUSION_NONE;
16776 else if (GET_CODE (src) == ZERO_EXTEND)
16778 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16779 src = XEXP (src, 0);
16780 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16781 return SCHED_FUSION_NONE;
16784 if (GET_CODE (src) == MEM && REG_P (dest))
16785 extract_base_offset_in_addr (src, base, offset);
16786 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16788 fusion = SCHED_FUSION_ST;
16789 extract_base_offset_in_addr (dest, base, offset);
16791 else
16792 return SCHED_FUSION_NONE;
16794 if (*base == NULL_RTX || *offset == NULL_RTX)
16795 fusion = SCHED_FUSION_NONE;
16797 return fusion;
16800 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16802    Currently we only support fusing ldr and str instructions, so FUSION_PRI
16803    and PRI are only calculated for these instructions.  For other instructions,
16804    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16805    types of instruction fusion can be added by returning different priorities.
16807 It's important that irrelevant instructions get the largest FUSION_PRI. */
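/* For example, two loads from the same base register receive identical
   FUSION_PRI values (same fusion type, same base), while their PRI values
   differ by offset so that the insn with the smaller offset goes first,
   keeping candidate ldp/stp pairs adjacent.  */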
16809 static void
16810 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16811 int *fusion_pri, int *pri)
16813 int tmp, off_val;
16814 rtx base, offset;
16815 enum sched_fusion_type fusion;
16817 gcc_assert (INSN_P (insn));
16819 tmp = max_pri - 1;
16820 fusion = fusion_load_store (insn, &base, &offset);
16821 if (fusion == SCHED_FUSION_NONE)
16823 *pri = tmp;
16824 *fusion_pri = tmp;
16825 return;
16828 /* Set FUSION_PRI according to fusion type and base register. */
16829 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16831 /* Calculate PRI. */
16832 tmp /= 2;
16834 /* INSN with smaller offset goes first. */
16835 off_val = (int)(INTVAL (offset));
16836 if (off_val >= 0)
16837 tmp -= (off_val & 0xfffff);
16838 else
16839 tmp += ((- off_val) & 0xfffff);
16841 *pri = tmp;
16842 return;
16845 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16846 Adjust priority of sha1h instructions so they are scheduled before
16847 other SHA1 instructions. */
16849 static int
16850 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16852 rtx x = PATTERN (insn);
16854 if (GET_CODE (x) == SET)
16856 x = SET_SRC (x);
16858 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16859 return priority + 10;
16862 return priority;
16865 /* Given OPERANDS of consecutive load/store, check if we can merge
16866 them into ldp/stp. LOAD is true if they are load instructions.
16867 MODE is the mode of memory operands. */
16869 bool
16870 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16871 machine_mode mode)
16873 HOST_WIDE_INT offval_1, offval_2, msize;
16874 enum reg_class rclass_1, rclass_2;
16875 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16877 if (load)
16879 mem_1 = operands[1];
16880 mem_2 = operands[3];
16881 reg_1 = operands[0];
16882 reg_2 = operands[2];
16883 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16884 if (REGNO (reg_1) == REGNO (reg_2))
16885 return false;
16887 else
16889 mem_1 = operands[0];
16890 mem_2 = operands[2];
16891 reg_1 = operands[1];
16892 reg_2 = operands[3];
16895 /* The mems cannot be volatile. */
16896 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16897 return false;
16899 /* If we have SImode and slow unaligned ldp,
16900      check that the alignment is at least 8 bytes.  */
16901 if (mode == SImode
16902 && (aarch64_tune_params.extra_tuning_flags
16903 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16904 && !optimize_size
16905 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16906 return false;
16908 /* Check if the addresses are in the form of [base+offset]. */
16909 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16910 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16911 return false;
16912 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16913 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16914 return false;
16916   /* Check if the bases are the same.  */
16917 if (!rtx_equal_p (base_1, base_2))
16918 return false;
16920 /* The operands must be of the same size. */
16921 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16922 GET_MODE_SIZE (GET_MODE (mem_2))));
16924 offval_1 = INTVAL (offset_1);
16925 offval_2 = INTVAL (offset_2);
16926 /* We should only be trying this for fixed-sized modes. There is no
16927 SVE LDP/STP instruction. */
16928 msize = GET_MODE_SIZE (mode).to_constant ();
16929 /* Check if the offsets are consecutive. */
16930 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16931 return false;
16933 /* Check if the addresses are clobbered by load. */
16934 if (load)
16936 if (reg_mentioned_p (reg_1, mem_1))
16937 return false;
16939 /* In increasing order, the last load can clobber the address. */
16940 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16941 return false;
16944 /* One of the memory accesses must be a mempair operand.
16945 If it is not the first one, they need to be swapped by the
16946 peephole. */
16947 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16948 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16949 return false;
16951 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16952 rclass_1 = FP_REGS;
16953 else
16954 rclass_1 = GENERAL_REGS;
16956 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16957 rclass_2 = FP_REGS;
16958 else
16959 rclass_2 = GENERAL_REGS;
16961   /* Check if the registers are of the same class.  */
16962 if (rclass_1 != rclass_2)
16963 return false;
16965 return true;
16968 /* Given OPERANDS of consecutive load/store that can be merged,
16969 swap them if they are not in ascending order. */
16970 void
16971 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16973 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16974 HOST_WIDE_INT offval_1, offval_2;
16976 if (load)
16978 mem_1 = operands[1];
16979 mem_2 = operands[3];
16981 else
16983 mem_1 = operands[0];
16984 mem_2 = operands[2];
16987 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16988 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16990 offval_1 = INTVAL (offset_1);
16991 offval_2 = INTVAL (offset_2);
16993 if (offval_1 > offval_2)
16995 /* Irrespective of whether this is a load or a store,
16996 we do the same swap. */
16997 std::swap (operands[0], operands[2]);
16998 std::swap (operands[1], operands[3]);
17002 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17003 comparison between the two. */
17005 aarch64_host_wide_int_compare (const void *x, const void *y)
17007 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17008 * ((const HOST_WIDE_INT *) y));
17011 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17012 other pointing to a REG rtx containing an offset, compare the offsets
17013 of the two pairs.
17015 Return:
17017 1 iff offset (X) > offset (Y)
17018 0 iff offset (X) == offset (Y)
17019 -1 iff offset (X) < offset (Y) */
17021 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17023 const rtx * operands_1 = (const rtx *) x;
17024 const rtx * operands_2 = (const rtx *) y;
17025 rtx mem_1, mem_2, base, offset_1, offset_2;
17027 if (MEM_P (operands_1[0]))
17028 mem_1 = operands_1[0];
17029 else
17030 mem_1 = operands_1[1];
17032 if (MEM_P (operands_2[0]))
17033 mem_2 = operands_2[0];
17034 else
17035 mem_2 = operands_2[1];
17037 /* Extract the offsets. */
17038 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17039 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17041 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17043 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17046 /* Given OPERANDS of consecutive load/store, check if we can merge
17047 them into ldp/stp by adjusting the offset. LOAD is true if they
17048 are load instructions. MODE is the mode of memory operands.
17050 Given below consecutive stores:
17052 str w1, [xb, 0x100]
17053 str w1, [xb, 0x104]
17054 str w1, [xb, 0x108]
17055 str w1, [xb, 0x10c]
17057 Though the offsets are out of the range supported by stp, we can
17058 still pair them after adjusting the offset, like:
17060 add scratch, xb, 0x100
17061 stp w1, w1, [scratch]
17062 stp w1, w1, [scratch, 0x8]
17064 The peephole patterns detecting this opportunity should guarantee
17065    the scratch register is available.  */
17067 bool
17068 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17069 scalar_mode mode)
17071 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
17072 HOST_WIDE_INT offvals[4], msize;
17073 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
17074 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
17076 if (load)
17078 reg_1 = operands[0];
17079 mem_1 = operands[1];
17080 reg_2 = operands[2];
17081 mem_2 = operands[3];
17082 reg_3 = operands[4];
17083 mem_3 = operands[5];
17084 reg_4 = operands[6];
17085 mem_4 = operands[7];
17086 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
17087 && REG_P (reg_3) && REG_P (reg_4));
17089 /* Do not attempt to merge the loads if the loads clobber each other. */
17090 for (int i = 0; i < 8; i += 2)
17091 for (int j = i + 2; j < 8; j += 2)
17092 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17093 return false;
17095 else
17097 mem_1 = operands[0];
17098 reg_1 = operands[1];
17099 mem_2 = operands[2];
17100 reg_2 = operands[3];
17101 mem_3 = operands[4];
17102 reg_3 = operands[5];
17103 mem_4 = operands[6];
17104 reg_4 = operands[7];
17106       /* Skip if the memory operand is by itself valid for ldp/stp.  */
17107 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
17108 return false;
17110 /* The mems cannot be volatile. */
17111 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
17112       || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
17113 return false;
17115 /* Check if the addresses are in the form of [base+offset]. */
17116 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17117 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17118 return false;
17119 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17120 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17121 return false;
17122 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17123 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17124 return false;
17125 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17126 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17127 return false;
17129   /* Check if the bases are the same.  */
17130 if (!rtx_equal_p (base_1, base_2)
17131 || !rtx_equal_p (base_2, base_3)
17132 || !rtx_equal_p (base_3, base_4))
17133 return false;
17135 offvals[0] = INTVAL (offset_1);
17136 offvals[1] = INTVAL (offset_2);
17137 offvals[2] = INTVAL (offset_3);
17138 offvals[3] = INTVAL (offset_4);
17139 msize = GET_MODE_SIZE (mode);
17141 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17142 qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
17144 if (!(offvals[1] == offvals[0] + msize
17145 && offvals[3] == offvals[2] + msize))
17146 return false;
17148 /* Check that offsets are within range of each other. The ldp/stp
17149 instructions have 7 bit immediate offsets, so use 0x80. */
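  /* For example, with SImode accesses (msize == 4) the first offsets of the
     two prospective pairs must differ by less than 4 * 0x80 == 512 bytes,
     since each LDP/STP immediate is a signed 7-bit multiple of the access
     size.  */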
17150 if (offvals[2] - offvals[0] >= msize * 0x80)
17151 return false;
17153 /* The offsets must be aligned with respect to each other. */
17154 if (offvals[0] % msize != offvals[2] % msize)
17155 return false;
17157 /* Check if the addresses are clobbered by load. */
17158 if (load && (reg_mentioned_p (reg_1, mem_1)
17159 || reg_mentioned_p (reg_2, mem_2)
17160 || reg_mentioned_p (reg_3, mem_3)
17161 || reg_mentioned_p (reg_4, mem_4)))
17162 return false;
17164 /* If we have SImode and slow unaligned ldp,
17165      check that the alignment is at least 8 bytes.  */
17166 if (mode == SImode
17167 && (aarch64_tune_params.extra_tuning_flags
17168 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17169 && !optimize_size
17170 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17171 return false;
17173 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17174 rclass_1 = FP_REGS;
17175 else
17176 rclass_1 = GENERAL_REGS;
17178 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17179 rclass_2 = FP_REGS;
17180 else
17181 rclass_2 = GENERAL_REGS;
17183 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17184 rclass_3 = FP_REGS;
17185 else
17186 rclass_3 = GENERAL_REGS;
17188 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17189 rclass_4 = FP_REGS;
17190 else
17191 rclass_4 = GENERAL_REGS;
17193   /* Check if the registers are of the same class.  */
17194 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17195 return false;
17197 return true;
17200 /* Given OPERANDS of consecutive load/store, this function pairs them
17201 into LDP/STP after adjusting the offset. It depends on the fact
17202 that the operands can be sorted so the offsets are correct for STP.
17203 MODE is the mode of memory operands. CODE is the rtl operator
17204    which should be applied to all memory operands; it is SIGN_EXTEND,
17205 ZERO_EXTEND or UNKNOWN. */
17207 bool
17208 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17209 scalar_mode mode, RTX_CODE code)
17211 rtx base, offset_1, offset_3, t1, t2;
17212 rtx mem_1, mem_2, mem_3, mem_4;
17213 rtx temp_operands[8];
17214 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17215 stp_off_upper_limit, stp_off_lower_limit, msize;
17217 /* We make changes on a copy as we may still bail out. */
17218 for (int i = 0; i < 8; i ++)
17219 temp_operands[i] = operands[i];
17221 /* Sort the operands. */
17222 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17224 if (load)
17226 mem_1 = temp_operands[1];
17227 mem_2 = temp_operands[3];
17228 mem_3 = temp_operands[5];
17229 mem_4 = temp_operands[7];
17231 else
17233 mem_1 = temp_operands[0];
17234 mem_2 = temp_operands[2];
17235 mem_3 = temp_operands[4];
17236 mem_4 = temp_operands[6];
17237 gcc_assert (code == UNKNOWN);
17240 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17241 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17242 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17243 && offset_3 != NULL_RTX);
17245 /* Adjust offset so it can fit in LDP/STP instruction. */
17246 msize = GET_MODE_SIZE (mode);
17247 stp_off_upper_limit = msize * (0x40 - 1);
17248 stp_off_lower_limit = - msize * 0x40;
17250 off_val_1 = INTVAL (offset_1);
17251 off_val_3 = INTVAL (offset_3);
17253 /* The base offset is optimally half way between the two STP/LDP offsets. */
17254 if (msize <= 4)
17255 base_off = (off_val_1 + off_val_3) / 2;
17256 else
17257 /* However, due to issues with negative LDP/STP offset generation for
17258        larger modes (DF, DI and vector modes), we must not use negative
17259        addresses beyond what 9 signed unadjusted bits can store.  This
17260 provides the most range in this case. */
17261 base_off = off_val_1;
17263 /* Adjust the base so that it is aligned with the addresses but still
17264 optimal. */
17265 if (base_off % msize != off_val_1 % msize)
17266 /* Fix the offset, bearing in mind we want to make it bigger not
17267 smaller. */
17268 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17269 else if (msize <= 4)
17270 /* The negative range of LDP/STP is one larger than the positive range. */
17271 base_off += msize;
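  /* A worked example: for four SImode stores at offsets 0x100, 0x104, 0x108
     and 0x10c, off_val_1 == 0x100 and off_val_3 == 0x108, so base_off starts
     at 0x104 and, being already suitably aligned, is bumped by msize to
     0x108.  The resulting LDP/STP offsets are -8/-4 and 0/4, comfortably
     inside the [-256, 252] range for SImode.  */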
17273 /* Check if base offset is too big or too small. We can attempt to resolve
17274 this issue by setting it to the maximum value and seeing if the offsets
17275 still fit. */
17276 if (base_off >= 0x1000)
17278 base_off = 0x1000 - 1;
17279 /* We must still make sure that the base offset is aligned with respect
17280        to the address.  But it may not be made any bigger.  */
17281 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17284 /* Likewise for the case where the base is too small. */
17285 if (base_off <= -0x1000)
17287 base_off = -0x1000 + 1;
17288 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17291 /* Offset of the first STP/LDP. */
17292 new_off_1 = off_val_1 - base_off;
17294 /* Offset of the second STP/LDP. */
17295 new_off_3 = off_val_3 - base_off;
17297 /* The offsets must be within the range of the LDP/STP instructions. */
17298 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17299 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17300 return false;
17302 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17303 new_off_1), true);
17304 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17305 new_off_1 + msize), true);
17306 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17307 new_off_3), true);
17308 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17309 new_off_3 + msize), true);
17311 if (!aarch64_mem_pair_operand (mem_1, mode)
17312 || !aarch64_mem_pair_operand (mem_3, mode))
17313 return false;
17315 if (code == ZERO_EXTEND)
17317 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17318 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17319 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17320 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17322 else if (code == SIGN_EXTEND)
17324 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17325 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17326 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17327 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17330 if (load)
17332 operands[0] = temp_operands[0];
17333 operands[1] = mem_1;
17334 operands[2] = temp_operands[2];
17335 operands[3] = mem_2;
17336 operands[4] = temp_operands[4];
17337 operands[5] = mem_3;
17338 operands[6] = temp_operands[6];
17339 operands[7] = mem_4;
17341 else
17343 operands[0] = mem_1;
17344 operands[1] = temp_operands[1];
17345 operands[2] = mem_2;
17346 operands[3] = temp_operands[3];
17347 operands[4] = mem_3;
17348 operands[5] = temp_operands[5];
17349 operands[6] = mem_4;
17350 operands[7] = temp_operands[7];
17353 /* Emit adjusting instruction. */
17354 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17355 /* Emit ldp/stp instructions. */
17356 t1 = gen_rtx_SET (operands[0], operands[1]);
17357 t2 = gen_rtx_SET (operands[2], operands[3]);
17358 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17359 t1 = gen_rtx_SET (operands[4], operands[5]);
17360 t2 = gen_rtx_SET (operands[6], operands[7]);
17361 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17362 return true;
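/* Illustrative worked example (editorial addition, not from the original
   sources): suppose the four accesses are DImode loads at base+264, +272,
   +280 and +288, so msize == 8.  The limits derived above are

     stp_off_upper_limit = 8 * (0x40 - 1) =  504
     stp_off_lower_limit = -8 * 0x40      = -512

   and, since msize > 4, base_off is taken as off_val_1 == 264.  The final
   offsets are therefore 0/8 for the first pair and 16/24 for the second,
   both comfortably inside [-512, 504], giving code along the lines of
   (register names here are purely hypothetical):

     add  x16, xbase, #264
     ldp  x0, x1, [x16]
     ldp  x2, x3, [x16, #16]  */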
17365 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17366 it isn't worth branching around empty masked ops (including masked
17367 stores). */
17369 static bool
17370 aarch64_empty_mask_is_expensive (unsigned)
17372 return false;
17375 /* Return 1 if a pseudo register should be created and used to hold
17376 the GOT address for PIC code. */
17378 bool
17379 aarch64_use_pseudo_pic_reg (void)
17381 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17384 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17386 static int
17387 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17389 switch (XINT (x, 1))
17391 case UNSPEC_GOTSMALLPIC:
17392 case UNSPEC_GOTSMALLPIC28K:
17393 case UNSPEC_GOTTINYPIC:
17394 return 0;
17395 default:
17396 break;
17399 return default_unspec_may_trap_p (x, flags);
17403 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
17404 return the log2 of that value. Otherwise return -1. */
17406 int
17407 aarch64_fpconst_pow_of_2 (rtx x)
17409 const REAL_VALUE_TYPE *r;
17411 if (!CONST_DOUBLE_P (x))
17412 return -1;
17414 r = CONST_DOUBLE_REAL_VALUE (x);
17416 if (REAL_VALUE_NEGATIVE (*r)
17417 || REAL_VALUE_ISNAN (*r)
17418 || REAL_VALUE_ISINF (*r)
17419 || !real_isinteger (r, DFmode))
17420 return -1;
17422 return exact_log2 (real_to_integer (r));
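/* Illustrative note (editorial addition, not from the original sources):
   for a CONST_DOUBLE holding 8.0 every check above passes,
   real_to_integer returns 8 and exact_log2 (8) == 3, so the caller gets
   3; for -4.0, 3.0 or NaN the result is -1.  A typical use (assumed
   here) is a predicate or pattern condition along the lines of

     aarch64_fpconst_pow_of_2 (op) > 0

   so that a multiply or divide by 2^N can be folded into the #fbits
   operand of a fixed-point SCVTF/FCVTZS form.  */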
17425 /* If X is a vector of equal CONST_DOUBLE values and that value is
17426 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17428 int
17429 aarch64_vec_fpconst_pow_of_2 (rtx x)
17431 int nelts;
17432 if (GET_CODE (x) != CONST_VECTOR
17433 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17434 return -1;
17436 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17437 return -1;
17439 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17440 if (firstval <= 0)
17441 return -1;
17443 for (int i = 1; i < nelts; i++)
17444 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17445 return -1;
17447 return firstval;
17450 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17451 to float.
17453 __fp16 always promotes through this hook.
17454 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17455 through the generic excess precision logic rather than here. */
17457 static tree
17458 aarch64_promoted_type (const_tree t)
17460 if (SCALAR_FLOAT_TYPE_P (t)
17461 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17462 return float_type_node;
17464 return NULL_TREE;
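/* Illustrative sketch (editorial addition, not from the original sources):
   the user-visible effect of returning float_type_node above.  For

     __fp16 scale (__fp16 a, __fp16 b)
     {
       return a + b;
     }

   both operands are promoted to float, the addition is performed in
   SFmode and the result is narrowed back to __fp16 on return.  _Float16
   is deliberately not promoted here; its evaluation format is controlled
   by the excess-precision handling further down in this file.  */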
17467 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17469 static bool
17470 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17471 optimization_type opt_type)
17473 switch (op)
17475 case rsqrt_optab:
17476 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17478 default:
17479 return true;
17483 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17485 static unsigned int
17486 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17487 int *offset)
17489 /* Polynomial invariant 1 == (VG / 2) - 1. */
17490 gcc_assert (i == 1);
17491 *factor = 2;
17492 *offset = 1;
17493 return AARCH64_DWARF_VG;
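/* Illustrative note (editorial addition, not from the original sources):
   the return value and out-parameters above describe indeterminate 1 as
   AARCH64_DWARF_VG / factor - offset, i.e. VG / 2 - 1.  For 256-bit SVE
   vectors VG (the number of 64-bit granules per vector) is 4, so the
   indeterminate is 4 / 2 - 1 = 1 and a poly_int size such as 16 + 16x
   for VNx16QI resolves to 32 bytes.  */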
17496 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17497 if MODE is HFmode, and punt to the generic implementation otherwise. */
17499 static bool
17500 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17502 return (mode == HFmode
17503 ? true
17504 : default_libgcc_floating_mode_supported_p (mode));
17507 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17508 if MODE is HFmode, and punt to the generic implementation otherwise. */
17510 static bool
17511 aarch64_scalar_mode_supported_p (scalar_mode mode)
17513 return (mode == HFmode
17514 ? true
17515 : default_scalar_mode_supported_p (mode));
17518 /* Set the value of FLT_EVAL_METHOD.
17519 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17521 0: evaluate all operations and constants, whose semantic type has at
17522 most the range and precision of type float, to the range and
17523 precision of float; evaluate all other operations and constants to
17524 the range and precision of the semantic type;
17526 N, where _FloatN is a supported interchange floating type:
17527 evaluate all operations and constants, whose semantic type has at
17528 most the range and precision of _FloatN type, to the range and
17529 precision of the _FloatN type; evaluate all other operations and
17530 constants to the range and precision of the semantic type;
17532 If we have the ARMv8.2-A extensions then we support _Float16 in native
17533 precision, so we should set this to 16. Otherwise, we support the type,
17534 but want to evaluate expressions in float precision, so set this to
17535 0. */
17537 static enum flt_eval_method
17538 aarch64_excess_precision (enum excess_precision_type type)
17540 switch (type)
17542 case EXCESS_PRECISION_TYPE_FAST:
17543 case EXCESS_PRECISION_TYPE_STANDARD:
17544 /* We can calculate either in 16-bit range and precision or
17545 32-bit range and precision. Make that decision based on whether
17546 we have native support for the ARMv8.2-A 16-bit floating-point
17547 instructions or not. */
17548 return (TARGET_FP_F16INST
17549 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17550 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17551 case EXCESS_PRECISION_TYPE_IMPLICIT:
17552 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17553 default:
17554 gcc_unreachable ();
17556 return FLT_EVAL_METHOD_UNPREDICTABLE;
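/* Illustrative sketch (editorial addition, not from the original sources):
   the consequence of the choice above for a source fragment such as

     _Float16 x, y, z;
     z = x * y + z;

   With TARGET_FP_F16INST (ARMv8.2-A FP16) the whole expression is
   evaluated in _Float16, matching FLT_EVAL_METHOD == 16, and can use the
   H-register FMUL/FADD forms; without it each operation is widened to
   float, evaluated in SFmode and narrowed back, matching
   FLT_EVAL_METHOD == 0.  */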
17559 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17560 scheduled for speculative execution. Reject the long-running division
17561 and square-root instructions. */
17563 static bool
17564 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17566 switch (get_attr_type (insn))
17568 case TYPE_SDIV:
17569 case TYPE_UDIV:
17570 case TYPE_FDIVS:
17571 case TYPE_FDIVD:
17572 case TYPE_FSQRTS:
17573 case TYPE_FSQRTD:
17574 case TYPE_NEON_FP_SQRT_S:
17575 case TYPE_NEON_FP_SQRT_D:
17576 case TYPE_NEON_FP_SQRT_S_Q:
17577 case TYPE_NEON_FP_SQRT_D_Q:
17578 case TYPE_NEON_FP_DIV_S:
17579 case TYPE_NEON_FP_DIV_D:
17580 case TYPE_NEON_FP_DIV_S_Q:
17581 case TYPE_NEON_FP_DIV_D_Q:
17582 return false;
17583 default:
17584 return true;
17588 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17590 static int
17591 aarch64_compute_pressure_classes (reg_class *classes)
17593 int i = 0;
17594 classes[i++] = GENERAL_REGS;
17595 classes[i++] = FP_REGS;
17596 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17597 registers need to go in PR_LO_REGS at some point during their
17598 lifetime. Splitting it into two halves has the effect of making
17599 all predicates count against PR_LO_REGS, so that we try whenever
17600 possible to restrict the number of live predicates to 8. This
17601 greatly reduces the amount of spilling in certain loops. */
17602 classes[i++] = PR_LO_REGS;
17603 classes[i++] = PR_HI_REGS;
17604 return i;
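/* Illustrative note (editorial addition, not from the original sources):
   most predicated SVE instructions encode their governing predicate in a
   3-bit field and can therefore only name p0-p7 (PR_LO_REGS); p8-p15
   (PR_HI_REGS) are mainly usable by predicate-manipulation instructions.
   Reporting the two halves as separate pressure classes makes the
   register-pressure heuristics treat eight live predicates as the
   practical budget, which is what the comment above relies on.  */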
17607 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17609 static bool
17610 aarch64_can_change_mode_class (machine_mode from,
17611 machine_mode to, reg_class_t)
17613 if (BYTES_BIG_ENDIAN)
17615 bool from_sve_p = aarch64_sve_data_mode_p (from);
17616 bool to_sve_p = aarch64_sve_data_mode_p (to);
17618 /* Don't allow changes between SVE data modes and non-SVE modes.
17619 See the comment at the head of aarch64-sve.md for details. */
17620 if (from_sve_p != to_sve_p)
17621 return false;
17623 /* Don't allow changes in element size: lane 0 of the new vector
17624 would not then be lane 0 of the old vector. See the comment
17625 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17626 description.
17628 In the worst case, this forces a register to be spilled in
17629 one mode and reloaded in the other, which handles the
17630 endianness correctly. */
17631 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17632 return false;
17634 return true;
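/* Illustrative note (editorial addition, not from the original sources):
   on big-endian, a change from (say) VNx8HI to VNx16QI is rejected by the
   element-size test above because GET_MODE_UNIT_SIZE differs (2 vs. 1),
   so what was lane 0 of the HI vector would no longer be lane 0 of the
   QI view.  The fallback is the spill-and-reload path mentioned in the
   comment, which goes through memory and so applies the correct byte
   ordering.  */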
17637 /* Implement TARGET_EARLY_REMAT_MODES. */
17639 static void
17640 aarch64_select_early_remat_modes (sbitmap modes)
17642 /* SVE values are not normally live across a call, so it should be
17643 worth doing early rematerialization even in VL-specific mode. */
17644 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17646 machine_mode mode = (machine_mode) i;
17647 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17648 if (vec_flags & VEC_ANY_SVE)
17649 bitmap_set_bit (modes, i);
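/* Illustrative note (editorial addition, not from the original sources):
   under the base AAPCS64 only the low 64 bits of V8-V15 are preserved
   across calls, so a full SVE vector that is live across a call would
   otherwise have to be spilled to a variable-length stack slot.  Flagging
   every VEC_ANY_SVE mode here lets the early-remat pass recompute such
   values after the call instead, which is usually cheaper than the
   spill.  */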
17653 /* Target-specific selftests. */
17655 #if CHECKING_P
17657 namespace selftest {
17659 /* Selftest for the RTL loader.
17660 Verify that the RTL loader copes with a dump from
17661 print_rtx_function. This is essentially just a test that class
17662 function_reader can handle a real dump, but it also verifies
17663 that lookup_reg_by_dump_name correctly handles hard regs.
17664 The presence of hard reg names in the dump means that the test is
17665 target-specific, hence it is in this file. */
17667 static void
17668 aarch64_test_loading_full_dump ()
17670 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17672 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17674 rtx_insn *insn_1 = get_insn_by_uid (1);
17675 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17677 rtx_insn *insn_15 = get_insn_by_uid (15);
17678 ASSERT_EQ (INSN, GET_CODE (insn_15));
17679 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17681 /* Verify crtl->return_rtx. */
17682 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17683 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17684 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17687 /* Run all target-specific selftests. */
17689 static void
17690 aarch64_run_selftests (void)
17692 aarch64_test_loading_full_dump ();
17695 } // namespace selftest
17697 #endif /* #if CHECKING_P */
17699 #undef TARGET_ADDRESS_COST
17700 #define TARGET_ADDRESS_COST aarch64_address_cost
17702 /* This hook determines whether unnamed bitfields affect the alignment
17703 of the containing structure. The hook returns true if the structure
17704 should inherit the alignment requirements of an unnamed bitfield's
17705 type. */
17706 #undef TARGET_ALIGN_ANON_BITFIELD
17707 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17709 #undef TARGET_ASM_ALIGNED_DI_OP
17710 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17712 #undef TARGET_ASM_ALIGNED_HI_OP
17713 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17715 #undef TARGET_ASM_ALIGNED_SI_OP
17716 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17718 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17719 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17720 hook_bool_const_tree_hwi_hwi_const_tree_true
17722 #undef TARGET_ASM_FILE_START
17723 #define TARGET_ASM_FILE_START aarch64_start_file
17725 #undef TARGET_ASM_OUTPUT_MI_THUNK
17726 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17728 #undef TARGET_ASM_SELECT_RTX_SECTION
17729 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17731 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17732 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17734 #undef TARGET_BUILD_BUILTIN_VA_LIST
17735 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17737 #undef TARGET_CALLEE_COPIES
17738 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17740 #undef TARGET_CAN_ELIMINATE
17741 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17743 #undef TARGET_CAN_INLINE_P
17744 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17746 #undef TARGET_CANNOT_FORCE_CONST_MEM
17747 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17749 #undef TARGET_CASE_VALUES_THRESHOLD
17750 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17752 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17753 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17755 /* Only the least significant bit is used for initialization guard
17756 variables. */
17757 #undef TARGET_CXX_GUARD_MASK_BIT
17758 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17760 #undef TARGET_C_MODE_FOR_SUFFIX
17761 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17763 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17764 #undef TARGET_DEFAULT_TARGET_FLAGS
17765 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17766 #endif
17768 #undef TARGET_CLASS_MAX_NREGS
17769 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17771 #undef TARGET_BUILTIN_DECL
17772 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17774 #undef TARGET_BUILTIN_RECIPROCAL
17775 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17777 #undef TARGET_C_EXCESS_PRECISION
17778 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17780 #undef TARGET_EXPAND_BUILTIN
17781 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17783 #undef TARGET_EXPAND_BUILTIN_VA_START
17784 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17786 #undef TARGET_FOLD_BUILTIN
17787 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17789 #undef TARGET_FUNCTION_ARG
17790 #define TARGET_FUNCTION_ARG aarch64_function_arg
17792 #undef TARGET_FUNCTION_ARG_ADVANCE
17793 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17795 #undef TARGET_FUNCTION_ARG_BOUNDARY
17796 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17798 #undef TARGET_FUNCTION_ARG_PADDING
17799 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17801 #undef TARGET_GET_RAW_RESULT_MODE
17802 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17803 #undef TARGET_GET_RAW_ARG_MODE
17804 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17806 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17807 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17809 #undef TARGET_FUNCTION_VALUE
17810 #define TARGET_FUNCTION_VALUE aarch64_function_value
17812 #undef TARGET_FUNCTION_VALUE_REGNO_P
17813 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17815 #undef TARGET_GIMPLE_FOLD_BUILTIN
17816 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17818 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17819 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17821 #undef TARGET_INIT_BUILTINS
17822 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17824 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17825 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17826 aarch64_ira_change_pseudo_allocno_class
17828 #undef TARGET_LEGITIMATE_ADDRESS_P
17829 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17831 #undef TARGET_LEGITIMATE_CONSTANT_P
17832 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17834 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17835 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17836 aarch64_legitimize_address_displacement
17838 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17839 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17841 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17842 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17843 aarch64_libgcc_floating_mode_supported_p
17845 #undef TARGET_MANGLE_TYPE
17846 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17848 #undef TARGET_MEMORY_MOVE_COST
17849 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17851 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17852 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17854 #undef TARGET_MUST_PASS_IN_STACK
17855 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17857 /* This target hook should return true if accesses to volatile bitfields
17858 should use the narrowest mode possible. It should return false if these
17859 accesses should use the bitfield container type. */
17860 #undef TARGET_NARROW_VOLATILE_BITFIELD
17861 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17863 #undef TARGET_OPTION_OVERRIDE
17864 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17866 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17867 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17868 aarch64_override_options_after_change
17870 #undef TARGET_OPTION_SAVE
17871 #define TARGET_OPTION_SAVE aarch64_option_save
17873 #undef TARGET_OPTION_RESTORE
17874 #define TARGET_OPTION_RESTORE aarch64_option_restore
17876 #undef TARGET_OPTION_PRINT
17877 #define TARGET_OPTION_PRINT aarch64_option_print
17879 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17880 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17882 #undef TARGET_SET_CURRENT_FUNCTION
17883 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17885 #undef TARGET_PASS_BY_REFERENCE
17886 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17888 #undef TARGET_PREFERRED_RELOAD_CLASS
17889 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17891 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17892 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17894 #undef TARGET_PROMOTED_TYPE
17895 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17897 #undef TARGET_SECONDARY_RELOAD
17898 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17900 #undef TARGET_SHIFT_TRUNCATION_MASK
17901 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17903 #undef TARGET_SETUP_INCOMING_VARARGS
17904 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17906 #undef TARGET_STRUCT_VALUE_RTX
17907 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17909 #undef TARGET_REGISTER_MOVE_COST
17910 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17912 #undef TARGET_RETURN_IN_MEMORY
17913 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17915 #undef TARGET_RETURN_IN_MSB
17916 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17918 #undef TARGET_RTX_COSTS
17919 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17921 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17922 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17924 #undef TARGET_SCHED_ISSUE_RATE
17925 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17927 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17928 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17929 aarch64_sched_first_cycle_multipass_dfa_lookahead
17931 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17932 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17933 aarch64_first_cycle_multipass_dfa_lookahead_guard
17935 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17936 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17937 aarch64_get_separate_components
17939 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17940 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17941 aarch64_components_for_bb
17943 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17944 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17945 aarch64_disqualify_components
17947 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17948 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17949 aarch64_emit_prologue_components
17951 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17952 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17953 aarch64_emit_epilogue_components
17955 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17956 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17957 aarch64_set_handled_components
17959 #undef TARGET_TRAMPOLINE_INIT
17960 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17962 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17963 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17965 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17966 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17968 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17969 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17970 aarch64_builtin_support_vector_misalignment
17972 #undef TARGET_ARRAY_MODE
17973 #define TARGET_ARRAY_MODE aarch64_array_mode
17975 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17976 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17978 #undef TARGET_VECTORIZE_ADD_STMT_COST
17979 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17981 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17982 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17983 aarch64_builtin_vectorization_cost
17985 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17986 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17988 #undef TARGET_VECTORIZE_BUILTINS
17989 #define TARGET_VECTORIZE_BUILTINS
17991 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17992 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17993 aarch64_builtin_vectorized_function
17995 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17996 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17997 aarch64_autovectorize_vector_sizes
17999 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18000 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18001 aarch64_atomic_assign_expand_fenv
18003 /* Section anchor support. */
18005 #undef TARGET_MIN_ANCHOR_OFFSET
18006 #define TARGET_MIN_ANCHOR_OFFSET -256
18008 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18009 byte offset; we can do much more for larger data types, but have no way
18010 to determine the size of the access. We assume accesses are aligned. */
18011 #undef TARGET_MAX_ANCHOR_OFFSET
18012 #define TARGET_MAX_ANCHOR_OFFSET 4095
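/* Illustrative note (editorial addition, not from the original sources):
   4095 is the largest unsigned 12-bit byte offset, i.e. the immediate
   range of an LDRB/STRB unsigned-offset access.  Wider accesses scale
   the 12-bit immediate by the access size (e.g. up to 32760 for an
   8-byte LDR), but since the anchor code cannot tell how big each access
   will be, the byte-granular limit is the safe choice.  */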
18014 #undef TARGET_VECTOR_ALIGNMENT
18015 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18017 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18018 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18019 aarch64_vectorize_preferred_vector_alignment
18020 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18021 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18022 aarch64_simd_vector_alignment_reachable
18024 /* vec_perm support. */
18026 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18027 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18028 aarch64_vectorize_vec_perm_const
18030 #undef TARGET_VECTORIZE_GET_MASK_MODE
18031 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18032 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18033 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18034 aarch64_empty_mask_is_expensive
18036 #undef TARGET_INIT_LIBFUNCS
18037 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18039 #undef TARGET_FIXED_CONDITION_CODE_REGS
18040 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18042 #undef TARGET_FLAGS_REGNUM
18043 #define TARGET_FLAGS_REGNUM CC_REGNUM
18045 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18046 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18048 #undef TARGET_ASAN_SHADOW_OFFSET
18049 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18051 #undef TARGET_LEGITIMIZE_ADDRESS
18052 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18054 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18055 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18057 #undef TARGET_CAN_USE_DOLOOP_P
18058 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18060 #undef TARGET_SCHED_ADJUST_PRIORITY
18061 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18063 #undef TARGET_SCHED_MACRO_FUSION_P
18064 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18066 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18067 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18069 #undef TARGET_SCHED_FUSION_PRIORITY
18070 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18072 #undef TARGET_UNSPEC_MAY_TRAP_P
18073 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18075 #undef TARGET_USE_PSEUDO_PIC_REG
18076 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18078 #undef TARGET_PRINT_OPERAND
18079 #define TARGET_PRINT_OPERAND aarch64_print_operand
18081 #undef TARGET_PRINT_OPERAND_ADDRESS
18082 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18084 #undef TARGET_OPTAB_SUPPORTED_P
18085 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18087 #undef TARGET_OMIT_STRUCT_RETURN_REG
18088 #define TARGET_OMIT_STRUCT_RETURN_REG true
18090 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18091 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18092 aarch64_dwarf_poly_indeterminate_value
18094 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
18095 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18096 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18098 #undef TARGET_HARD_REGNO_NREGS
18099 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18100 #undef TARGET_HARD_REGNO_MODE_OK
18101 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18103 #undef TARGET_MODES_TIEABLE_P
18104 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18106 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18107 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18108 aarch64_hard_regno_call_part_clobbered
18110 #undef TARGET_CONSTANT_ALIGNMENT
18111 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18113 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18114 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18116 #undef TARGET_CAN_CHANGE_MODE_CLASS
18117 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18119 #undef TARGET_SELECT_EARLY_REMAT_MODES
18120 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18122 #if CHECKING_P
18123 #undef TARGET_RUN_TARGET_SELFTESTS
18124 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18125 #endif /* #if CHECKING_P */
18127 struct gcc_target targetm = TARGET_INITIALIZER;
18129 #include "gt-aarch64.h"