gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2018 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #define IN_TARGET_CODE 1
  22
  23 #include "config.h"
  24 #define INCLUDE_STRING
  25 #include "system.h"
  26 #include "coretypes.h"
  27 #include "backend.h"
  28 #include "target.h"
  29 #include "rtl.h"
  30 #include "tree.h"
  31 #include "memmodel.h"
  32 #include "gimple.h"
  33 #include "cfghooks.h"
  34 #include "cfgloop.h"
  35 #include "df.h"
  36 #include "tm_p.h"
  37 #include "stringpool.h"
  38 #include "attribs.h"
  39 #include "optabs.h"
  40 #include "regs.h"
  41 #include "emit-rtl.h"
  42 #include "recog.h"
  43 #include "diagnostic.h"
  44 #include "insn-attr.h"
  45 #include "alias.h"
  46 #include "fold-const.h"
  47 #include "stor-layout.h"
  48 #include "calls.h"
  49 #include "varasm.h"
  50 #include "output.h"
  51 #include "flags.h"
  52 #include "explow.h"
  53 #include "expr.h"
  54 #include "reload.h"
  55 #include "langhooks.h"
  56 #include "opts.h"
  57 #include "params.h"
  58 #include "gimplify.h"
  59 #include "dwarf2.h"
  60 #include "gimple-iterator.h"
  61 #include "tree-vectorizer.h"
  62 #include "aarch64-cost-tables.h"
  63 #include "dumpfile.h"
  64 #include "builtins.h"
  65 #include "rtl-iter.h"
  66 #include "tm-constrs.h"
  67 #include "sched-int.h"
  68 #include "target-globals.h"
  69 #include "common/common-target.h"
  70 #include "cfgrtl.h"
  71 #include "selftest.h"
  72 #include "selftest-rtl.h"
  73 #include "rtx-vector-builder.h"
  74
  75 /* This file should be included last.  */
  76 #include "target-def.h"
  77
  78 /* Defined for convenience.  */
  79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  80
  81 /* Classifies an address.
  82
  83    ADDRESS_REG_IMM
  84        A simple base register plus immediate offset.
  85
  86    ADDRESS_REG_WB
  87        A base register indexed by immediate offset with writeback.
  88
  89    ADDRESS_REG_REG
  90        A base register indexed by (optionally scaled) register.
  91
  92    ADDRESS_REG_UXTW
  93        A base register indexed by (optionally scaled) zero-extended register.
  94
  95    ADDRESS_REG_SXTW
  96        A base register indexed by (optionally scaled) sign-extended register.
  97
  98    ADDRESS_LO_SUM
  99        A LO_SUM rtx with a base register and "LO12" symbol relocation.
 100
 101    ADDRESS_SYMBOLIC:
 102        A constant symbolic address, in pc-relative literal pool.  */
 103
 104 enum aarch64_address_type {
 105   ADDRESS_REG_IMM,
 106   ADDRESS_REG_WB,
 107   ADDRESS_REG_REG,
 108   ADDRESS_REG_UXTW,
 109   ADDRESS_REG_SXTW,
 110   ADDRESS_LO_SUM,
 111   ADDRESS_SYMBOLIC
 112 };
 113
 114 struct aarch64_address_info {
 115   enum aarch64_address_type type;
 116   rtx base;
 117   rtx offset;
 118   poly_int64 const_offset;
 119   int shift;
 120   enum aarch64_symbol_type symbol_type;
 121 };
 122
 123 /* Information about a legitimate vector immediate operand.  */
 124 struct simd_immediate_info
 125 {
 126   enum insn_type { MOV, MVN };
 127   enum modifier_type { LSL, MSL };
 128
 129   simd_immediate_info () {}
 130   simd_immediate_info (scalar_float_mode, rtx);
 131   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
 132                        insn_type = MOV, modifier_type = LSL,
 133                        unsigned int = 0);
 134   simd_immediate_info (scalar_mode, rtx, rtx);
 135
 136   /* The mode of the elements.  */
 137   scalar_mode elt_mode;
 138
 139   /* The value of each element if all elements are the same, or the
 140      first value if the constant is a series.  */
 141   rtx value;
 142
 143   /* The value of the step if the constant is a series, null otherwise.  */
 144   rtx step;
 145
 146   /* The instruction to use to move the immediate into a vector.  */
 147   insn_type insn;
 148
 149   /* The kind of shift modifier to use, and the number of bits to shift.
 150      This is (LSL, 0) if no shift is needed.  */
 151   modifier_type modifier;
 152   unsigned int shift;
 153 };
 154
 155 /* Construct a floating-point immediate in which each element has mode
 156    ELT_MODE_IN and value VALUE_IN.  */
 157 inline simd_immediate_info
 158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
 159   : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
 160     modifier (LSL), shift (0)
 161 {}
 162
 163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 164    and value VALUE_IN.  The other parameters are as for the structure
 165    fields.  */
 166 inline simd_immediate_info
 167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
 168                        unsigned HOST_WIDE_INT value_in,
 169                        insn_type insn_in, modifier_type modifier_in,
 170                        unsigned int shift_in)
 171   : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
 172     step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
 173 {}
 174
 175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 176    and where element I is equal to VALUE_IN + I * STEP_IN.  */
 177 inline simd_immediate_info
 178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
 179   : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
 180     modifier (LSL), shift (0)
 181 {}
 182
 183 /* The current code model.  */
 184 enum aarch64_code_model aarch64_cmodel;
 185
 186 /* The number of 64-bit elements in an SVE vector.  */
 187 poly_uint16 aarch64_sve_vg;
 188
 189 #ifdef HAVE_AS_TLS
 190 #undef TARGET_HAVE_TLS
 191 #define TARGET_HAVE_TLS 1
 192 #endif
 193
 194 static bool aarch64_composite_type_p (const_tree, machine_mode);
 195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 196                                                      const_tree,
 197                                                      machine_mode *, int *,
 198                                                      bool *);
 199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 201 static void aarch64_override_options_after_change (void);
 202 static bool aarch64_vector_mode_supported_p (machine_mode);
 203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 205                                                          const_tree type,
 206                                                          int misalignment,
 207                                                          bool is_packed);
 208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
 210
 211 /* Major revision number of the ARM Architecture implemented by the target.  */
 212 unsigned aarch64_architecture_version;
 213
 214 /* The processor for which instructions should be scheduled.  */
 215 enum aarch64_processor aarch64_tune = cortexa53;
 216
 217 /* Mask to specify which instruction scheduling options should be used.  */
 218 unsigned long aarch64_tune_flags = 0;
 219
 220 /* Global flag for PC relative loads.  */
 221 bool aarch64_pcrelative_literal_loads;
 222
 223 /* Global flag for whether frame pointer is enabled.  */
 224 bool aarch64_use_frame_pointer;
 225
 226 /* Support for command line parsing of boolean flags in the tuning
 227    structures.  */
 228 struct aarch64_flag_desc
 229 {
 230   const char* name;
 231   unsigned int flag;
 232 };
 233
 234 #define AARCH64_FUSION_PAIR(name, internal_name) \
 235   { name, AARCH64_FUSE_##internal_name },
 236 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 237 {
 238   { "none", AARCH64_FUSE_NOTHING },
 239 #include "aarch64-fusion-pairs.def"
 240   { "all", AARCH64_FUSE_ALL },
 241   { NULL, AARCH64_FUSE_NOTHING }
 242 };
 243
 244 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 245   { name, AARCH64_EXTRA_TUNE_##internal_name },
 246 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 247 {
 248   { "none", AARCH64_EXTRA_TUNE_NONE },
 249 #include "aarch64-tuning-flags.def"
 250   { "all", AARCH64_EXTRA_TUNE_ALL },
 251   { NULL, AARCH64_EXTRA_TUNE_NONE }
 252 };
 253
 254 /* Tuning parameters.  */
 255
 256 static const struct cpu_addrcost_table generic_addrcost_table =
 257 {
 258     {
 259       1, /* hi  */
 260       0, /* si  */
 261       0, /* di  */
 262       1, /* ti  */
 263     },
 264   0, /* pre_modify  */
 265   0, /* post_modify  */
 266   0, /* register_offset  */
 267   0, /* register_sextend  */
 268   0, /* register_zextend  */
 269   0 /* imm_offset  */
 270 };
 271
 272 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 273 {
 274     {
 275       0, /* hi  */
 276       0, /* si  */
 277       0, /* di  */
 278       2, /* ti  */
 279     },
 280   0, /* pre_modify  */
 281   0, /* post_modify  */
 282   1, /* register_offset  */
 283   1, /* register_sextend  */
 284   2, /* register_zextend  */
 285   0, /* imm_offset  */
 286 };
 287
 288 static const struct cpu_addrcost_table xgene1_addrcost_table =
 289 {
 290     {
 291       1, /* hi  */
 292       0, /* si  */
 293       0, /* di  */
 294       1, /* ti  */
 295     },
 296   1, /* pre_modify  */
 297   0, /* post_modify  */
 298   0, /* register_offset  */
 299   1, /* register_sextend  */
 300   1, /* register_zextend  */
 301   0, /* imm_offset  */
 302 };
 303
 304 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 305 {
 306     {
 307       1, /* hi  */
 308       1, /* si  */
 309       1, /* di  */
 310       2, /* ti  */
 311     },
 312   0, /* pre_modify  */
 313   0, /* post_modify  */
 314   2, /* register_offset  */
 315   3, /* register_sextend  */
 316   3, /* register_zextend  */
 317   0, /* imm_offset  */
 318 };
 319
 320 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
 321 {
 322     {
 323       1, /* hi  */
 324       1, /* si  */
 325       1, /* di  */
 326       2, /* ti  */
 327     },
 328   1, /* pre_modify  */
 329   1, /* post_modify  */
 330   3, /* register_offset  */
 331   4, /* register_sextend  */
 332   3, /* register_zextend  */
 333   2, /* imm_offset  */
 334 };
 335
 336 static const struct cpu_regmove_cost generic_regmove_cost =
 337 {
 338   1, /* GP2GP  */
 339   /* Avoid the use of slow int<->fp moves for spilling by setting
 340      their cost higher than memmov_cost.  */
 341   5, /* GP2FP  */
 342   5, /* FP2GP  */
 343   2 /* FP2FP  */
 344 };
 345
 346 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 347 {
 348   1, /* GP2GP  */
 349   /* Avoid the use of slow int<->fp moves for spilling by setting
 350      their cost higher than memmov_cost.  */
 351   5, /* GP2FP  */
 352   5, /* FP2GP  */
 353   2 /* FP2FP  */
 354 };
 355
 356 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 357 {
 358   1, /* GP2GP  */
 359   /* Avoid the use of slow int<->fp moves for spilling by setting
 360      their cost higher than memmov_cost.  */
 361   5, /* GP2FP  */
 362   5, /* FP2GP  */
 363   2 /* FP2FP  */
 364 };
 365
 366 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 367 {
 368   1, /* GP2GP  */
 369   /* Avoid the use of slow int<->fp moves for spilling by setting
 370      their cost higher than memmov_cost (actual, 4 and 9).  */
 371   9, /* GP2FP  */
 372   9, /* FP2GP  */
 373   1 /* FP2FP  */
 374 };
 375
 376 static const struct cpu_regmove_cost thunderx_regmove_cost =
 377 {
 378   2, /* GP2GP  */
 379   2, /* GP2FP  */
 380   6, /* FP2GP  */
 381   4 /* FP2FP  */
 382 };
 383
 384 static const struct cpu_regmove_cost xgene1_regmove_cost =
 385 {
 386   1, /* GP2GP  */
 387   /* Avoid the use of slow int<->fp moves for spilling by setting
 388      their cost higher than memmov_cost.  */
 389   8, /* GP2FP  */
 390   8, /* FP2GP  */
 391   2 /* FP2FP  */
 392 };
 393
 394 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 395 {
 396   2, /* GP2GP  */
 397   /* Avoid the use of int<->fp moves for spilling.  */
 398   6, /* GP2FP  */
 399   6, /* FP2GP  */
 400   4 /* FP2FP  */
 401 };
 402
 403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 404 {
 405   1, /* GP2GP  */
 406   /* Avoid the use of int<->fp moves for spilling.  */
 407   8, /* GP2FP  */
 408   8, /* FP2GP  */
 409   4  /* FP2FP  */
 410 };
 411
 412 /* Generic costs for vector insn classes.  */
 413 static const struct cpu_vector_cost generic_vector_cost =
 414 {
 415   1, /* scalar_int_stmt_cost  */
 416   1, /* scalar_fp_stmt_cost  */
 417   1, /* scalar_load_cost  */
 418   1, /* scalar_store_cost  */
 419   1, /* vec_int_stmt_cost  */
 420   1, /* vec_fp_stmt_cost  */
 421   2, /* vec_permute_cost  */
 422   1, /* vec_to_scalar_cost  */
 423   1, /* scalar_to_vec_cost  */
 424   1, /* vec_align_load_cost  */
 425   1, /* vec_unalign_load_cost  */
 426   1, /* vec_unalign_store_cost  */
 427   1, /* vec_store_cost  */
 428   3, /* cond_taken_branch_cost  */
 429   1 /* cond_not_taken_branch_cost  */
 430 };
 431
 432 /* ThunderX costs for vector insn classes.  */
 433 static const struct cpu_vector_cost thunderx_vector_cost =
 434 {
 435   1, /* scalar_int_stmt_cost  */
 436   1, /* scalar_fp_stmt_cost  */
 437   3, /* scalar_load_cost  */
 438   1, /* scalar_store_cost  */
 439   4, /* vec_int_stmt_cost  */
 440   1, /* vec_fp_stmt_cost  */
 441   4, /* vec_permute_cost  */
 442   2, /* vec_to_scalar_cost  */
 443   2, /* scalar_to_vec_cost  */
 444   3, /* vec_align_load_cost  */
 445   5, /* vec_unalign_load_cost  */
 446   5, /* vec_unalign_store_cost  */
 447   1, /* vec_store_cost  */
 448   3, /* cond_taken_branch_cost  */
 449   3 /* cond_not_taken_branch_cost  */
 450 };
 451
 452 /* Generic costs for vector insn classes.  */
 453 static const struct cpu_vector_cost cortexa57_vector_cost =
 454 {
 455   1, /* scalar_int_stmt_cost  */
 456   1, /* scalar_fp_stmt_cost  */
 457   4, /* scalar_load_cost  */
 458   1, /* scalar_store_cost  */
 459   2, /* vec_int_stmt_cost  */
 460   2, /* vec_fp_stmt_cost  */
 461   3, /* vec_permute_cost  */
 462   8, /* vec_to_scalar_cost  */
 463   8, /* scalar_to_vec_cost  */
 464   4, /* vec_align_load_cost  */
 465   4, /* vec_unalign_load_cost  */
 466   1, /* vec_unalign_store_cost  */
 467   1, /* vec_store_cost  */
 468   1, /* cond_taken_branch_cost  */
 469   1 /* cond_not_taken_branch_cost  */
 470 };
 471
 472 static const struct cpu_vector_cost exynosm1_vector_cost =
 473 {
 474   1, /* scalar_int_stmt_cost  */
 475   1, /* scalar_fp_stmt_cost  */
 476   5, /* scalar_load_cost  */
 477   1, /* scalar_store_cost  */
 478   3, /* vec_int_stmt_cost  */
 479   3, /* vec_fp_stmt_cost  */
 480   3, /* vec_permute_cost  */
 481   3, /* vec_to_scalar_cost  */
 482   3, /* scalar_to_vec_cost  */
 483   5, /* vec_align_load_cost  */
 484   5, /* vec_unalign_load_cost  */
 485   1, /* vec_unalign_store_cost  */
 486   1, /* vec_store_cost  */
 487   1, /* cond_taken_branch_cost  */
 488   1 /* cond_not_taken_branch_cost  */
 489 };
 490
 491 /* Generic costs for vector insn classes.  */
 492 static const struct cpu_vector_cost xgene1_vector_cost =
 493 {
 494   1, /* scalar_int_stmt_cost  */
 495   1, /* scalar_fp_stmt_cost  */
 496   5, /* scalar_load_cost  */
 497   1, /* scalar_store_cost  */
 498   2, /* vec_int_stmt_cost  */
 499   2, /* vec_fp_stmt_cost  */
 500   2, /* vec_permute_cost  */
 501   4, /* vec_to_scalar_cost  */
 502   4, /* scalar_to_vec_cost  */
 503   10, /* vec_align_load_cost  */
 504   10, /* vec_unalign_load_cost  */
 505   2, /* vec_unalign_store_cost  */
 506   2, /* vec_store_cost  */
 507   2, /* cond_taken_branch_cost  */
 508   1 /* cond_not_taken_branch_cost  */
 509 };
 510
 511 /* Costs for vector insn classes for Vulcan.  */
 512 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 513 {
 514   1, /* scalar_int_stmt_cost  */
 515   6, /* scalar_fp_stmt_cost  */
 516   4, /* scalar_load_cost  */
 517   1, /* scalar_store_cost  */
 518   5, /* vec_int_stmt_cost  */
 519   6, /* vec_fp_stmt_cost  */
 520   3, /* vec_permute_cost  */
 521   6, /* vec_to_scalar_cost  */
 522   5, /* scalar_to_vec_cost  */
 523   8, /* vec_align_load_cost  */
 524   8, /* vec_unalign_load_cost  */
 525   4, /* vec_unalign_store_cost  */
 526   4, /* vec_store_cost  */
 527   2, /* cond_taken_branch_cost  */
 528   1  /* cond_not_taken_branch_cost  */
 529 };
 530
 531 /* Generic costs for branch instructions.  */
 532 static const struct cpu_branch_cost generic_branch_cost =
 533 {
 534   1,  /* Predictable.  */
 535   3   /* Unpredictable.  */
 536 };
 537
 538 /* Generic approximation modes.  */
 539 static const cpu_approx_modes generic_approx_modes =
 540 {
 541   AARCH64_APPROX_NONE,  /* division  */
 542   AARCH64_APPROX_NONE,  /* sqrt  */
 543   AARCH64_APPROX_NONE   /* recip_sqrt  */
 544 };
 545
 546 /* Approximation modes for Exynos M1.  */
 547 static const cpu_approx_modes exynosm1_approx_modes =
 548 {
 549   AARCH64_APPROX_NONE,  /* division  */
 550   AARCH64_APPROX_ALL,   /* sqrt  */
 551   AARCH64_APPROX_ALL    /* recip_sqrt  */
 552 };
 553
 554 /* Approximation modes for X-Gene 1.  */
 555 static const cpu_approx_modes xgene1_approx_modes =
 556 {
 557   AARCH64_APPROX_NONE,  /* division  */
 558   AARCH64_APPROX_NONE,  /* sqrt  */
 559   AARCH64_APPROX_ALL    /* recip_sqrt  */
 560 };
 561
 562 /* Generic prefetch settings (which disable prefetch).  */
 563 static const cpu_prefetch_tune generic_prefetch_tune =
 564 {
 565   0,                    /* num_slots  */
 566   -1,                   /* l1_cache_size  */
 567   -1,                   /* l1_cache_line_size  */
 568   -1,                   /* l2_cache_size  */
 569   true,                 /* prefetch_dynamic_strides */
 570   -1,                   /* minimum_stride */
 571   -1                    /* default_opt_level  */
 572 };
 573
 574 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 575 {
 576   0,                    /* num_slots  */
 577   -1,                   /* l1_cache_size  */
 578   64,                   /* l1_cache_line_size  */
 579   -1,                   /* l2_cache_size  */
 580   true,                 /* prefetch_dynamic_strides */
 581   -1,                   /* minimum_stride */
 582   -1                    /* default_opt_level  */
 583 };
 584
 585 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 586 {
 587   4,                    /* num_slots  */
 588   32,                   /* l1_cache_size  */
 589   64,                   /* l1_cache_line_size  */
 590   512,                  /* l2_cache_size  */
 591   false,                /* prefetch_dynamic_strides */
 592   2048,                 /* minimum_stride */
 593   3                     /* default_opt_level  */
 594 };
 595
 596 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 597 {
 598   8,                    /* num_slots  */
 599   32,                   /* l1_cache_size  */
 600   128,                  /* l1_cache_line_size  */
 601   16*1024,              /* l2_cache_size  */
 602   true,                 /* prefetch_dynamic_strides */
 603   -1,                   /* minimum_stride */
 604   3                     /* default_opt_level  */
 605 };
 606
 607 static const cpu_prefetch_tune thunderx_prefetch_tune =
 608 {
 609   8,                    /* num_slots  */
 610   32,                   /* l1_cache_size  */
 611   128,                  /* l1_cache_line_size  */
 612   -1,                   /* l2_cache_size  */
 613   true,                 /* prefetch_dynamic_strides */
 614   -1,                   /* minimum_stride */
 615   -1                    /* default_opt_level  */
 616 };
 617
 618 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 619 {
 620   8,                    /* num_slots  */
 621   32,                   /* l1_cache_size  */
 622   64,                   /* l1_cache_line_size  */
 623   256,                  /* l2_cache_size  */
 624   true,                 /* prefetch_dynamic_strides */
 625   -1,                   /* minimum_stride */
 626   -1                    /* default_opt_level  */
 627 };
 628
 629 static const struct tune_params generic_tunings =
 630 {
 631   &cortexa57_extra_costs,
 632   &generic_addrcost_table,
 633   &generic_regmove_cost,
 634   &generic_vector_cost,
 635   &generic_branch_cost,
 636   &generic_approx_modes,
 637   4, /* memmov_cost  */
 638   2, /* issue_rate  */
 639   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 640   "8",  /* function_align.  */
 641   "4",  /* jump_align.  */
 642   "8",  /* loop_align.  */
 643   2,    /* int_reassoc_width.  */
 644   4,    /* fp_reassoc_width.  */
 645   1,    /* vec_reassoc_width.  */
 646   2,    /* min_div_recip_mul_sf.  */
 647   2,    /* min_div_recip_mul_df.  */
 648   0,    /* max_case_values.  */
 649   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 650   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 651   &generic_prefetch_tune
 652 };
 653
 654 static const struct tune_params cortexa35_tunings =
 655 {
 656   &cortexa53_extra_costs,
 657   &generic_addrcost_table,
 658   &cortexa53_regmove_cost,
 659   &generic_vector_cost,
 660   &generic_branch_cost,
 661   &generic_approx_modes,
 662   4, /* memmov_cost  */
 663   1, /* issue_rate  */
 664   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 665    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 666   "16", /* function_align.  */
 667   "4",  /* jump_align.  */
 668   "8",  /* loop_align.  */
 669   2,    /* int_reassoc_width.  */
 670   4,    /* fp_reassoc_width.  */
 671   1,    /* vec_reassoc_width.  */
 672   2,    /* min_div_recip_mul_sf.  */
 673   2,    /* min_div_recip_mul_df.  */
 674   0,    /* max_case_values.  */
 675   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 676   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 677   &generic_prefetch_tune
 678 };
 679
 680 static const struct tune_params cortexa53_tunings =
 681 {
 682   &cortexa53_extra_costs,
 683   &generic_addrcost_table,
 684   &cortexa53_regmove_cost,
 685   &generic_vector_cost,
 686   &generic_branch_cost,
 687   &generic_approx_modes,
 688   4, /* memmov_cost  */
 689   2, /* issue_rate  */
 690   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 691    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 692   "16", /* function_align.  */
 693   "4",  /* jump_align.  */
 694   "8",  /* loop_align.  */
 695   2,    /* int_reassoc_width.  */
 696   4,    /* fp_reassoc_width.  */
 697   1,    /* vec_reassoc_width.  */
 698   2,    /* min_div_recip_mul_sf.  */
 699   2,    /* min_div_recip_mul_df.  */
 700   0,    /* max_case_values.  */
 701   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 702   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 703   &generic_prefetch_tune
 704 };
 705
 706 static const struct tune_params cortexa57_tunings =
 707 {
 708   &cortexa57_extra_costs,
 709   &generic_addrcost_table,
 710   &cortexa57_regmove_cost,
 711   &cortexa57_vector_cost,
 712   &generic_branch_cost,
 713   &generic_approx_modes,
 714   4, /* memmov_cost  */
 715   3, /* issue_rate  */
 716   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 717    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 718   "16", /* function_align.  */
 719   "4",  /* jump_align.  */
 720   "8",  /* loop_align.  */
 721   2,    /* int_reassoc_width.  */
 722   4,    /* fp_reassoc_width.  */
 723   1,    /* vec_reassoc_width.  */
 724   2,    /* min_div_recip_mul_sf.  */
 725   2,    /* min_div_recip_mul_df.  */
 726   0,    /* max_case_values.  */
 727   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 728   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 729   &generic_prefetch_tune
 730 };
 731
 732 static const struct tune_params cortexa72_tunings =
 733 {
 734   &cortexa57_extra_costs,
 735   &generic_addrcost_table,
 736   &cortexa57_regmove_cost,
 737   &cortexa57_vector_cost,
 738   &generic_branch_cost,
 739   &generic_approx_modes,
 740   4, /* memmov_cost  */
 741   3, /* issue_rate  */
 742   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 743    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 744   "16", /* function_align.  */
 745   "4",  /* jump_align.  */
 746   "8",  /* loop_align.  */
 747   2,    /* int_reassoc_width.  */
 748   4,    /* fp_reassoc_width.  */
 749   1,    /* vec_reassoc_width.  */
 750   2,    /* min_div_recip_mul_sf.  */
 751   2,    /* min_div_recip_mul_df.  */
 752   0,    /* max_case_values.  */
 753   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 754   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 755   &generic_prefetch_tune
 756 };
 757
 758 static const struct tune_params cortexa73_tunings =
 759 {
 760   &cortexa57_extra_costs,
 761   &generic_addrcost_table,
 762   &cortexa57_regmove_cost,
 763   &cortexa57_vector_cost,
 764   &generic_branch_cost,
 765   &generic_approx_modes,
 766   4, /* memmov_cost.  */
 767   2, /* issue_rate.  */
 768   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 769    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 770   "16", /* function_align.  */
 771   "4",  /* jump_align.  */
 772   "8",  /* loop_align.  */
 773   2,    /* int_reassoc_width.  */
 774   4,    /* fp_reassoc_width.  */
 775   1,    /* vec_reassoc_width.  */
 776   2,    /* min_div_recip_mul_sf.  */
 777   2,    /* min_div_recip_mul_df.  */
 778   0,    /* max_case_values.  */
 779   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 780   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 781   &generic_prefetch_tune
 782 };
 783
 784
 785
 786 static const struct tune_params exynosm1_tunings =
 787 {
 788   &exynosm1_extra_costs,
 789   &exynosm1_addrcost_table,
 790   &exynosm1_regmove_cost,
 791   &exynosm1_vector_cost,
 792   &generic_branch_cost,
 793   &exynosm1_approx_modes,
 794   4,    /* memmov_cost  */
 795   3,    /* issue_rate  */
 796   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 797   "4",  /* function_align.  */
 798   "4",  /* jump_align.  */
 799   "4",  /* loop_align.  */
 800   2,    /* int_reassoc_width.  */
 801   4,    /* fp_reassoc_width.  */
 802   1,    /* vec_reassoc_width.  */
 803   2,    /* min_div_recip_mul_sf.  */
 804   2,    /* min_div_recip_mul_df.  */
 805   48,   /* max_case_values.  */
 806   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 807   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 808   &exynosm1_prefetch_tune
 809 };
 810
 811 static const struct tune_params thunderxt88_tunings =
 812 {
 813   &thunderx_extra_costs,
 814   &generic_addrcost_table,
 815   &thunderx_regmove_cost,
 816   &thunderx_vector_cost,
 817   &generic_branch_cost,
 818   &generic_approx_modes,
 819   6, /* memmov_cost  */
 820   2, /* issue_rate  */
 821   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 822   "8",  /* function_align.  */
 823   "8",  /* jump_align.  */
 824   "8",  /* loop_align.  */
 825   2,    /* int_reassoc_width.  */
 826   4,    /* fp_reassoc_width.  */
 827   1,    /* vec_reassoc_width.  */
 828   2,    /* min_div_recip_mul_sf.  */
 829   2,    /* min_div_recip_mul_df.  */
 830   0,    /* max_case_values.  */
 831   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 832   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 833   &thunderxt88_prefetch_tune
 834 };
 835
 836 static const struct tune_params thunderx_tunings =
 837 {
 838   &thunderx_extra_costs,
 839   &generic_addrcost_table,
 840   &thunderx_regmove_cost,
 841   &thunderx_vector_cost,
 842   &generic_branch_cost,
 843   &generic_approx_modes,
 844   6, /* memmov_cost  */
 845   2, /* issue_rate  */
 846   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 847   "8",  /* function_align.  */
 848   "8",  /* jump_align.  */
 849   "8",  /* loop_align.  */
 850   2,    /* int_reassoc_width.  */
 851   4,    /* fp_reassoc_width.  */
 852   1,    /* vec_reassoc_width.  */
 853   2,    /* min_div_recip_mul_sf.  */
 854   2,    /* min_div_recip_mul_df.  */
 855   0,    /* max_case_values.  */
 856   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 857   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 858    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 859   &thunderx_prefetch_tune
 860 };
 861
 862 static const struct tune_params xgene1_tunings =
 863 {
 864   &xgene1_extra_costs,
 865   &xgene1_addrcost_table,
 866   &xgene1_regmove_cost,
 867   &xgene1_vector_cost,
 868   &generic_branch_cost,
 869   &xgene1_approx_modes,
 870   6, /* memmov_cost  */
 871   4, /* issue_rate  */
 872   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 873   "16", /* function_align.  */
 874   "8",  /* jump_align.  */
 875   "16", /* loop_align.  */
 876   2,    /* int_reassoc_width.  */
 877   4,    /* fp_reassoc_width.  */
 878   1,    /* vec_reassoc_width.  */
 879   2,    /* min_div_recip_mul_sf.  */
 880   2,    /* min_div_recip_mul_df.  */
 881   0,    /* max_case_values.  */
 882   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 883   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),        /* tune_flags.  */
 884   &generic_prefetch_tune
 885 };
 886
 887 static const struct tune_params qdf24xx_tunings =
 888 {
 889   &qdf24xx_extra_costs,
 890   &qdf24xx_addrcost_table,
 891   &qdf24xx_regmove_cost,
 892   &generic_vector_cost,
 893   &generic_branch_cost,
 894   &generic_approx_modes,
 895   4, /* memmov_cost  */
 896   4, /* issue_rate  */
 897   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 898    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 899   "16", /* function_align.  */
 900   "8",  /* jump_align.  */
 901   "16", /* loop_align.  */
 902   2,    /* int_reassoc_width.  */
 903   4,    /* fp_reassoc_width.  */
 904   1,    /* vec_reassoc_width.  */
 905   2,    /* min_div_recip_mul_sf.  */
 906   2,    /* min_div_recip_mul_df.  */
 907   0,    /* max_case_values.  */
 908   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 909   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 910   &qdf24xx_prefetch_tune
 911 };
 912
 913 /* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
 914    for now.  */
 915 static const struct tune_params saphira_tunings =
 916 {
 917   &generic_extra_costs,
 918   &generic_addrcost_table,
 919   &generic_regmove_cost,
 920   &generic_vector_cost,
 921   &generic_branch_cost,
 922   &generic_approx_modes,
 923   4, /* memmov_cost  */
 924   4, /* issue_rate  */
 925   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 926    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 927   "16", /* function_align.  */
 928   "8",  /* jump_align.  */
 929   "16", /* loop_align.  */
 930   2,    /* int_reassoc_width.  */
 931   4,    /* fp_reassoc_width.  */
 932   1,    /* vec_reassoc_width.  */
 933   2,    /* min_div_recip_mul_sf.  */
 934   2,    /* min_div_recip_mul_df.  */
 935   0,    /* max_case_values.  */
 936   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 937   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 938   &generic_prefetch_tune
 939 };
 940
 941 static const struct tune_params thunderx2t99_tunings =
 942 {
 943   &thunderx2t99_extra_costs,
 944   &thunderx2t99_addrcost_table,
 945   &thunderx2t99_regmove_cost,
 946   &thunderx2t99_vector_cost,
 947   &generic_branch_cost,
 948   &generic_approx_modes,
 949   4, /* memmov_cost.  */
 950   4, /* issue_rate.  */
 951   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
 952    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 953   "16", /* function_align.  */
 954   "8",  /* jump_align.  */
 955   "16", /* loop_align.  */
 956   3,    /* int_reassoc_width.  */
 957   2,    /* fp_reassoc_width.  */
 958   2,    /* vec_reassoc_width.  */
 959   2,    /* min_div_recip_mul_sf.  */
 960   2,    /* min_div_recip_mul_df.  */
 961   0,    /* max_case_values.  */
 962   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 963   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 964   &thunderx2t99_prefetch_tune
 965 };
 966
 967 /* Support for fine-grained override of the tuning structures.  */
 968 struct aarch64_tuning_override_function
 969 {
 970   const char* name;
 971   void (*parse_override)(const char*, struct tune_params*);
 972 };
 973
 974 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 975 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 976
 977 static const struct aarch64_tuning_override_function
 978 aarch64_tuning_override_functions[] =
 979 {
 980   { "fuse", aarch64_parse_fuse_string },
 981   { "tune", aarch64_parse_tune_string },
 982   { NULL, NULL }
 983 };
 984
 985 /* A processor implementing AArch64.  */
 986 struct processor
 987 {
 988   const char *const name;
 989   enum aarch64_processor ident;
 990   enum aarch64_processor sched_core;
 991   enum aarch64_arch arch;
 992   unsigned architecture_version;
 993   const unsigned long flags;
 994   const struct tune_params *const tune;
 995 };
 996
 997 /* Architectures implementing AArch64.  */
 998 static const struct processor all_architectures[] =
 999 {
1000 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1001   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1002 #include "aarch64-arches.def"
1003   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1004 };
1005
1006 /* Processor cores implementing AArch64.  */
1007 static const struct processor all_cores[] =
1008 {
1009 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1010   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
1011   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
1012   FLAGS, &COSTS##_tunings},
1013 #include "aarch64-cores.def"
1014   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1015     AARCH64_FL_FOR_ARCH8, &generic_tunings},
1016   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1017 };
1018
1019
1020 /* Target specification.  These are populated by the -march, -mtune, -mcpu
1021    handling code or by target attributes.  */
1022 static const struct processor *selected_arch;
1023 static const struct processor *selected_cpu;
1024 static const struct processor *selected_tune;
1025
1026 /* The current tuning set.  */
1027 struct tune_params aarch64_tune_params = generic_tunings;
1028
1029 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1030
1031 /* An ISA extension in the co-processor and main instruction set space.  */
1032 struct aarch64_option_extension
1033 {
1034   const char *const name;
1035   const unsigned long flags_on;
1036   const unsigned long flags_off;
1037 };
1038
1039 typedef enum aarch64_cond_code
1040 {
1041   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1042   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1043   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1044 }
1045 aarch64_cc;
1046
1047 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1048
1049 /* The condition codes of the processor, and the inverse function.  */
1050 static const char * const aarch64_condition_codes[] =
1051 {
1052   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1053   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1054 };
1055
1056 /* Generate code to enable conditional branches in functions over 1 MiB.  */
1057 const char *
1058 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1059                         const char * branch_format)
1060 {
1061     rtx_code_label * tmp_label = gen_label_rtx ();
1062     char label_buf[256];
1063     char buffer[128];
1064     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1065                                  CODE_LABEL_NUMBER (tmp_label));
1066     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1067     rtx dest_label = operands[pos_label];
1068     operands[pos_label] = tmp_label;
1069
1070     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1071     output_asm_insn (buffer, operands);
1072
1073     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1074     operands[pos_label] = dest_label;
1075     output_asm_insn (buffer, operands);
1076     return "";
1077 }
1078
1079 void
1080 aarch64_err_no_fpadvsimd (machine_mode mode)
1081 {
1082   if (TARGET_GENERAL_REGS_ONLY)
1083     if (FLOAT_MODE_P (mode))
1084       error ("%qs is incompatible with the use of floating-point types",
1085              "-mgeneral-regs-only");
1086     else
1087       error ("%qs is incompatible with the use of vector types",
1088              "-mgeneral-regs-only");
1089   else
1090     if (FLOAT_MODE_P (mode))
1091       error ("%qs feature modifier is incompatible with the use of"
1092              " floating-point types", "+nofp");
1093     else
1094       error ("%qs feature modifier is incompatible with the use of"
1095              " vector types", "+nofp");
1096 }
1097
1098 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1099    The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1100    GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1101    higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1102    and GENERAL_REGS is lower than the memory cost (in this case the best class
1103    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
1104    cost results in bad allocations with many redundant int<->FP moves which
1105    are expensive on various cores.
1106    To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1107    force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
1108    if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
1109    POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
1110    The result of this is that it is no longer inefficient to have a higher
1111    memory move cost than the register move cost.
1112 */
1113
1114 static reg_class_t
1115 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1116                                          reg_class_t best_class)
1117 {
1118   machine_mode mode;
1119
1120   if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1121       || !reg_class_subset_p (FP_REGS, allocno_class))
1122     return allocno_class;
1123
1124   if (!reg_class_subset_p (GENERAL_REGS, best_class)
1125       || !reg_class_subset_p (FP_REGS, best_class))
1126     return best_class;
1127
1128   mode = PSEUDO_REGNO_MODE (regno);
1129   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1130 }
1131
1132 static unsigned int
1133 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1134 {
1135   if (GET_MODE_UNIT_SIZE (mode) == 4)
1136     return aarch64_tune_params.min_div_recip_mul_sf;
1137   return aarch64_tune_params.min_div_recip_mul_df;
1138 }
1139
1140 /* Return the reassociation width of treeop OPC with mode MODE.  */
1141 static int
1142 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1143 {
1144   if (VECTOR_MODE_P (mode))
1145     return aarch64_tune_params.vec_reassoc_width;
1146   if (INTEGRAL_MODE_P (mode))
1147     return aarch64_tune_params.int_reassoc_width;
1148   /* Avoid reassociating floating point addition so we emit more FMAs.  */
1149   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1150     return aarch64_tune_params.fp_reassoc_width;
1151   return 1;
1152 }
1153
1154 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1155 unsigned
1156 aarch64_dbx_register_number (unsigned regno)
1157 {
1158    if (GP_REGNUM_P (regno))
1159      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1160    else if (regno == SP_REGNUM)
1161      return AARCH64_DWARF_SP;
1162    else if (FP_REGNUM_P (regno))
1163      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1164    else if (PR_REGNUM_P (regno))
1165      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1166    else if (regno == VG_REGNUM)
1167      return AARCH64_DWARF_VG;
1168
1169    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1170       equivalent DWARF register.  */
1171    return DWARF_FRAME_REGISTERS;
1172 }
1173
1174 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
1175 static bool
1176 aarch64_advsimd_struct_mode_p (machine_mode mode)
1177 {
1178   return (TARGET_SIMD
1179           && (mode == OImode || mode == CImode || mode == XImode));
1180 }
1181
1182 /* Return true if MODE is an SVE predicate mode.  */
1183 static bool
1184 aarch64_sve_pred_mode_p (machine_mode mode)
1185 {
1186   return (TARGET_SVE
1187           && (mode == VNx16BImode
1188               || mode == VNx8BImode
1189               || mode == VNx4BImode
1190               || mode == VNx2BImode));
1191 }
1192
1193 /* Three mutually-exclusive flags describing a vector or predicate type.  */
1194 const unsigned int VEC_ADVSIMD  = 1;
1195 const unsigned int VEC_SVE_DATA = 2;
1196 const unsigned int VEC_SVE_PRED = 4;
1197 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1198    a structure of 2, 3 or 4 vectors.  */
1199 const unsigned int VEC_STRUCT   = 8;
1200 /* Useful combinations of the above.  */
1201 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
1202 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1203
1204 /* Return a set of flags describing the vector properties of mode MODE.
1205    Ignore modes that are not supported by the current target.  */
1206 static unsigned int
1207 aarch64_classify_vector_mode (machine_mode mode)
1208 {
1209   if (aarch64_advsimd_struct_mode_p (mode))
1210     return VEC_ADVSIMD | VEC_STRUCT;
1211
1212   if (aarch64_sve_pred_mode_p (mode))
1213     return VEC_SVE_PRED;
1214
1215   scalar_mode inner = GET_MODE_INNER (mode);
1216   if (VECTOR_MODE_P (mode)
1217       && (inner == QImode
1218           || inner == HImode
1219           || inner == HFmode
1220           || inner == SImode
1221           || inner == SFmode
1222           || inner == DImode
1223           || inner == DFmode))
1224     {
1225       if (TARGET_SVE)
1226         {
1227           if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1228             return VEC_SVE_DATA;
1229           if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1230               || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1231               || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1232             return VEC_SVE_DATA | VEC_STRUCT;
1233         }
1234
1235       /* This includes V1DF but not V1DI (which doesn't exist).  */
1236       if (TARGET_SIMD
1237           && (known_eq (GET_MODE_BITSIZE (mode), 64)
1238               || known_eq (GET_MODE_BITSIZE (mode), 128)))
1239         return VEC_ADVSIMD;
1240     }
1241
1242   return 0;
1243 }
1244
1245 /* Return true if MODE is any of the data vector modes, including
1246    structure modes.  */
1247 static bool
1248 aarch64_vector_data_mode_p (machine_mode mode)
1249 {
1250   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1251 }
1252
1253 /* Return true if MODE is an SVE data vector mode; either a single vector
1254    or a structure of vectors.  */
1255 static bool
1256 aarch64_sve_data_mode_p (machine_mode mode)
1257 {
1258   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1259 }
1260
1261 /* Implement target hook TARGET_ARRAY_MODE.  */
1262 static opt_machine_mode
1263 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1264 {
1265   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1266       && IN_RANGE (nelems, 2, 4))
1267     return mode_for_vector (GET_MODE_INNER (mode),
1268                             GET_MODE_NUNITS (mode) * nelems);
1269
1270   return opt_machine_mode ();
1271 }
1272
1273 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1274 static bool
1275 aarch64_array_mode_supported_p (machine_mode mode,
1276                                 unsigned HOST_WIDE_INT nelems)
1277 {
1278   if (TARGET_SIMD
1279       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1280           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1281       && (nelems >= 2 && nelems <= 4))
1282     return true;
1283
1284   return false;
1285 }
1286
1287 /* Return the SVE predicate mode to use for elements that have
1288    ELEM_NBYTES bytes, if such a mode exists.  */
1289
1290 opt_machine_mode
1291 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1292 {
1293   if (TARGET_SVE)
1294     {
1295       if (elem_nbytes == 1)
1296         return VNx16BImode;
1297       if (elem_nbytes == 2)
1298         return VNx8BImode;
1299       if (elem_nbytes == 4)
1300         return VNx4BImode;
1301       if (elem_nbytes == 8)
1302         return VNx2BImode;
1303     }
1304   return opt_machine_mode ();
1305 }
1306
1307 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
1308
1309 static opt_machine_mode
1310 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1311 {
1312   if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1313     {
1314       unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1315       machine_mode pred_mode;
1316       if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1317         return pred_mode;
1318     }
1319
1320   return default_get_mask_mode (nunits, nbytes);
1321 }
1322
1323 /* Implement TARGET_PREFERRED_ELSE_VALUE.  Prefer to use the first
1324    arithmetic operand as the else value if the else value doesn't matter,
1325    since that exactly matches the SVE destructive merging form.  */
1326
1327 static tree
1328 aarch64_preferred_else_value (unsigned, tree, unsigned int, tree *ops)
1329 {
1330   return ops[0];
1331 }
1332
1333 /* Implement TARGET_HARD_REGNO_NREGS.  */
1334
1335 static unsigned int
1336 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1337 {
1338   /* ??? Logically we should only need to provide a value when
1339      HARD_REGNO_MODE_OK says that the combination is valid,
1340      but at the moment we need to handle all modes.  Just ignore
1341      any runtime parts for registers that can't store them.  */
1342   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1343   switch (aarch64_regno_regclass (regno))
1344     {
1345     case FP_REGS:
1346     case FP_LO_REGS:
1347       if (aarch64_sve_data_mode_p (mode))
1348         return exact_div (GET_MODE_SIZE (mode),
1349                           BYTES_PER_SVE_VECTOR).to_constant ();
1350       return CEIL (lowest_size, UNITS_PER_VREG);
1351     case PR_REGS:
1352     case PR_LO_REGS:
1353     case PR_HI_REGS:
1354       return 1;
1355     default:
1356       return CEIL (lowest_size, UNITS_PER_WORD);
1357     }
1358   gcc_unreachable ();
1359 }
1360
1361 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1362
1363 static bool
1364 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1365 {
1366   if (GET_MODE_CLASS (mode) == MODE_CC)
1367     return regno == CC_REGNUM;
1368
1369   if (regno == VG_REGNUM)
1370     /* This must have the same size as _Unwind_Word.  */
1371     return mode == DImode;
1372
1373   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1374   if (vec_flags & VEC_SVE_PRED)
1375     return PR_REGNUM_P (regno);
1376
1377   if (PR_REGNUM_P (regno))
1378     return 0;
1379
1380   if (regno == SP_REGNUM)
1381     /* The purpose of comparing with ptr_mode is to support the
1382        global register variable associated with the stack pointer
1383        register via the syntax of asm ("wsp") in ILP32.  */
1384     return mode == Pmode || mode == ptr_mode;
1385
1386   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1387     return mode == Pmode;
1388
1389   if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1390     return true;
1391
1392   if (FP_REGNUM_P (regno))
1393     {
1394       if (vec_flags & VEC_STRUCT)
1395         return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1396       else
1397         return !VECTOR_MODE_P (mode) || vec_flags != 0;
1398     }
1399
1400   return false;
1401 }
1402
1403 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1404    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1405    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1406
1407 static bool
1408 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1409 {
1410   return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1411 }
1412
1413 /* Implement REGMODE_NATURAL_SIZE.  */
1414 poly_uint64
1415 aarch64_regmode_natural_size (machine_mode mode)
1416 {
1417   /* The natural size for SVE data modes is one SVE data vector,
1418      and similarly for predicates.  We can't independently modify
1419      anything smaller than that.  */
1420   /* ??? For now, only do this for variable-width SVE registers.
1421      Doing it for constant-sized registers breaks lower-subreg.c.  */
1422   /* ??? And once that's fixed, we should probably have similar
1423      code for Advanced SIMD.  */
1424   if (!aarch64_sve_vg.is_constant ())
1425     {
1426       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1427       if (vec_flags & VEC_SVE_PRED)
1428         return BYTES_PER_SVE_PRED;
1429       if (vec_flags & VEC_SVE_DATA)
1430         return BYTES_PER_SVE_VECTOR;
1431     }
1432   return UNITS_PER_WORD;
1433 }
1434
1435 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1436 machine_mode
1437 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1438                                      machine_mode mode)
1439 {
1440   /* The predicate mode determines which bits are significant and
1441      which are "don't care".  Decreasing the number of lanes would
1442      lose data while increasing the number of lanes would make bits
1443      unnecessarily significant.  */
1444   if (PR_REGNUM_P (regno))
1445     return mode;
1446   if (known_ge (GET_MODE_SIZE (mode), 4))
1447     return mode;
1448   else
1449     return SImode;
1450 }
1451
1452 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1453    that strcpy from constants will be faster.  */
1454
1455 static HOST_WIDE_INT
1456 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1457 {
1458   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1459     return MAX (align, BITS_PER_WORD);
1460   return align;
1461 }
1462
1463 /* Return true if calls to DECL should be treated as
1464    long-calls (ie called via a register).  */
1465 static bool
1466 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1467 {
1468   return false;
1469 }
1470
1471 /* Return true if calls to symbol-ref SYM should be treated as
1472    long-calls (ie called via a register).  */
1473 bool
1474 aarch64_is_long_call_p (rtx sym)
1475 {
1476   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1477 }
1478
1479 /* Return true if calls to symbol-ref SYM should not go through
1480    plt stubs.  */
1481
1482 bool
1483 aarch64_is_noplt_call_p (rtx sym)
1484 {
1485   const_tree decl = SYMBOL_REF_DECL (sym);
1486
1487   if (flag_pic
1488       && decl
1489       && (!flag_plt
1490           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1491       && !targetm.binds_local_p (decl))
1492     return true;
1493
1494   return false;
1495 }
1496
1497 /* Return true if the offsets to a zero/sign-extract operation
1498    represent an expression that matches an extend operation.  The
1499    operands represent the paramters from
1500
1501    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1502 bool
1503 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1504                                 rtx extract_imm)
1505 {
1506   HOST_WIDE_INT mult_val, extract_val;
1507
1508   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1509     return false;
1510
1511   mult_val = INTVAL (mult_imm);
1512   extract_val = INTVAL (extract_imm);
1513
1514   if (extract_val > 8
1515       && extract_val < GET_MODE_BITSIZE (mode)
1516       && exact_log2 (extract_val & ~7) > 0
1517       && (extract_val & 7) <= 4
1518       && mult_val == (1 << (extract_val & 7)))
1519     return true;
1520
1521   return false;
1522 }
1523
1524 /* Emit an insn that's a simple single-set.  Both the operands must be
1525    known to be valid.  */
1526 inline static rtx_insn *
1527 emit_set_insn (rtx x, rtx y)
1528 {
1529   return emit_insn (gen_rtx_SET (x, y));
1530 }
1531
1532 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1533    return the rtx for register 0 in the proper mode.  */
1534 rtx
1535 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1536 {
1537   machine_mode mode = SELECT_CC_MODE (code, x, y);
1538   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1539
1540   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1541   return cc_reg;
1542 }
1543
1544 /* Build the SYMBOL_REF for __tls_get_addr.  */
1545
1546 static GTY(()) rtx tls_get_addr_libfunc;
1547
1548 rtx
1549 aarch64_tls_get_addr (void)
1550 {
1551   if (!tls_get_addr_libfunc)
1552     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1553   return tls_get_addr_libfunc;
1554 }
1555
1556 /* Return the TLS model to use for ADDR.  */
1557
1558 static enum tls_model
1559 tls_symbolic_operand_type (rtx addr)
1560 {
1561   enum tls_model tls_kind = TLS_MODEL_NONE;
1562   if (GET_CODE (addr) == CONST)
1563     {
1564       poly_int64 addend;
1565       rtx sym = strip_offset (addr, &addend);
1566       if (GET_CODE (sym) == SYMBOL_REF)
1567         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1568     }
1569   else if (GET_CODE (addr) == SYMBOL_REF)
1570     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1571
1572   return tls_kind;
1573 }
1574
1575 /* We'll allow lo_sum's in addresses in our legitimate addresses
1576    so that combine would take care of combining addresses where
1577    necessary, but for generation purposes, we'll generate the address
1578    as :
1579    RTL                               Absolute
1580    tmp = hi (symbol_ref);            adrp  x1, foo
1581    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1582                                      nop
1583
1584    PIC                               TLS
1585    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1586    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1587                                      bl   __tls_get_addr
1588                                      nop
1589
1590    Load TLS symbol, depending on TLS mechanism and TLS access model.
1591
1592    Global Dynamic - Traditional TLS:
1593    adrp tmp, :tlsgd:imm
1594    add  dest, tmp, #:tlsgd_lo12:imm
1595    bl   __tls_get_addr
1596
1597    Global Dynamic - TLS Descriptors:
1598    adrp dest, :tlsdesc:imm
1599    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1600    add  dest, dest, #:tlsdesc_lo12:imm
1601    blr  tmp
1602    mrs  tp, tpidr_el0
1603    add  dest, dest, tp
1604
1605    Initial Exec:
1606    mrs  tp, tpidr_el0
1607    adrp tmp, :gottprel:imm
1608    ldr  dest, [tmp, #:gottprel_lo12:imm]
1609    add  dest, dest, tp
1610
1611    Local Exec:
1612    mrs  tp, tpidr_el0
1613    add  t0, tp, #:tprel_hi12:imm, lsl #12
1614    add  t0, t0, #:tprel_lo12_nc:imm
1615 */
1616
1617 static void
1618 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1619                                    enum aarch64_symbol_type type)
1620 {
1621   switch (type)
1622     {
1623     case SYMBOL_SMALL_ABSOLUTE:
1624       {
1625         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1626         rtx tmp_reg = dest;
1627         machine_mode mode = GET_MODE (dest);
1628
1629         gcc_assert (mode == Pmode || mode == ptr_mode);
1630
1631         if (can_create_pseudo_p ())
1632           tmp_reg = gen_reg_rtx (mode);
1633
1634         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1635         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1636         return;
1637       }
1638
1639     case SYMBOL_TINY_ABSOLUTE:
1640       emit_insn (gen_rtx_SET (dest, imm));
1641       return;
1642
1643     case SYMBOL_SMALL_GOT_28K:
1644       {
1645         machine_mode mode = GET_MODE (dest);
1646         rtx gp_rtx = pic_offset_table_rtx;
1647         rtx insn;
1648         rtx mem;
1649
1650         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1651            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1652            decide rtx costs, in which case pic_offset_table_rtx is not
1653            initialized.  For that case no need to generate the first adrp
1654            instruction as the final cost for global variable access is
1655            one instruction.  */
1656         if (gp_rtx != NULL)
1657           {
1658             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1659                using the page base as GOT base, the first page may be wasted,
1660                in the worst scenario, there is only 28K space for GOT).
1661
1662                The generate instruction sequence for accessing global variable
1663                is:
1664
1665                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1666
1667                Only one instruction needed. But we must initialize
1668                pic_offset_table_rtx properly.  We generate initialize insn for
1669                every global access, and allow CSE to remove all redundant.
1670
1671                The final instruction sequences will look like the following
1672                for multiply global variables access.
1673
1674                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1675
1676                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1677                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1678                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1679                  ...  */
1680
1681             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1682             crtl->uses_pic_offset_table = 1;
1683             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1684
1685             if (mode != GET_MODE (gp_rtx))
1686              gp_rtx = gen_lowpart (mode, gp_rtx);
1687
1688           }
1689
1690         if (mode == ptr_mode)
1691           {
1692             if (mode == DImode)
1693               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1694             else
1695               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1696
1697             mem = XVECEXP (SET_SRC (insn), 0, 0);
1698           }
1699         else
1700           {
1701             gcc_assert (mode == Pmode);
1702
1703             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1704             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1705           }
1706
1707         /* The operand is expected to be MEM.  Whenever the related insn
1708            pattern changed, above code which calculate mem should be
1709            updated.  */
1710         gcc_assert (GET_CODE (mem) == MEM);
1711         MEM_READONLY_P (mem) = 1;
1712         MEM_NOTRAP_P (mem) = 1;
1713         emit_insn (insn);
1714         return;
1715       }
1716
1717     case SYMBOL_SMALL_GOT_4G:
1718       {
1719         /* In ILP32, the mode of dest can be either SImode or DImode,
1720            while the got entry is always of SImode size.  The mode of
1721            dest depends on how dest is used: if dest is assigned to a
1722            pointer (e.g. in the memory), it has SImode; it may have
1723            DImode if dest is dereferenced to access the memeory.
1724            This is why we have to handle three different ldr_got_small
1725            patterns here (two patterns for ILP32).  */
1726
1727         rtx insn;
1728         rtx mem;
1729         rtx tmp_reg = dest;
1730         machine_mode mode = GET_MODE (dest);
1731
1732         if (can_create_pseudo_p ())
1733           tmp_reg = gen_reg_rtx (mode);
1734
1735         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1736         if (mode == ptr_mode)
1737           {
1738             if (mode == DImode)
1739               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1740             else
1741               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1742
1743             mem = XVECEXP (SET_SRC (insn), 0, 0);
1744           }
1745         else
1746           {
1747             gcc_assert (mode == Pmode);
1748
1749             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1750             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1751           }
1752
1753         gcc_assert (GET_CODE (mem) == MEM);
1754         MEM_READONLY_P (mem) = 1;
1755         MEM_NOTRAP_P (mem) = 1;
1756         emit_insn (insn);
1757         return;
1758       }
1759
1760     case SYMBOL_SMALL_TLSGD:
1761       {
1762         rtx_insn *insns;
1763         machine_mode mode = GET_MODE (dest);
1764         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1765
1766         start_sequence ();
1767         if (TARGET_ILP32)
1768           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1769         else
1770           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1771         insns = get_insns ();
1772         end_sequence ();
1773
1774         RTL_CONST_CALL_P (insns) = 1;
1775         emit_libcall_block (insns, dest, result, imm);
1776         return;
1777       }
1778
1779     case SYMBOL_SMALL_TLSDESC:
1780       {
1781         machine_mode mode = GET_MODE (dest);
1782         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1783         rtx tp;
1784
1785         gcc_assert (mode == Pmode || mode == ptr_mode);
1786
1787         /* In ILP32, the got entry is always of SImode size.  Unlike
1788            small GOT, the dest is fixed at reg 0.  */
1789         if (TARGET_ILP32)
1790           emit_insn (gen_tlsdesc_small_si (imm));
1791         else
1792           emit_insn (gen_tlsdesc_small_di (imm));
1793         tp = aarch64_load_tp (NULL);
1794
1795         if (mode != Pmode)
1796           tp = gen_lowpart (mode, tp);
1797
1798         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1799         if (REG_P (dest))
1800           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1801         return;
1802       }
1803
1804     case SYMBOL_SMALL_TLSIE:
1805       {
1806         /* In ILP32, the mode of dest can be either SImode or DImode,
1807            while the got entry is always of SImode size.  The mode of
1808            dest depends on how dest is used: if dest is assigned to a
1809            pointer (e.g. in the memory), it has SImode; it may have
1810            DImode if dest is dereferenced to access the memeory.
1811            This is why we have to handle three different tlsie_small
1812            patterns here (two patterns for ILP32).  */
1813         machine_mode mode = GET_MODE (dest);
1814         rtx tmp_reg = gen_reg_rtx (mode);
1815         rtx tp = aarch64_load_tp (NULL);
1816
1817         if (mode == ptr_mode)
1818           {
1819             if (mode == DImode)
1820               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1821             else
1822               {
1823                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1824                 tp = gen_lowpart (mode, tp);
1825               }
1826           }
1827         else
1828           {
1829             gcc_assert (mode == Pmode);
1830             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1831           }
1832
1833         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1834         if (REG_P (dest))
1835           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1836         return;
1837       }
1838
1839     case SYMBOL_TLSLE12:
1840     case SYMBOL_TLSLE24:
1841     case SYMBOL_TLSLE32:
1842     case SYMBOL_TLSLE48:
1843       {
1844         machine_mode mode = GET_MODE (dest);
1845         rtx tp = aarch64_load_tp (NULL);
1846
1847         if (mode != Pmode)
1848           tp = gen_lowpart (mode, tp);
1849
1850         switch (type)
1851           {
1852           case SYMBOL_TLSLE12:
1853             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1854                         (dest, tp, imm));
1855             break;
1856           case SYMBOL_TLSLE24:
1857             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1858                         (dest, tp, imm));
1859           break;
1860           case SYMBOL_TLSLE32:
1861             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1862                         (dest, imm));
1863             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1864                         (dest, dest, tp));
1865           break;
1866           case SYMBOL_TLSLE48:
1867             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1868                         (dest, imm));
1869             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1870                         (dest, dest, tp));
1871             break;
1872           default:
1873             gcc_unreachable ();
1874           }
1875
1876         if (REG_P (dest))
1877           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1878         return;
1879       }
1880
1881     case SYMBOL_TINY_GOT:
1882       emit_insn (gen_ldr_got_tiny (dest, imm));
1883       return;
1884
1885     case SYMBOL_TINY_TLSIE:
1886       {
1887         machine_mode mode = GET_MODE (dest);
1888         rtx tp = aarch64_load_tp (NULL);
1889
1890         if (mode == ptr_mode)
1891           {
1892             if (mode == DImode)
1893               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1894             else
1895               {
1896                 tp = gen_lowpart (mode, tp);
1897                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1898               }
1899           }
1900         else
1901           {
1902             gcc_assert (mode == Pmode);
1903             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1904           }
1905
1906         if (REG_P (dest))
1907           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1908         return;
1909       }
1910
1911     default:
1912       gcc_unreachable ();
1913     }
1914 }
1915
1916 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1917    handle all moves if !can_create_pseudo_p ().  The distinction is
1918    important because, unlike emit_move_insn, the move expanders know
1919    how to force Pmode objects into the constant pool even when the
1920    constant pool address is not itself legitimate.  */
1921 static rtx
1922 aarch64_emit_move (rtx dest, rtx src)
1923 {
1924   return (can_create_pseudo_p ()
1925           ? emit_move_insn (dest, src)
1926           : emit_move_insn_1 (dest, src));
1927 }
1928
1929 /* Apply UNOPTAB to OP and store the result in DEST.  */
1930
1931 static void
1932 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1933 {
1934   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1935   if (dest != tmp)
1936     emit_move_insn (dest, tmp);
1937 }
1938
1939 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
1940
1941 static void
1942 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1943 {
1944   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1945                           OPTAB_DIRECT);
1946   if (dest != tmp)
1947     emit_move_insn (dest, tmp);
1948 }
1949
1950 /* Split a 128-bit move operation into two 64-bit move operations,
1951    taking care to handle partial overlap of register to register
1952    copies.  Special cases are needed when moving between GP regs and
1953    FP regs.  SRC can be a register, constant or memory; DST a register
1954    or memory.  If either operand is memory it must not have any side
1955    effects.  */
1956 void
1957 aarch64_split_128bit_move (rtx dst, rtx src)
1958 {
1959   rtx dst_lo, dst_hi;
1960   rtx src_lo, src_hi;
1961
1962   machine_mode mode = GET_MODE (dst);
1963
1964   gcc_assert (mode == TImode || mode == TFmode);
1965   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1966   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1967
1968   if (REG_P (dst) && REG_P (src))
1969     {
1970       int src_regno = REGNO (src);
1971       int dst_regno = REGNO (dst);
1972
1973       /* Handle FP <-> GP regs.  */
1974       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1975         {
1976           src_lo = gen_lowpart (word_mode, src);
1977           src_hi = gen_highpart (word_mode, src);
1978
1979           if (mode == TImode)
1980             {
1981               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1982               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1983             }
1984           else
1985             {
1986               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1987               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1988             }
1989           return;
1990         }
1991       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1992         {
1993           dst_lo = gen_lowpart (word_mode, dst);
1994           dst_hi = gen_highpart (word_mode, dst);
1995
1996           if (mode == TImode)
1997             {
1998               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1999               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
2000             }
2001           else
2002             {
2003               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
2004               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
2005             }
2006           return;
2007         }
2008     }
2009
2010   dst_lo = gen_lowpart (word_mode, dst);
2011   dst_hi = gen_highpart (word_mode, dst);
2012   src_lo = gen_lowpart (word_mode, src);
2013   src_hi = gen_highpart_mode (word_mode, mode, src);
2014
2015   /* At most one pairing may overlap.  */
2016   if (reg_overlap_mentioned_p (dst_lo, src_hi))
2017     {
2018       aarch64_emit_move (dst_hi, src_hi);
2019       aarch64_emit_move (dst_lo, src_lo);
2020     }
2021   else
2022     {
2023       aarch64_emit_move (dst_lo, src_lo);
2024       aarch64_emit_move (dst_hi, src_hi);
2025     }
2026 }
2027
2028 bool
2029 aarch64_split_128bit_move_p (rtx dst, rtx src)
2030 {
2031   return (! REG_P (src)
2032           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2033 }
2034
2035 /* Split a complex SIMD combine.  */
2036
2037 void
2038 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2039 {
2040   machine_mode src_mode = GET_MODE (src1);
2041   machine_mode dst_mode = GET_MODE (dst);
2042
2043   gcc_assert (VECTOR_MODE_P (dst_mode));
2044   gcc_assert (register_operand (dst, dst_mode)
2045               && register_operand (src1, src_mode)
2046               && register_operand (src2, src_mode));
2047
2048   rtx (*gen) (rtx, rtx, rtx);
2049
2050   switch (src_mode)
2051     {
2052     case E_V8QImode:
2053       gen = gen_aarch64_simd_combinev8qi;
2054       break;
2055     case E_V4HImode:
2056       gen = gen_aarch64_simd_combinev4hi;
2057       break;
2058     case E_V2SImode:
2059       gen = gen_aarch64_simd_combinev2si;
2060       break;
2061     case E_V4HFmode:
2062       gen = gen_aarch64_simd_combinev4hf;
2063       break;
2064     case E_V2SFmode:
2065       gen = gen_aarch64_simd_combinev2sf;
2066       break;
2067     case E_DImode:
2068       gen = gen_aarch64_simd_combinedi;
2069       break;
2070     case E_DFmode:
2071       gen = gen_aarch64_simd_combinedf;
2072       break;
2073     default:
2074       gcc_unreachable ();
2075     }
2076
2077   emit_insn (gen (dst, src1, src2));
2078   return;
2079 }
2080
2081 /* Split a complex SIMD move.  */
2082
2083 void
2084 aarch64_split_simd_move (rtx dst, rtx src)
2085 {
2086   machine_mode src_mode = GET_MODE (src);
2087   machine_mode dst_mode = GET_MODE (dst);
2088
2089   gcc_assert (VECTOR_MODE_P (dst_mode));
2090
2091   if (REG_P (dst) && REG_P (src))
2092     {
2093       rtx (*gen) (rtx, rtx);
2094
2095       gcc_assert (VECTOR_MODE_P (src_mode));
2096
2097       switch (src_mode)
2098         {
2099         case E_V16QImode:
2100           gen = gen_aarch64_split_simd_movv16qi;
2101           break;
2102         case E_V8HImode:
2103           gen = gen_aarch64_split_simd_movv8hi;
2104           break;
2105         case E_V4SImode:
2106           gen = gen_aarch64_split_simd_movv4si;
2107           break;
2108         case E_V2DImode:
2109           gen = gen_aarch64_split_simd_movv2di;
2110           break;
2111         case E_V8HFmode:
2112           gen = gen_aarch64_split_simd_movv8hf;
2113           break;
2114         case E_V4SFmode:
2115           gen = gen_aarch64_split_simd_movv4sf;
2116           break;
2117         case E_V2DFmode:
2118           gen = gen_aarch64_split_simd_movv2df;
2119           break;
2120         default:
2121           gcc_unreachable ();
2122         }
2123
2124       emit_insn (gen (dst, src));
2125       return;
2126     }
2127 }
2128
2129 bool
2130 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2131                               machine_mode ymode, rtx y)
2132 {
2133   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2134   gcc_assert (r != NULL);
2135   return rtx_equal_p (x, r);
2136 }
2137
2138
2139 static rtx
2140 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2141 {
2142   if (can_create_pseudo_p ())
2143     return force_reg (mode, value);
2144   else
2145     {
2146       gcc_assert (x);
2147       aarch64_emit_move (x, value);
2148       return x;
2149     }
2150 }
2151
2152 /* Return true if we can move VALUE into a register using a single
2153    CNT[BHWD] instruction.  */
2154
2155 static bool
2156 aarch64_sve_cnt_immediate_p (poly_int64 value)
2157 {
2158   HOST_WIDE_INT factor = value.coeffs[0];
2159   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
2160   return (value.coeffs[1] == factor
2161           && IN_RANGE (factor, 2, 16 * 16)
2162           && (factor & 1) == 0
2163           && factor <= 16 * (factor & -factor));
2164 }
2165
2166 /* Likewise for rtx X.  */
2167
2168 bool
2169 aarch64_sve_cnt_immediate_p (rtx x)
2170 {
2171   poly_int64 value;
2172   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2173 }
2174
2175 /* Return the asm string for an instruction with a CNT-like vector size
2176    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2177    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2178    first part of the operands template (the part that comes before the
2179    vector size itself).  FACTOR is the number of quadwords.
2180    NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2181    If it is zero, we can use any element size.  */
2182
2183 static char *
2184 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2185                                   unsigned int factor,
2186                                   unsigned int nelts_per_vq)
2187 {
2188   static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2189
2190   if (nelts_per_vq == 0)
2191     /* There is some overlap in the ranges of the four CNT instructions.
2192        Here we always use the smallest possible element size, so that the
2193        multiplier is 1 whereever possible.  */
2194     nelts_per_vq = factor & -factor;
2195   int shift = std::min (exact_log2 (nelts_per_vq), 4);
2196   gcc_assert (IN_RANGE (shift, 1, 4));
2197   char suffix = "dwhb"[shift - 1];
2198
2199   factor >>= shift;
2200   unsigned int written;
2201   if (factor == 1)
2202     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2203                         prefix, suffix, operands);
2204   else
2205     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2206                         prefix, suffix, operands, factor);
2207   gcc_assert (written < sizeof (buffer));
2208   return buffer;
2209 }
2210
2211 /* Return the asm string for an instruction with a CNT-like vector size
2212    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2213    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2214    first part of the operands template (the part that comes before the
2215    vector size itself).  X is the value of the vector size operand,
2216    as a polynomial integer rtx.  */
2217
2218 char *
2219 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2220                                   rtx x)
2221 {
2222   poly_int64 value = rtx_to_poly_int64 (x);
2223   gcc_assert (aarch64_sve_cnt_immediate_p (value));
2224   return aarch64_output_sve_cnt_immediate (prefix, operands,
2225                                            value.coeffs[1], 0);
2226 }
2227
2228 /* Return true if we can add VALUE to a register using a single ADDVL
2229    or ADDPL instruction.  */
2230
2231 static bool
2232 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2233 {
2234   HOST_WIDE_INT factor = value.coeffs[0];
2235   if (factor == 0 || value.coeffs[1] != factor)
2236     return false;
2237   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2238      and a value of 16 is one vector width.  */
2239   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2240           || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2241 }
2242
2243 /* Likewise for rtx X.  */
2244
2245 bool
2246 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2247 {
2248   poly_int64 value;
2249   return (poly_int_rtx_p (x, &value)
2250           && aarch64_sve_addvl_addpl_immediate_p (value));
2251 }
2252
2253 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2254    and storing the result in operand 0.  */
2255
2256 char *
2257 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2258 {
2259   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2260   poly_int64 offset_value = rtx_to_poly_int64 (offset);
2261   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2262
2263   /* Use INC or DEC if possible.  */
2264   if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2265     {
2266       if (aarch64_sve_cnt_immediate_p (offset_value))
2267         return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2268                                                  offset_value.coeffs[1], 0);
2269       if (aarch64_sve_cnt_immediate_p (-offset_value))
2270         return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2271                                                  -offset_value.coeffs[1], 0);
2272     }
2273
2274   int factor = offset_value.coeffs[1];
2275   if ((factor & 15) == 0)
2276     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2277   else
2278     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2279   return buffer;
2280 }
2281
2282 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2283    instruction.  If it is, store the number of elements in each vector
2284    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2285    factor in *FACTOR_OUT (if nonnull).  */
2286
2287 bool
2288 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2289                                  unsigned int *nelts_per_vq_out)
2290 {
2291   rtx elt;
2292   poly_int64 value;
2293
2294   if (!const_vec_duplicate_p (x, &elt)
2295       || !poly_int_rtx_p (elt, &value))
2296     return false;
2297
2298   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2299   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2300     /* There's no vector INCB.  */
2301     return false;
2302
2303   HOST_WIDE_INT factor = value.coeffs[0];
2304   if (value.coeffs[1] != factor)
2305     return false;
2306
2307   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
2308   if ((factor % nelts_per_vq) != 0
2309       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2310     return false;
2311
2312   if (factor_out)
2313     *factor_out = factor;
2314   if (nelts_per_vq_out)
2315     *nelts_per_vq_out = nelts_per_vq;
2316   return true;
2317 }
2318
2319 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2320    instruction.  */
2321
2322 bool
2323 aarch64_sve_inc_dec_immediate_p (rtx x)
2324 {
2325   return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2326 }
2327
2328 /* Return the asm template for an SVE vector INC or DEC instruction.
2329    OPERANDS gives the operands before the vector count and X is the
2330    value of the vector count operand itself.  */
2331
2332 char *
2333 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2334 {
2335   int factor;
2336   unsigned int nelts_per_vq;
2337   if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2338     gcc_unreachable ();
2339   if (factor < 0)
2340     return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2341                                              nelts_per_vq);
2342   else
2343     return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2344                                              nelts_per_vq);
2345 }
2346
2347 static int
2348 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2349                                 scalar_int_mode mode)
2350 {
2351   int i;
2352   unsigned HOST_WIDE_INT val, val2, mask;
2353   int one_match, zero_match;
2354   int num_insns;
2355
2356   val = INTVAL (imm);
2357
2358   if (aarch64_move_imm (val, mode))
2359     {
2360       if (generate)
2361         emit_insn (gen_rtx_SET (dest, imm));
2362       return 1;
2363     }
2364
2365   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2366      (with XXXX non-zero). In that case check to see if the move can be done in
2367      a smaller mode.  */
2368   val2 = val & 0xffffffff;
2369   if (mode == DImode
2370       && aarch64_move_imm (val2, SImode)
2371       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2372     {
2373       if (generate)
2374         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2375
2376       /* Check if we have to emit a second instruction by checking to see
2377          if any of the upper 32 bits of the original DI mode value is set.  */
2378       if (val == val2)
2379         return 1;
2380
2381       i = (val >> 48) ? 48 : 32;
2382
2383       if (generate)
2384          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2385                                     GEN_INT ((val >> i) & 0xffff)));
2386
2387       return 2;
2388     }
2389
2390   if ((val >> 32) == 0 || mode == SImode)
2391     {
2392       if (generate)
2393         {
2394           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2395           if (mode == SImode)
2396             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2397                                        GEN_INT ((val >> 16) & 0xffff)));
2398           else
2399             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2400                                        GEN_INT ((val >> 16) & 0xffff)));
2401         }
2402       return 2;
2403     }
2404
2405   /* Remaining cases are all for DImode.  */
2406
2407   mask = 0xffff;
2408   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2409     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2410   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2411     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2412
2413   if (zero_match != 2 && one_match != 2)
2414     {
2415       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2416          For a 64-bit bitmask try whether changing 16 bits to all ones or
2417          zeroes creates a valid bitmask.  To check any repeated bitmask,
2418          try using 16 bits from the other 32-bit half of val.  */
2419
2420       for (i = 0; i < 64; i += 16, mask <<= 16)
2421         {
2422           val2 = val & ~mask;
2423           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2424             break;
2425           val2 = val | mask;
2426           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2427             break;
2428           val2 = val2 & ~mask;
2429           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2430           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2431             break;
2432         }
2433       if (i != 64)
2434         {
2435           if (generate)
2436             {
2437               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2438               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2439                                          GEN_INT ((val >> i) & 0xffff)));
2440             }
2441           return 2;
2442         }
2443     }
2444
2445   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2446      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
2447      otherwise skip zero bits.  */
2448
2449   num_insns = 1;
2450   mask = 0xffff;
2451   val2 = one_match > zero_match ? ~val : val;
2452   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2453
2454   if (generate)
2455     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2456                                            ? (val | ~(mask << i))
2457                                            : (val & (mask << i)))));
2458   for (i += 16; i < 64; i += 16)
2459     {
2460       if ((val2 & (mask << i)) == 0)
2461         continue;
2462       if (generate)
2463         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2464                                    GEN_INT ((val >> i) & 0xffff)));
2465       num_insns ++;
2466     }
2467
2468   return num_insns;
2469 }
2470
2471 /* Return whether imm is a 128-bit immediate which is simple enough to
2472    expand inline.  */
2473 bool
2474 aarch64_mov128_immediate (rtx imm)
2475 {
2476   if (GET_CODE (imm) == CONST_INT)
2477     return true;
2478
2479   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2480
2481   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2482   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2483
2484   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2485          + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2486 }
2487
2488
2489 /* Return the number of temporary registers that aarch64_add_offset_1
2490    would need to add OFFSET to a register.  */
2491
2492 static unsigned int
2493 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2494 {
2495   return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2496 }
2497
2498 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
2499    a non-polynomial OFFSET.  MODE is the mode of the addition.
2500    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2501    be set and CFA adjustments added to the generated instructions.
2502
2503    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2504    temporary if register allocation is already complete.  This temporary
2505    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
2506    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2507    the immediate again.
2508
2509    Since this function may be used to adjust the stack pointer, we must
2510    ensure that it cannot cause transient stack deallocation (for example
2511    by first incrementing SP and then decrementing when adjusting by a
2512    large immediate).  */
2513
2514 static void
2515 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2516                       rtx src, HOST_WIDE_INT offset, rtx temp1,
2517                       bool frame_related_p, bool emit_move_imm)
2518 {
2519   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2520   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2521
2522   HOST_WIDE_INT moffset = abs_hwi (offset);
2523   rtx_insn *insn;
2524
2525   if (!moffset)
2526     {
2527       if (!rtx_equal_p (dest, src))
2528         {
2529           insn = emit_insn (gen_rtx_SET (dest, src));
2530           RTX_FRAME_RELATED_P (insn) = frame_related_p;
2531         }
2532       return;
2533     }
2534
2535   /* Single instruction adjustment.  */
2536   if (aarch64_uimm12_shift (moffset))
2537     {
2538       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2539       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2540       return;
2541     }
2542
2543   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2544      and either:
2545
2546      a) the offset cannot be loaded by a 16-bit move or
2547      b) there is no spare register into which we can move it.  */
2548   if (moffset < 0x1000000
2549       && ((!temp1 && !can_create_pseudo_p ())
2550           || !aarch64_move_imm (moffset, mode)))
2551     {
2552       HOST_WIDE_INT low_off = moffset & 0xfff;
2553
2554       low_off = offset < 0 ? -low_off : low_off;
2555       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2556       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2557       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2558       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2559       return;
2560     }
2561
2562   /* Emit a move immediate if required and an addition/subtraction.  */
2563   if (emit_move_imm)
2564     {
2565       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2566       temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2567     }
2568   insn = emit_insn (offset < 0
2569                     ? gen_sub3_insn (dest, src, temp1)
2570                     : gen_add3_insn (dest, src, temp1));
2571   if (frame_related_p)
2572     {
2573       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2574       rtx adj = plus_constant (mode, src, offset);
2575       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2576     }
2577 }
2578
2579 /* Return the number of temporary registers that aarch64_add_offset
2580    would need to move OFFSET into a register or add OFFSET to a register;
2581    ADD_P is true if we want the latter rather than the former.  */
2582
2583 static unsigned int
2584 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2585 {
2586   /* This follows the same structure as aarch64_add_offset.  */
2587   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2588     return 0;
2589
2590   unsigned int count = 0;
2591   HOST_WIDE_INT factor = offset.coeffs[1];
2592   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2593   poly_int64 poly_offset (factor, factor);
2594   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2595     /* Need one register for the ADDVL/ADDPL result.  */
2596     count += 1;
2597   else if (factor != 0)
2598     {
2599       factor = abs (factor);
2600       if (factor > 16 * (factor & -factor))
2601         /* Need one register for the CNT result and one for the multiplication
2602            factor.  If necessary, the second temporary can be reused for the
2603            constant part of the offset.  */
2604         return 2;
2605       /* Need one register for the CNT result (which might then
2606          be shifted).  */
2607       count += 1;
2608     }
2609   return count + aarch64_add_offset_1_temporaries (constant);
2610 }
2611
2612 /* If X can be represented as a poly_int64, return the number
2613    of temporaries that are required to add it to a register.
2614    Return -1 otherwise.  */
2615
2616 int
2617 aarch64_add_offset_temporaries (rtx x)
2618 {
2619   poly_int64 offset;
2620   if (!poly_int_rtx_p (x, &offset))
2621     return -1;
2622   return aarch64_offset_temporaries (true, offset);
2623 }
2624
2625 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
2626    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2627    be set and CFA adjustments added to the generated instructions.
2628
2629    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2630    temporary if register allocation is already complete.  This temporary
2631    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2632    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2633    false to avoid emitting the immediate again.
2634
2635    TEMP2, if nonnull, is a second temporary register that doesn't
2636    overlap either DEST or REG.
2637
2638    Since this function may be used to adjust the stack pointer, we must
2639    ensure that it cannot cause transient stack deallocation (for example
2640    by first incrementing SP and then decrementing when adjusting by a
2641    large immediate).  */
2642
2643 static void
2644 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2645                     poly_int64 offset, rtx temp1, rtx temp2,
2646                     bool frame_related_p, bool emit_move_imm = true)
2647 {
2648   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2649   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2650   gcc_assert (temp1 == NULL_RTX
2651               || !frame_related_p
2652               || !reg_overlap_mentioned_p (temp1, dest));
2653   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2654
2655   /* Try using ADDVL or ADDPL to add the whole value.  */
2656   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2657     {
2658       rtx offset_rtx = gen_int_mode (offset, mode);
2659       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2660       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2661       return;
2662     }
2663
2664   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2665      SVE vector register, over and above the minimum size of 128 bits.
2666      This is equivalent to half the value returned by CNTD with a
2667      vector shape of ALL.  */
2668   HOST_WIDE_INT factor = offset.coeffs[1];
2669   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2670
2671   /* Try using ADDVL or ADDPL to add the VG-based part.  */
2672   poly_int64 poly_offset (factor, factor);
2673   if (src != const0_rtx
2674       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2675     {
2676       rtx offset_rtx = gen_int_mode (poly_offset, mode);
2677       if (frame_related_p)
2678         {
2679           rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2680           RTX_FRAME_RELATED_P (insn) = true;
2681           src = dest;
2682         }
2683       else
2684         {
2685           rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2686           src = aarch64_force_temporary (mode, temp1, addr);
2687           temp1 = temp2;
2688           temp2 = NULL_RTX;
2689         }
2690     }
2691   /* Otherwise use a CNT-based sequence.  */
2692   else if (factor != 0)
2693     {
2694       /* Use a subtraction if we have a negative factor.  */
2695       rtx_code code = PLUS;
2696       if (factor < 0)
2697         {
2698           factor = -factor;
2699           code = MINUS;
2700         }
2701
2702       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
2703          into the multiplication.  */
2704       rtx val;
2705       int shift = 0;
2706       if (factor & 1)
2707         /* Use a right shift by 1.  */
2708         shift = -1;
2709       else
2710         factor /= 2;
2711       HOST_WIDE_INT low_bit = factor & -factor;
2712       if (factor <= 16 * low_bit)
2713         {
2714           if (factor > 16 * 8)
2715             {
2716               /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2717                  the value with the minimum multiplier and shift it into
2718                  position.  */
2719               int extra_shift = exact_log2 (low_bit);
2720               shift += extra_shift;
2721               factor >>= extra_shift;
2722             }
2723           val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2724         }
2725       else
2726         {
2727           /* Use CNTD, then multiply it by FACTOR.  */
2728           val = gen_int_mode (poly_int64 (2, 2), mode);
2729           val = aarch64_force_temporary (mode, temp1, val);
2730
2731           /* Go back to using a negative multiplication factor if we have
2732              no register from which to subtract.  */
2733           if (code == MINUS && src == const0_rtx)
2734             {
2735               factor = -factor;
2736               code = PLUS;
2737             }
2738           rtx coeff1 = gen_int_mode (factor, mode);
2739           coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2740           val = gen_rtx_MULT (mode, val, coeff1);
2741         }
2742
2743       if (shift > 0)
2744         {
2745           /* Multiply by 1 << SHIFT.  */
2746           val = aarch64_force_temporary (mode, temp1, val);
2747           val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2748         }
2749       else if (shift == -1)
2750         {
2751           /* Divide by 2.  */
2752           val = aarch64_force_temporary (mode, temp1, val);
2753           val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2754         }
2755
2756       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
2757       if (src != const0_rtx)
2758         {
2759           val = aarch64_force_temporary (mode, temp1, val);
2760           val = gen_rtx_fmt_ee (code, mode, src, val);
2761         }
2762       else if (code == MINUS)
2763         {
2764           val = aarch64_force_temporary (mode, temp1, val);
2765           val = gen_rtx_NEG (mode, val);
2766         }
2767
2768       if (constant == 0 || frame_related_p)
2769         {
2770           rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2771           if (frame_related_p)
2772             {
2773               RTX_FRAME_RELATED_P (insn) = true;
2774               add_reg_note (insn, REG_CFA_ADJUST_CFA,
2775                             gen_rtx_SET (dest, plus_constant (Pmode, src,
2776                                                               poly_offset)));
2777             }
2778           src = dest;
2779           if (constant == 0)
2780             return;
2781         }
2782       else
2783         {
2784           src = aarch64_force_temporary (mode, temp1, val);
2785           temp1 = temp2;
2786           temp2 = NULL_RTX;
2787         }
2788
2789       emit_move_imm = true;
2790     }
2791
2792   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2793                         frame_related_p, emit_move_imm);
2794 }
2795
2796 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2797    than a poly_int64.  */
2798
2799 void
2800 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2801                           rtx offset_rtx, rtx temp1, rtx temp2)
2802 {
2803   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2804                       temp1, temp2, false);
2805 }
2806
2807 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2808    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
2809    if TEMP1 already contains abs (DELTA).  */
2810
2811 static inline void
2812 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2813 {
2814   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2815                       temp1, temp2, true, emit_move_imm);
2816 }
2817
2818 /* Subtract DELTA from the stack pointer, marking the instructions
2819    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
2820    if nonnull.  */
2821
2822 static inline void
2823 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2824 {
2825   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2826                       temp1, temp2, frame_related_p);
2827 }
2828
2829 /* Set DEST to (vec_series BASE STEP).  */
2830
2831 static void
2832 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2833 {
2834   machine_mode mode = GET_MODE (dest);
2835   scalar_mode inner = GET_MODE_INNER (mode);
2836
2837   /* Each operand can be a register or an immediate in the range [-16, 15].  */
2838   if (!aarch64_sve_index_immediate_p (base))
2839     base = force_reg (inner, base);
2840   if (!aarch64_sve_index_immediate_p (step))
2841     step = force_reg (inner, step);
2842
2843   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2844 }
2845
2846 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2847    integer of mode INT_MODE.  Return true on success.  */
2848
2849 static bool
2850 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2851                                       rtx src)
2852 {
2853   /* If the constant is smaller than 128 bits, we can do the move
2854      using a vector of SRC_MODEs.  */
2855   if (src_mode != TImode)
2856     {
2857       poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2858                                      GET_MODE_SIZE (src_mode));
2859       machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2860       emit_move_insn (gen_lowpart (dup_mode, dest),
2861                       gen_const_vec_duplicate (dup_mode, src));
2862       return true;
2863     }
2864
2865   /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
2866   src = force_const_mem (src_mode, src);
2867   if (!src)
2868     return false;
2869
2870   /* Make sure that the address is legitimate.  */
2871   if (!aarch64_sve_ld1r_operand_p (src))
2872     {
2873       rtx addr = force_reg (Pmode, XEXP (src, 0));
2874       src = replace_equiv_address (src, addr);
2875     }
2876
2877   machine_mode mode = GET_MODE (dest);
2878   unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2879   machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2880   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2881   src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2882   emit_insn (gen_rtx_SET (dest, src));
2883   return true;
2884 }
2885
2886 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2887    isn't a simple duplicate or series.  */
2888
2889 static void
2890 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2891 {
2892   machine_mode mode = GET_MODE (src);
2893   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2894   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2895   gcc_assert (npatterns > 1);
2896
2897   if (nelts_per_pattern == 1)
2898     {
2899       /* The constant is a repeating seqeuence of at least two elements,
2900          where the repeating elements occupy no more than 128 bits.
2901          Get an integer representation of the replicated value.  */
2902       scalar_int_mode int_mode;
2903       if (BYTES_BIG_ENDIAN)
2904         /* For now, always use LD1RQ to load the value on big-endian
2905            targets, since the handling of smaller integers includes a
2906            subreg that is semantically an element reverse.  */
2907         int_mode = TImode;
2908       else
2909         {
2910           unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2911           gcc_assert (int_bits <= 128);
2912           int_mode = int_mode_for_size (int_bits, 0).require ();
2913         }
2914       rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2915       if (int_value
2916           && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2917         return;
2918     }
2919
2920   /* Expand each pattern individually.  */
2921   rtx_vector_builder builder;
2922   auto_vec<rtx, 16> vectors (npatterns);
2923   for (unsigned int i = 0; i < npatterns; ++i)
2924     {
2925       builder.new_vector (mode, 1, nelts_per_pattern);
2926       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2927         builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2928       vectors.quick_push (force_reg (mode, builder.build ()));
2929     }
2930
2931   /* Use permutes to interleave the separate vectors.  */
2932   while (npatterns > 1)
2933     {
2934       npatterns /= 2;
2935       for (unsigned int i = 0; i < npatterns; ++i)
2936         {
2937           rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2938           rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2939           emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2940           vectors[i] = tmp;
2941         }
2942     }
2943   gcc_assert (vectors[0] == dest);
2944 }
2945
2946 /* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
2947    is a pattern that can be used to set DEST to a replicated scalar
2948    element.  */
2949
2950 void
2951 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2952                               rtx (*gen_vec_duplicate) (rtx, rtx))
2953 {
2954   machine_mode mode = GET_MODE (dest);
2955
2956   /* Check on what type of symbol it is.  */
2957   scalar_int_mode int_mode;
2958   if ((GET_CODE (imm) == SYMBOL_REF
2959        || GET_CODE (imm) == LABEL_REF
2960        || GET_CODE (imm) == CONST
2961        || GET_CODE (imm) == CONST_POLY_INT)
2962       && is_a <scalar_int_mode> (mode, &int_mode))
2963     {
2964       rtx mem;
2965       poly_int64 offset;
2966       HOST_WIDE_INT const_offset;
2967       enum aarch64_symbol_type sty;
2968
2969       /* If we have (const (plus symbol offset)), separate out the offset
2970          before we start classifying the symbol.  */
2971       rtx base = strip_offset (imm, &offset);
2972
2973       /* We must always add an offset involving VL separately, rather than
2974          folding it into the relocation.  */
2975       if (!offset.is_constant (&const_offset))
2976         {
2977           if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2978             emit_insn (gen_rtx_SET (dest, imm));
2979           else
2980             {
2981               /* Do arithmetic on 32-bit values if the result is smaller
2982                  than that.  */
2983               if (partial_subreg_p (int_mode, SImode))
2984                 {
2985                   /* It is invalid to do symbol calculations in modes
2986                      narrower than SImode.  */
2987                   gcc_assert (base == const0_rtx);
2988                   dest = gen_lowpart (SImode, dest);
2989                   int_mode = SImode;
2990                 }
2991               if (base != const0_rtx)
2992                 {
2993                   base = aarch64_force_temporary (int_mode, dest, base);
2994                   aarch64_add_offset (int_mode, dest, base, offset,
2995                                       NULL_RTX, NULL_RTX, false);
2996                 }
2997               else
2998                 aarch64_add_offset (int_mode, dest, base, offset,
2999                                     dest, NULL_RTX, false);
3000             }
3001           return;
3002         }
3003
3004       sty = aarch64_classify_symbol (base, const_offset);
3005       switch (sty)
3006         {
3007         case SYMBOL_FORCE_TO_MEM:
3008           if (const_offset != 0
3009               && targetm.cannot_force_const_mem (int_mode, imm))
3010             {
3011               gcc_assert (can_create_pseudo_p ());
3012               base = aarch64_force_temporary (int_mode, dest, base);
3013               aarch64_add_offset (int_mode, dest, base, const_offset,
3014                                   NULL_RTX, NULL_RTX, false);
3015               return;
3016             }
3017
3018           mem = force_const_mem (ptr_mode, imm);
3019           gcc_assert (mem);
3020
3021           /* If we aren't generating PC relative literals, then
3022              we need to expand the literal pool access carefully.
3023              This is something that needs to be done in a number
3024              of places, so could well live as a separate function.  */
3025           if (!aarch64_pcrelative_literal_loads)
3026             {
3027               gcc_assert (can_create_pseudo_p ());
3028               base = gen_reg_rtx (ptr_mode);
3029               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3030               if (ptr_mode != Pmode)
3031                 base = convert_memory_address (Pmode, base);
3032               mem = gen_rtx_MEM (ptr_mode, base);
3033             }
3034
3035           if (int_mode != ptr_mode)
3036             mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3037
3038           emit_insn (gen_rtx_SET (dest, mem));
3039
3040           return;
3041
3042         case SYMBOL_SMALL_TLSGD:
3043         case SYMBOL_SMALL_TLSDESC:
3044         case SYMBOL_SMALL_TLSIE:
3045         case SYMBOL_SMALL_GOT_28K:
3046         case SYMBOL_SMALL_GOT_4G:
3047         case SYMBOL_TINY_GOT:
3048         case SYMBOL_TINY_TLSIE:
3049           if (const_offset != 0)
3050             {
3051               gcc_assert(can_create_pseudo_p ());
3052               base = aarch64_force_temporary (int_mode, dest, base);
3053               aarch64_add_offset (int_mode, dest, base, const_offset,
3054                                   NULL_RTX, NULL_RTX, false);
3055               return;
3056             }
3057           /* FALLTHRU */
3058
3059         case SYMBOL_SMALL_ABSOLUTE:
3060         case SYMBOL_TINY_ABSOLUTE:
3061         case SYMBOL_TLSLE12:
3062         case SYMBOL_TLSLE24:
3063         case SYMBOL_TLSLE32:
3064         case SYMBOL_TLSLE48:
3065           aarch64_load_symref_appropriately (dest, imm, sty);
3066           return;
3067
3068         default:
3069           gcc_unreachable ();
3070         }
3071     }
3072
3073   if (!CONST_INT_P (imm))
3074     {
3075       rtx base, step, value;
3076       if (GET_CODE (imm) == HIGH
3077           || aarch64_simd_valid_immediate (imm, NULL))
3078         emit_insn (gen_rtx_SET (dest, imm));
3079       else if (const_vec_series_p (imm, &base, &step))
3080         aarch64_expand_vec_series (dest, base, step);
3081       else if (const_vec_duplicate_p (imm, &value))
3082         {
3083           /* If the constant is out of range of an SVE vector move,
3084              load it from memory if we can, otherwise move it into
3085              a register and use a DUP.  */
3086           scalar_mode inner_mode = GET_MODE_INNER (mode);
3087           rtx op = force_const_mem (inner_mode, value);
3088           if (!op)
3089             op = force_reg (inner_mode, value);
3090           else if (!aarch64_sve_ld1r_operand_p (op))
3091             {
3092               rtx addr = force_reg (Pmode, XEXP (op, 0));
3093               op = replace_equiv_address (op, addr);
3094             }
3095           emit_insn (gen_vec_duplicate (dest, op));
3096         }
3097       else if (GET_CODE (imm) == CONST_VECTOR
3098                && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3099         aarch64_expand_sve_const_vector (dest, imm);
3100       else
3101         {
3102           rtx mem = force_const_mem (mode, imm);
3103           gcc_assert (mem);
3104           emit_move_insn (dest, mem);
3105         }
3106
3107       return;
3108     }
3109
3110   aarch64_internal_mov_immediate (dest, imm, true,
3111                                   as_a <scalar_int_mode> (mode));
3112 }
3113
3114 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
3115    that is known to contain PTRUE.  */
3116
3117 void
3118 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3119 {
3120   emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3121                                                 gen_rtvec (2, pred, src),
3122                                                 UNSPEC_MERGE_PTRUE)));
3123 }
3124
3125 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3126    operand is in memory.  In this case we need to use the predicated LD1
3127    and ST1 instead of LDR and STR, both for correctness on big-endian
3128    targets and because LD1 and ST1 support a wider range of addressing modes.
3129    PRED_MODE is the mode of the predicate.
3130
3131    See the comment at the head of aarch64-sve.md for details about the
3132    big-endian handling.  */
3133
3134 void
3135 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3136 {
3137   machine_mode mode = GET_MODE (dest);
3138   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3139   if (!register_operand (src, mode)
3140       && !register_operand (dest, mode))
3141     {
3142       rtx tmp = gen_reg_rtx (mode);
3143       if (MEM_P (src))
3144         aarch64_emit_sve_pred_move (tmp, ptrue, src);
3145       else
3146         emit_move_insn (tmp, src);
3147       src = tmp;
3148     }
3149   aarch64_emit_sve_pred_move (dest, ptrue, src);
3150 }
3151
3152 /* Called only on big-endian targets.  See whether an SVE vector move
3153    from SRC to DEST is effectively a REV[BHW] instruction, because at
3154    least one operand is a subreg of an SVE vector that has wider or
3155    narrower elements.  Return true and emit the instruction if so.
3156
3157    For example:
3158
3159      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3160
3161    represents a VIEW_CONVERT between the following vectors, viewed
3162    in memory order:
3163
3164      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
3165      R1: { [0],      [1],      [2],      [3],     ... }
3166
3167    The high part of lane X in R2 should therefore correspond to lane X*2
3168    of R1, but the register representations are:
3169
3170          msb                                      lsb
3171      R2: ...... [1].high  [1].low   [0].high  [0].low
3172      R1: ...... [3]       [2]       [1]       [0]
3173
3174    where the low part of lane X in R2 corresponds to lane X*2 in R1.
3175    We therefore need a reverse operation to swap the high and low values
3176    around.
3177
3178    This is purely an optimization.  Without it we would spill the
3179    subreg operand to the stack in one mode and reload it in the
3180    other mode, which has the same effect as the REV.  */
3181
3182 bool
3183 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3184 {
3185   gcc_assert (BYTES_BIG_ENDIAN);
3186   if (GET_CODE (dest) == SUBREG)
3187     dest = SUBREG_REG (dest);
3188   if (GET_CODE (src) == SUBREG)
3189     src = SUBREG_REG (src);
3190
3191   /* The optimization handles two single SVE REGs with different element
3192      sizes.  */
3193   if (!REG_P (dest)
3194       || !REG_P (src)
3195       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3196       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3197       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3198           == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3199     return false;
3200
3201   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
3202   rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3203   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3204                                UNSPEC_REV_SUBREG);
3205   emit_insn (gen_rtx_SET (dest, unspec));
3206   return true;
3207 }
3208
3209 /* Return a copy of X with mode MODE, without changing its other
3210    attributes.  Unlike gen_lowpart, this doesn't care whether the
3211    mode change is valid.  */
3212
3213 static rtx
3214 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3215 {
3216   if (GET_MODE (x) == mode)
3217     return x;
3218
3219   x = shallow_copy_rtx (x);
3220   set_mode_and_regno (x, mode, REGNO (x));
3221   return x;
3222 }
3223
3224 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3225    operands.  */
3226
3227 void
3228 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3229 {
3230   /* Decide which REV operation we need.  The mode with narrower elements
3231      determines the mode of the operands and the mode with the wider
3232      elements determines the reverse width.  */
3233   machine_mode mode_with_wider_elts = GET_MODE (dest);
3234   machine_mode mode_with_narrower_elts = GET_MODE (src);
3235   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3236       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3237     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3238
3239   unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3240   unsigned int unspec;
3241   if (wider_bytes == 8)
3242     unspec = UNSPEC_REV64;
3243   else if (wider_bytes == 4)
3244     unspec = UNSPEC_REV32;
3245   else if (wider_bytes == 2)
3246     unspec = UNSPEC_REV16;
3247   else
3248     gcc_unreachable ();
3249   machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3250
3251   /* Emit:
3252
3253        (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3254                          UNSPEC_MERGE_PTRUE))
3255
3256      with the appropriate modes.  */
3257   ptrue = gen_lowpart (pred_mode, ptrue);
3258   dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3259   src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3260   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3261   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3262                         UNSPEC_MERGE_PTRUE);
3263   emit_insn (gen_rtx_SET (dest, src));
3264 }
3265
3266 static bool
3267 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3268                                  tree exp ATTRIBUTE_UNUSED)
3269 {
3270   /* Currently, always true.  */
3271   return true;
3272 }
3273
3274 /* Implement TARGET_PASS_BY_REFERENCE.  */
3275
3276 static bool
3277 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3278                            machine_mode mode,
3279                            const_tree type,
3280                            bool named ATTRIBUTE_UNUSED)
3281 {
3282   HOST_WIDE_INT size;
3283   machine_mode dummymode;
3284   int nregs;
3285
3286   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
3287   if (mode == BLKmode && type)
3288     size = int_size_in_bytes (type);
3289   else
3290     /* No frontends can create types with variable-sized modes, so we
3291        shouldn't be asked to pass or return them.  */
3292     size = GET_MODE_SIZE (mode).to_constant ();
3293
3294   /* Aggregates are passed by reference based on their size.  */
3295   if (type && AGGREGATE_TYPE_P (type))
3296     {
3297       size = int_size_in_bytes (type);
3298     }
3299
3300   /* Variable sized arguments are always returned by reference.  */
3301   if (size < 0)
3302     return true;
3303
3304   /* Can this be a candidate to be passed in fp/simd register(s)?  */
3305   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3306                                                &dummymode, &nregs,
3307                                                NULL))
3308     return false;
3309
3310   /* Arguments which are variable sized or larger than 2 registers are
3311      passed by reference unless they are a homogenous floating point
3312      aggregate.  */
3313   return size > 2 * UNITS_PER_WORD;
3314 }
3315
3316 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
3317 static bool
3318 aarch64_return_in_msb (const_tree valtype)
3319 {
3320   machine_mode dummy_mode;
3321   int dummy_int;
3322
3323   /* Never happens in little-endian mode.  */
3324   if (!BYTES_BIG_ENDIAN)
3325     return false;
3326
3327   /* Only composite types smaller than or equal to 16 bytes can
3328      be potentially returned in registers.  */
3329   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3330       || int_size_in_bytes (valtype) <= 0
3331       || int_size_in_bytes (valtype) > 16)
3332     return false;
3333
3334   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3335      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3336      is always passed/returned in the least significant bits of fp/simd
3337      register(s).  */
3338   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3339                                                &dummy_mode, &dummy_int, NULL))
3340     return false;
3341
3342   return true;
3343 }
3344
3345 /* Implement TARGET_FUNCTION_VALUE.
3346    Define how to find the value returned by a function.  */
3347
3348 static rtx
3349 aarch64_function_value (const_tree type, const_tree func,
3350                         bool outgoing ATTRIBUTE_UNUSED)
3351 {
3352   machine_mode mode;
3353   int unsignedp;
3354   int count;
3355   machine_mode ag_mode;
3356
3357   mode = TYPE_MODE (type);
3358   if (INTEGRAL_TYPE_P (type))
3359     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3360
3361   if (aarch64_return_in_msb (type))
3362     {
3363       HOST_WIDE_INT size = int_size_in_bytes (type);
3364
3365       if (size % UNITS_PER_WORD != 0)
3366         {
3367           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3368           mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3369         }
3370     }
3371
3372   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3373                                                &ag_mode, &count, NULL))
3374     {
3375       if (!aarch64_composite_type_p (type, mode))
3376         {
3377           gcc_assert (count == 1 && mode == ag_mode);
3378           return gen_rtx_REG (mode, V0_REGNUM);
3379         }
3380       else
3381         {
3382           int i;
3383           rtx par;
3384
3385           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3386           for (i = 0; i < count; i++)
3387             {
3388               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3389               rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3390               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3391               XVECEXP (par, 0, i) = tmp;
3392             }
3393           return par;
3394         }
3395     }
3396   else
3397     return gen_rtx_REG (mode, R0_REGNUM);
3398 }
3399
3400 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3401    Return true if REGNO is the number of a hard register in which the values
3402    of called function may come back.  */
3403
3404 static bool
3405 aarch64_function_value_regno_p (const unsigned int regno)
3406 {
3407   /* Maximum of 16 bytes can be returned in the general registers.  Examples
3408      of 16-byte return values are: 128-bit integers and 16-byte small
3409      structures (excluding homogeneous floating-point aggregates).  */
3410   if (regno == R0_REGNUM || regno == R1_REGNUM)
3411     return true;
3412
3413   /* Up to four fp/simd registers can return a function value, e.g. a
3414      homogeneous floating-point aggregate having four members.  */
3415   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3416     return TARGET_FLOAT;
3417
3418   return false;
3419 }
3420
3421 /* Implement TARGET_RETURN_IN_MEMORY.
3422
3423    If the type T of the result of a function is such that
3424      void func (T arg)
3425    would require that arg be passed as a value in a register (or set of
3426    registers) according to the parameter passing rules, then the result
3427    is returned in the same registers as would be used for such an
3428    argument.  */
3429
3430 static bool
3431 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3432 {
3433   HOST_WIDE_INT size;
3434   machine_mode ag_mode;
3435   int count;
3436
3437   if (!AGGREGATE_TYPE_P (type)
3438       && TREE_CODE (type) != COMPLEX_TYPE
3439       && TREE_CODE (type) != VECTOR_TYPE)
3440     /* Simple scalar types always returned in registers.  */
3441     return false;
3442
3443   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3444                                                type,
3445                                                &ag_mode,
3446                                                &count,
3447                                                NULL))
3448     return false;
3449
3450   /* Types larger than 2 registers returned in memory.  */
3451   size = int_size_in_bytes (type);
3452   return (size < 0 || size > 2 * UNITS_PER_WORD);
3453 }
3454
3455 static bool
3456 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3457                                const_tree type, int *nregs)
3458 {
3459   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3460   return aarch64_vfp_is_call_or_return_candidate (mode,
3461                                                   type,
3462                                                   &pcum->aapcs_vfp_rmode,
3463                                                   nregs,
3464                                                   NULL);
3465 }
3466
3467 /* Given MODE and TYPE of a function argument, return the alignment in
3468    bits.  The idea is to suppress any stronger alignment requested by
3469    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3470    This is a helper function for local use only.  */
3471
3472 static unsigned int
3473 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3474 {
3475   if (!type)
3476     return GET_MODE_ALIGNMENT (mode);
3477
3478   if (integer_zerop (TYPE_SIZE (type)))
3479     return 0;
3480
3481   gcc_assert (TYPE_MODE (type) == mode);
3482
3483   if (!AGGREGATE_TYPE_P (type))
3484     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3485
3486   if (TREE_CODE (type) == ARRAY_TYPE)
3487     return TYPE_ALIGN (TREE_TYPE (type));
3488
3489   unsigned int alignment = 0;
3490   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3491     if (TREE_CODE (field) == FIELD_DECL)
3492       alignment = std::max (alignment, DECL_ALIGN (field));
3493
3494   return alignment;
3495 }
3496
3497 /* Layout a function argument according to the AAPCS64 rules.  The rule
3498    numbers refer to the rule numbers in the AAPCS64.  */
3499
3500 static void
3501 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3502                     const_tree type,
3503                     bool named ATTRIBUTE_UNUSED)
3504 {
3505   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3506   int ncrn, nvrn, nregs;
3507   bool allocate_ncrn, allocate_nvrn;
3508   HOST_WIDE_INT size;
3509
3510   /* We need to do this once per argument.  */
3511   if (pcum->aapcs_arg_processed)
3512     return;
3513
3514   pcum->aapcs_arg_processed = true;
3515
3516   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
3517   if (type)
3518     size = int_size_in_bytes (type);
3519   else
3520     /* No frontends can create types with variable-sized modes, so we
3521        shouldn't be asked to pass or return them.  */
3522     size = GET_MODE_SIZE (mode).to_constant ();
3523   size = ROUND_UP (size, UNITS_PER_WORD);
3524
3525   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3526   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3527                                                  mode,
3528                                                  type,
3529                                                  &nregs);
3530
3531   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3532      The following code thus handles passing by SIMD/FP registers first.  */
3533
3534   nvrn = pcum->aapcs_nvrn;
3535
3536   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3537      and homogenous short-vector aggregates (HVA).  */
3538   if (allocate_nvrn)
3539     {
3540       if (!TARGET_FLOAT)
3541         aarch64_err_no_fpadvsimd (mode);
3542
3543       if (nvrn + nregs <= NUM_FP_ARG_REGS)
3544         {
3545           pcum->aapcs_nextnvrn = nvrn + nregs;
3546           if (!aarch64_composite_type_p (type, mode))
3547             {
3548               gcc_assert (nregs == 1);
3549               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3550             }
3551           else
3552             {
3553               rtx par;
3554               int i;
3555               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3556               for (i = 0; i < nregs; i++)
3557                 {
3558                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3559                                          V0_REGNUM + nvrn + i);
3560                   rtx offset = gen_int_mode
3561                     (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3562                   tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3563                   XVECEXP (par, 0, i) = tmp;
3564                 }
3565               pcum->aapcs_reg = par;
3566             }
3567           return;
3568         }
3569       else
3570         {
3571           /* C.3 NSRN is set to 8.  */
3572           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3573           goto on_stack;
3574         }
3575     }
3576
3577   ncrn = pcum->aapcs_ncrn;
3578   nregs = size / UNITS_PER_WORD;
3579
3580   /* C6 - C9.  though the sign and zero extension semantics are
3581      handled elsewhere.  This is the case where the argument fits
3582      entirely general registers.  */
3583   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3584     {
3585
3586       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3587
3588       /* C.8 if the argument has an alignment of 16 then the NGRN is
3589          rounded up to the next even number.  */
3590       if (nregs == 2
3591           && ncrn % 2
3592           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3593              comparison is there because for > 16 * BITS_PER_UNIT
3594              alignment nregs should be > 2 and therefore it should be
3595              passed by reference rather than value.  */
3596           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3597         {
3598           ++ncrn;
3599           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3600         }
3601
3602       /* NREGS can be 0 when e.g. an empty structure is to be passed.
3603          A reg is still generated for it, but the caller should be smart
3604          enough not to use it.  */
3605       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3606         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3607       else
3608         {
3609           rtx par;
3610           int i;
3611
3612           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3613           for (i = 0; i < nregs; i++)
3614             {
3615               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3616               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3617                                        GEN_INT (i * UNITS_PER_WORD));
3618               XVECEXP (par, 0, i) = tmp;
3619             }
3620           pcum->aapcs_reg = par;
3621         }
3622
3623       pcum->aapcs_nextncrn = ncrn + nregs;
3624       return;
3625     }
3626
3627   /* C.11  */
3628   pcum->aapcs_nextncrn = NUM_ARG_REGS;
3629
3630   /* The argument is passed on stack; record the needed number of words for
3631      this argument and align the total size if necessary.  */
3632 on_stack:
3633   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3634
3635   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3636     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3637                                        16 / UNITS_PER_WORD);
3638   return;
3639 }
3640
3641 /* Implement TARGET_FUNCTION_ARG.  */
3642
3643 static rtx
3644 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3645                       const_tree type, bool named)
3646 {
3647   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3648   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3649
3650   if (mode == VOIDmode)
3651     return NULL_RTX;
3652
3653   aarch64_layout_arg (pcum_v, mode, type, named);
3654   return pcum->aapcs_reg;
3655 }
3656
3657 void
3658 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3659                            const_tree fntype ATTRIBUTE_UNUSED,
3660                            rtx libname ATTRIBUTE_UNUSED,
3661                            const_tree fndecl ATTRIBUTE_UNUSED,
3662                            unsigned n_named ATTRIBUTE_UNUSED)
3663 {
3664   pcum->aapcs_ncrn = 0;
3665   pcum->aapcs_nvrn = 0;
3666   pcum->aapcs_nextncrn = 0;
3667   pcum->aapcs_nextnvrn = 0;
3668   pcum->pcs_variant = ARM_PCS_AAPCS64;
3669   pcum->aapcs_reg = NULL_RTX;
3670   pcum->aapcs_arg_processed = false;
3671   pcum->aapcs_stack_words = 0;
3672   pcum->aapcs_stack_size = 0;
3673
3674   if (!TARGET_FLOAT
3675       && fndecl && TREE_PUBLIC (fndecl)
3676       && fntype && fntype != error_mark_node)
3677     {
3678       const_tree type = TREE_TYPE (fntype);
3679       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
3680       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
3681       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3682                                                    &mode, &nregs, NULL))
3683         aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3684     }
3685   return;
3686 }
3687
3688 static void
3689 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3690                               machine_mode mode,
3691                               const_tree type,
3692                               bool named)
3693 {
3694   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3695   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3696     {
3697       aarch64_layout_arg (pcum_v, mode, type, named);
3698       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3699                   != (pcum->aapcs_stack_words != 0));
3700       pcum->aapcs_arg_processed = false;
3701       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3702       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3703       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3704       pcum->aapcs_stack_words = 0;
3705       pcum->aapcs_reg = NULL_RTX;
3706     }
3707 }
3708
3709 bool
3710 aarch64_function_arg_regno_p (unsigned regno)
3711 {
3712   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3713           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3714 }
3715
3716 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
3717    PARM_BOUNDARY bits of alignment, but will be given anything up
3718    to STACK_BOUNDARY bits if the type requires it.  This makes sure
3719    that both before and after the layout of each argument, the Next
3720    Stacked Argument Address (NSAA) will have a minimum alignment of
3721    8 bytes.  */
3722
3723 static unsigned int
3724 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3725 {
3726   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3727   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3728 }
3729
3730 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
3731
3732 static fixed_size_mode
3733 aarch64_get_reg_raw_mode (int regno)
3734 {
3735   if (TARGET_SVE && FP_REGNUM_P (regno))
3736     /* Don't use the SVE part of the register for __builtin_apply and
3737        __builtin_return.  The SVE registers aren't used by the normal PCS,
3738        so using them there would be a waste of time.  The PCS extensions
3739        for SVE types are fundamentally incompatible with the
3740        __builtin_return/__builtin_apply interface.  */
3741     return as_a <fixed_size_mode> (V16QImode);
3742   return default_get_reg_raw_mode (regno);
3743 }
3744
3745 /* Implement TARGET_FUNCTION_ARG_PADDING.
3746
3747    Small aggregate types are placed in the lowest memory address.
3748
3749    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
3750
3751 static pad_direction
3752 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3753 {
3754   /* On little-endian targets, the least significant byte of every stack
3755      argument is passed at the lowest byte address of the stack slot.  */
3756   if (!BYTES_BIG_ENDIAN)
3757     return PAD_UPWARD;
3758
3759   /* Otherwise, integral, floating-point and pointer types are padded downward:
3760      the least significant byte of a stack argument is passed at the highest
3761      byte address of the stack slot.  */
3762   if (type
3763       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3764          || POINTER_TYPE_P (type))
3765       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3766     return PAD_DOWNWARD;
3767
3768   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
3769   return PAD_UPWARD;
3770 }
3771
3772 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3773
3774    It specifies padding for the last (may also be the only)
3775    element of a block move between registers and memory.  If
3776    assuming the block is in the memory, padding upward means that
3777    the last element is padded after its highest significant byte,
3778    while in downward padding, the last element is padded at the
3779    its least significant byte side.
3780
3781    Small aggregates and small complex types are always padded
3782    upwards.
3783
3784    We don't need to worry about homogeneous floating-point or
3785    short-vector aggregates; their move is not affected by the
3786    padding direction determined here.  Regardless of endianness,
3787    each element of such an aggregate is put in the least
3788    significant bits of a fp/simd register.
3789
3790    Return !BYTES_BIG_ENDIAN if the least significant byte of the
3791    register has useful data, and return the opposite if the most
3792    significant byte does.  */
3793
3794 bool
3795 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3796                      bool first ATTRIBUTE_UNUSED)
3797 {
3798
3799   /* Small composite types are always padded upward.  */
3800   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3801     {
3802       HOST_WIDE_INT size;
3803       if (type)
3804         size = int_size_in_bytes (type);
3805       else
3806         /* No frontends can create types with variable-sized modes, so we
3807            shouldn't be asked to pass or return them.  */
3808         size = GET_MODE_SIZE (mode).to_constant ();
3809       if (size < 2 * UNITS_PER_WORD)
3810         return true;
3811     }
3812
3813   /* Otherwise, use the default padding.  */
3814   return !BYTES_BIG_ENDIAN;
3815 }
3816
3817 static scalar_int_mode
3818 aarch64_libgcc_cmp_return_mode (void)
3819 {
3820   return SImode;
3821 }
3822
3823 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3824
3825 /* We use the 12-bit shifted immediate arithmetic instructions so values
3826    must be multiple of (1 << 12), i.e. 4096.  */
3827 #define ARITH_FACTOR 4096
3828
3829 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3830 #error Cannot use simple address calculation for stack probing
3831 #endif
3832
3833 /* The pair of scratch registers used for stack probing.  */
3834 #define PROBE_STACK_FIRST_REG  9
3835 #define PROBE_STACK_SECOND_REG 10
3836
3837 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3838    inclusive.  These are offsets from the current stack pointer.  */
3839
3840 static void
3841 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3842 {
3843   HOST_WIDE_INT size;
3844   if (!poly_size.is_constant (&size))
3845     {
3846       sorry ("stack probes for SVE frames");
3847       return;
3848     }
3849
3850   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3851
3852   /* See the same assertion on PROBE_INTERVAL above.  */
3853   gcc_assert ((first % ARITH_FACTOR) == 0);
3854
3855   /* See if we have a constant small number of probes to generate.  If so,
3856      that's the easy case.  */
3857   if (size <= PROBE_INTERVAL)
3858     {
3859       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3860
3861       emit_set_insn (reg1,
3862                      plus_constant (Pmode,
3863                                     stack_pointer_rtx, -(first + base)));
3864       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3865     }
3866
3867   /* The run-time loop is made up of 8 insns in the generic case while the
3868      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
3869   else if (size <= 4 * PROBE_INTERVAL)
3870     {
3871       HOST_WIDE_INT i, rem;
3872
3873       emit_set_insn (reg1,
3874                      plus_constant (Pmode,
3875                                     stack_pointer_rtx,
3876                                     -(first + PROBE_INTERVAL)));
3877       emit_stack_probe (reg1);
3878
3879       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3880          it exceeds SIZE.  If only two probes are needed, this will not
3881          generate any code.  Then probe at FIRST + SIZE.  */
3882       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3883         {
3884           emit_set_insn (reg1,
3885                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3886           emit_stack_probe (reg1);
3887         }
3888
3889       rem = size - (i - PROBE_INTERVAL);
3890       if (rem > 256)
3891         {
3892           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3893
3894           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3895           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3896         }
3897       else
3898         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3899     }
3900
3901   /* Otherwise, do the same as above, but in a loop.  Note that we must be
3902      extra careful with variables wrapping around because we might be at
3903      the very top (or the very bottom) of the address space and we have
3904      to be able to handle this case properly; in particular, we use an
3905      equality test for the loop condition.  */
3906   else
3907     {
3908       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3909
3910       /* Step 1: round SIZE to the previous multiple of the interval.  */
3911
3912       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3913
3914
3915       /* Step 2: compute initial and final value of the loop counter.  */
3916
3917       /* TEST_ADDR = SP + FIRST.  */
3918       emit_set_insn (reg1,
3919                      plus_constant (Pmode, stack_pointer_rtx, -first));
3920
3921       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
3922       HOST_WIDE_INT adjustment = - (first + rounded_size);
3923       if (! aarch64_uimm12_shift (adjustment))
3924         {
3925           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3926                                           true, Pmode);
3927           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3928         }
3929       else
3930         emit_set_insn (reg2,
3931                        plus_constant (Pmode, stack_pointer_rtx, adjustment));
3932
3933       /* Step 3: the loop
3934
3935          do
3936            {
3937              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3938              probe at TEST_ADDR
3939            }
3940          while (TEST_ADDR != LAST_ADDR)
3941
3942          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3943          until it is equal to ROUNDED_SIZE.  */
3944
3945       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3946
3947
3948       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3949          that SIZE is equal to ROUNDED_SIZE.  */
3950
3951       if (size != rounded_size)
3952         {
3953           HOST_WIDE_INT rem = size - rounded_size;
3954
3955           if (rem > 256)
3956             {
3957               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3958
3959               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3960               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3961             }
3962           else
3963             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3964         }
3965     }
3966
3967   /* Make sure nothing is scheduled before we are done.  */
3968   emit_insn (gen_blockage ());
3969 }
3970
3971 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
3972    absolute addresses.  */
3973
3974 const char *
3975 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3976 {
3977   static int labelno = 0;
3978   char loop_lab[32];
3979   rtx xops[2];
3980
3981   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3982
3983   /* Loop.  */
3984   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3985
3986   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
3987   xops[0] = reg1;
3988   xops[1] = GEN_INT (PROBE_INTERVAL);
3989   output_asm_insn ("sub\t%0, %0, %1", xops);
3990
3991   /* Probe at TEST_ADDR.  */
3992   output_asm_insn ("str\txzr, [%0]", xops);
3993
3994   /* Test if TEST_ADDR == LAST_ADDR.  */
3995   xops[1] = reg2;
3996   output_asm_insn ("cmp\t%0, %1", xops);
3997
3998   /* Branch.  */
3999   fputs ("\tb.ne\t", asm_out_file);
4000   assemble_name_raw (asm_out_file, loop_lab);
4001   fputc ('\n', asm_out_file);
4002
4003   return "";
4004 }
4005
4006 /* Determine whether a frame chain needs to be generated.  */
4007 static bool
4008 aarch64_needs_frame_chain (void)
4009 {
4010   /* Force a frame chain for EH returns so the return address is at FP+8.  */
4011   if (frame_pointer_needed || crtl->calls_eh_return)
4012     return true;
4013
4014   /* A leaf function cannot have calls or write LR.  */
4015   bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4016
4017   /* Don't use a frame chain in leaf functions if leaf frame pointers
4018      are disabled.  */
4019   if (flag_omit_leaf_frame_pointer && is_leaf)
4020     return false;
4021
4022   return aarch64_use_frame_pointer;
4023 }
4024
4025 /* Mark the registers that need to be saved by the callee and calculate
4026    the size of the callee-saved registers area and frame record (both FP
4027    and LR may be omitted).  */
4028 static void
4029 aarch64_layout_frame (void)
4030 {
4031   HOST_WIDE_INT offset = 0;
4032   int regno, last_fp_reg = INVALID_REGNUM;
4033
4034   if (reload_completed && cfun->machine->frame.laid_out)
4035     return;
4036
4037   cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4038
4039 #define SLOT_NOT_REQUIRED (-2)
4040 #define SLOT_REQUIRED     (-1)
4041
4042   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4043   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4044
4045   /* First mark all the registers that really need to be saved...  */
4046   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4047     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4048
4049   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4050     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4051
4052   /* ... that includes the eh data registers (if needed)...  */
4053   if (crtl->calls_eh_return)
4054     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4055       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4056         = SLOT_REQUIRED;
4057
4058   /* ... and any callee saved register that dataflow says is live.  */
4059   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4060     if (df_regs_ever_live_p (regno)
4061         && (regno == R30_REGNUM
4062             || !call_used_regs[regno]))
4063       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4064
4065   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4066     if (df_regs_ever_live_p (regno)
4067         && !call_used_regs[regno])
4068       {
4069         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4070         last_fp_reg = regno;
4071       }
4072
4073   if (cfun->machine->frame.emit_frame_chain)
4074     {
4075       /* FP and LR are placed in the linkage record.  */
4076       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4077       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4078       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4079       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4080       offset = 2 * UNITS_PER_WORD;
4081     }
4082
4083   /* Now assign stack slots for them.  */
4084   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4085     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4086       {
4087         cfun->machine->frame.reg_offset[regno] = offset;
4088         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4089           cfun->machine->frame.wb_candidate1 = regno;
4090         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4091           cfun->machine->frame.wb_candidate2 = regno;
4092         offset += UNITS_PER_WORD;
4093       }
4094
4095   HOST_WIDE_INT max_int_offset = offset;
4096   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4097   bool has_align_gap = offset != max_int_offset;
4098
4099   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4100     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4101       {
4102         /* If there is an alignment gap between integer and fp callee-saves,
4103            allocate the last fp register to it if possible.  */
4104         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4105           {
4106             cfun->machine->frame.reg_offset[regno] = max_int_offset;
4107             break;
4108           }
4109
4110         cfun->machine->frame.reg_offset[regno] = offset;
4111         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4112           cfun->machine->frame.wb_candidate1 = regno;
4113         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4114                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4115           cfun->machine->frame.wb_candidate2 = regno;
4116         offset += UNITS_PER_WORD;
4117       }
4118
4119   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4120
4121   cfun->machine->frame.saved_regs_size = offset;
4122
4123   HOST_WIDE_INT varargs_and_saved_regs_size
4124     = offset + cfun->machine->frame.saved_varargs_size;
4125
4126   cfun->machine->frame.hard_fp_offset
4127     = aligned_upper_bound (varargs_and_saved_regs_size
4128                            + get_frame_size (),
4129                            STACK_BOUNDARY / BITS_PER_UNIT);
4130
4131   /* Both these values are already aligned.  */
4132   gcc_assert (multiple_p (crtl->outgoing_args_size,
4133                           STACK_BOUNDARY / BITS_PER_UNIT));
4134   cfun->machine->frame.frame_size
4135     = (cfun->machine->frame.hard_fp_offset
4136        + crtl->outgoing_args_size);
4137
4138   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4139
4140   cfun->machine->frame.initial_adjust = 0;
4141   cfun->machine->frame.final_adjust = 0;
4142   cfun->machine->frame.callee_adjust = 0;
4143   cfun->machine->frame.callee_offset = 0;
4144
4145   HOST_WIDE_INT max_push_offset = 0;
4146   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4147     max_push_offset = 512;
4148   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4149     max_push_offset = 256;
4150
4151   HOST_WIDE_INT const_size, const_fp_offset;
4152   if (cfun->machine->frame.frame_size.is_constant (&const_size)
4153       && const_size < max_push_offset
4154       && known_eq (crtl->outgoing_args_size, 0))
4155     {
4156       /* Simple, small frame with no outgoing arguments:
4157          stp reg1, reg2, [sp, -frame_size]!
4158          stp reg3, reg4, [sp, 16]  */
4159       cfun->machine->frame.callee_adjust = const_size;
4160     }
4161   else if (known_lt (crtl->outgoing_args_size
4162                      + cfun->machine->frame.saved_regs_size, 512)
4163            && !(cfun->calls_alloca
4164                 && known_lt (cfun->machine->frame.hard_fp_offset,
4165                              max_push_offset)))
4166     {
4167       /* Frame with small outgoing arguments:
4168          sub sp, sp, frame_size
4169          stp reg1, reg2, [sp, outgoing_args_size]
4170          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
4171       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4172       cfun->machine->frame.callee_offset
4173         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4174     }
4175   else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4176            && const_fp_offset < max_push_offset)
4177     {
4178       /* Frame with large outgoing arguments but a small local area:
4179          stp reg1, reg2, [sp, -hard_fp_offset]!
4180          stp reg3, reg4, [sp, 16]
4181          sub sp, sp, outgoing_args_size  */
4182       cfun->machine->frame.callee_adjust = const_fp_offset;
4183       cfun->machine->frame.final_adjust
4184         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4185     }
4186   else
4187     {
4188       /* Frame with large local area and outgoing arguments using frame pointer:
4189          sub sp, sp, hard_fp_offset
4190          stp x29, x30, [sp, 0]
4191          add x29, sp, 0
4192          stp reg3, reg4, [sp, 16]
4193          sub sp, sp, outgoing_args_size  */
4194       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4195       cfun->machine->frame.final_adjust
4196         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4197     }
4198
4199   cfun->machine->frame.laid_out = true;
4200 }
4201
4202 /* Return true if the register REGNO is saved on entry to
4203    the current function.  */
4204
4205 static bool
4206 aarch64_register_saved_on_entry (int regno)
4207 {
4208   return cfun->machine->frame.reg_offset[regno] >= 0;
4209 }
4210
4211 /* Return the next register up from REGNO up to LIMIT for the callee
4212    to save.  */
4213
4214 static unsigned
4215 aarch64_next_callee_save (unsigned regno, unsigned limit)
4216 {
4217   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4218     regno ++;
4219   return regno;
4220 }
4221
4222 /* Push the register number REGNO of mode MODE to the stack with write-back
4223    adjusting the stack by ADJUSTMENT.  */
4224
4225 static void
4226 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4227                            HOST_WIDE_INT adjustment)
4228  {
4229   rtx base_rtx = stack_pointer_rtx;
4230   rtx insn, reg, mem;
4231
4232   reg = gen_rtx_REG (mode, regno);
4233   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4234                             plus_constant (Pmode, base_rtx, -adjustment));
4235   mem = gen_frame_mem (mode, mem);
4236
4237   insn = emit_move_insn (mem, reg);
4238   RTX_FRAME_RELATED_P (insn) = 1;
4239 }
4240
4241 /* Generate and return an instruction to store the pair of registers
4242    REG and REG2 of mode MODE to location BASE with write-back adjusting
4243    the stack location BASE by ADJUSTMENT.  */
4244
4245 static rtx
4246 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4247                           HOST_WIDE_INT adjustment)
4248 {
4249   switch (mode)
4250     {
4251     case E_DImode:
4252       return gen_storewb_pairdi_di (base, base, reg, reg2,
4253                                     GEN_INT (-adjustment),
4254                                     GEN_INT (UNITS_PER_WORD - adjustment));
4255     case E_DFmode:
4256       return gen_storewb_pairdf_di (base, base, reg, reg2,
4257                                     GEN_INT (-adjustment),
4258                                     GEN_INT (UNITS_PER_WORD - adjustment));
4259     default:
4260       gcc_unreachable ();
4261     }
4262 }
4263
4264 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4265    stack pointer by ADJUSTMENT.  */
4266
4267 static void
4268 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4269 {
4270   rtx_insn *insn;
4271   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4272
4273   if (regno2 == INVALID_REGNUM)
4274     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4275
4276   rtx reg1 = gen_rtx_REG (mode, regno1);
4277   rtx reg2 = gen_rtx_REG (mode, regno2);
4278
4279   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4280                                               reg2, adjustment));
4281   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4282   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4283   RTX_FRAME_RELATED_P (insn) = 1;
4284 }
4285
4286 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4287    adjusting it by ADJUSTMENT afterwards.  */
4288
4289 static rtx
4290 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4291                          HOST_WIDE_INT adjustment)
4292 {
4293   switch (mode)
4294     {
4295     case E_DImode:
4296       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4297                                    GEN_INT (UNITS_PER_WORD));
4298     case E_DFmode:
4299       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4300                                    GEN_INT (UNITS_PER_WORD));
4301     default:
4302       gcc_unreachable ();
4303     }
4304 }
4305
4306 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4307    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4308    into CFI_OPS.  */
4309
4310 static void
4311 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4312                   rtx *cfi_ops)
4313 {
4314   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4315   rtx reg1 = gen_rtx_REG (mode, regno1);
4316
4317   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4318
4319   if (regno2 == INVALID_REGNUM)
4320     {
4321       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4322       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4323       emit_move_insn (reg1, gen_frame_mem (mode, mem));
4324     }
4325   else
4326     {
4327       rtx reg2 = gen_rtx_REG (mode, regno2);
4328       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4329       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4330                                           reg2, adjustment));
4331     }
4332 }
4333
4334 /* Generate and return a store pair instruction of mode MODE to store
4335    register REG1 to MEM1 and register REG2 to MEM2.  */
4336
4337 static rtx
4338 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4339                         rtx reg2)
4340 {
4341   switch (mode)
4342     {
4343     case E_DImode:
4344       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4345
4346     case E_DFmode:
4347       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4348
4349     default:
4350       gcc_unreachable ();
4351     }
4352 }
4353
4354 /* Generate and regurn a load pair isntruction of mode MODE to load register
4355    REG1 from MEM1 and register REG2 from MEM2.  */
4356
4357 static rtx
4358 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4359                        rtx mem2)
4360 {
4361   switch (mode)
4362     {
4363     case E_DImode:
4364       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4365
4366     case E_DFmode:
4367       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4368
4369     default:
4370       gcc_unreachable ();
4371     }
4372 }
4373
4374 /* Return TRUE if return address signing should be enabled for the current
4375    function, otherwise return FALSE.  */
4376
4377 bool
4378 aarch64_return_address_signing_enabled (void)
4379 {
4380   /* This function should only be called after frame laid out.   */
4381   gcc_assert (cfun->machine->frame.laid_out);
4382
4383   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4384      if it's LR is pushed onto stack.  */
4385   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4386           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4387               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4388 }
4389
4390 /* Emit code to save the callee-saved registers from register number START
4391    to LIMIT to the stack at the location starting at offset START_OFFSET,
4392    skipping any write-back candidates if SKIP_WB is true.  */
4393
4394 static void
4395 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4396                            unsigned start, unsigned limit, bool skip_wb)
4397 {
4398   rtx_insn *insn;
4399   unsigned regno;
4400   unsigned regno2;
4401
4402   for (regno = aarch64_next_callee_save (start, limit);
4403        regno <= limit;
4404        regno = aarch64_next_callee_save (regno + 1, limit))
4405     {
4406       rtx reg, mem;
4407       poly_int64 offset;
4408
4409       if (skip_wb
4410           && (regno == cfun->machine->frame.wb_candidate1
4411               || regno == cfun->machine->frame.wb_candidate2))
4412         continue;
4413
4414       if (cfun->machine->reg_is_wrapped_separately[regno])
4415        continue;
4416
4417       reg = gen_rtx_REG (mode, regno);
4418       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4419       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4420                                                 offset));
4421
4422       regno2 = aarch64_next_callee_save (regno + 1, limit);
4423
4424       if (regno2 <= limit
4425           && !cfun->machine->reg_is_wrapped_separately[regno2]
4426           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4427               == cfun->machine->frame.reg_offset[regno2]))
4428
4429         {
4430           rtx reg2 = gen_rtx_REG (mode, regno2);
4431           rtx mem2;
4432
4433           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4434           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4435                                                      offset));
4436           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4437                                                     reg2));
4438
4439           /* The first part of a frame-related parallel insn is
4440              always assumed to be relevant to the frame
4441              calculations; subsequent parts, are only
4442              frame-related if explicitly marked.  */
4443           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4444           regno = regno2;
4445         }
4446       else
4447         insn = emit_move_insn (mem, reg);
4448
4449       RTX_FRAME_RELATED_P (insn) = 1;
4450     }
4451 }
4452
4453 /* Emit code to restore the callee registers of mode MODE from register
4454    number START up to and including LIMIT.  Restore from the stack offset
4455    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4456    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
4457
4458 static void
4459 aarch64_restore_callee_saves (machine_mode mode,
4460                               poly_int64 start_offset, unsigned start,
4461                               unsigned limit, bool skip_wb, rtx *cfi_ops)
4462 {
4463   rtx base_rtx = stack_pointer_rtx;
4464   unsigned regno;
4465   unsigned regno2;
4466   poly_int64 offset;
4467
4468   for (regno = aarch64_next_callee_save (start, limit);
4469        regno <= limit;
4470        regno = aarch64_next_callee_save (regno + 1, limit))
4471     {
4472       if (cfun->machine->reg_is_wrapped_separately[regno])
4473        continue;
4474
4475       rtx reg, mem;
4476
4477       if (skip_wb
4478           && (regno == cfun->machine->frame.wb_candidate1
4479               || regno == cfun->machine->frame.wb_candidate2))
4480         continue;
4481
4482       reg = gen_rtx_REG (mode, regno);
4483       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4484       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4485
4486       regno2 = aarch64_next_callee_save (regno + 1, limit);
4487
4488       if (regno2 <= limit
4489           && !cfun->machine->reg_is_wrapped_separately[regno2]
4490           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4491               == cfun->machine->frame.reg_offset[regno2]))
4492         {
4493           rtx reg2 = gen_rtx_REG (mode, regno2);
4494           rtx mem2;
4495
4496           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4497           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4498           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4499
4500           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4501           regno = regno2;
4502         }
4503       else
4504         emit_move_insn (reg, mem);
4505       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4506     }
4507 }
4508
4509 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4510    of MODE.  */
4511
4512 static inline bool
4513 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4514 {
4515   HOST_WIDE_INT multiple;
4516   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4517           && IN_RANGE (multiple, -8, 7));
4518 }
4519
4520 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
4521    of MODE.  */
4522
4523 static inline bool
4524 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4525 {
4526   HOST_WIDE_INT multiple;
4527   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4528           && IN_RANGE (multiple, 0, 63));
4529 }
4530
4531 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4532    of MODE.  */
4533
4534 bool
4535 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4536 {
4537   HOST_WIDE_INT multiple;
4538   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4539           && IN_RANGE (multiple, -64, 63));
4540 }
4541
4542 /* Return true if OFFSET is a signed 9-bit value.  */
4543
4544 static inline bool
4545 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4546                                poly_int64 offset)
4547 {
4548   HOST_WIDE_INT const_offset;
4549   return (offset.is_constant (&const_offset)
4550           && IN_RANGE (const_offset, -256, 255));
4551 }
4552
4553 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4554    of MODE.  */
4555
4556 static inline bool
4557 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4558 {
4559   HOST_WIDE_INT multiple;
4560   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4561           && IN_RANGE (multiple, -256, 255));
4562 }
4563
4564 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4565    of MODE.  */
4566
4567 static inline bool
4568 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4569 {
4570   HOST_WIDE_INT multiple;
4571   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4572           && IN_RANGE (multiple, 0, 4095));
4573 }
4574
4575 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
4576
4577 static sbitmap
4578 aarch64_get_separate_components (void)
4579 {
4580   aarch64_layout_frame ();
4581
4582   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4583   bitmap_clear (components);
4584
4585   /* The registers we need saved to the frame.  */
4586   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4587     if (aarch64_register_saved_on_entry (regno))
4588       {
4589         poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4590         if (!frame_pointer_needed)
4591           offset += cfun->machine->frame.frame_size
4592                     - cfun->machine->frame.hard_fp_offset;
4593         /* Check that we can access the stack slot of the register with one
4594            direct load with no adjustments needed.  */
4595         if (offset_12bit_unsigned_scaled_p (DImode, offset))
4596           bitmap_set_bit (components, regno);
4597       }
4598
4599   /* Don't mess with the hard frame pointer.  */
4600   if (frame_pointer_needed)
4601     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4602
4603   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4604   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4605   /* If aarch64_layout_frame has chosen registers to store/restore with
4606      writeback don't interfere with them to avoid having to output explicit
4607      stack adjustment instructions.  */
4608   if (reg2 != INVALID_REGNUM)
4609     bitmap_clear_bit (components, reg2);
4610   if (reg1 != INVALID_REGNUM)
4611     bitmap_clear_bit (components, reg1);
4612
4613   bitmap_clear_bit (components, LR_REGNUM);
4614   bitmap_clear_bit (components, SP_REGNUM);
4615
4616   return components;
4617 }
4618
4619 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
4620
4621 static sbitmap
4622 aarch64_components_for_bb (basic_block bb)
4623 {
4624   bitmap in = DF_LIVE_IN (bb);
4625   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4626   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4627
4628   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4629   bitmap_clear (components);
4630
4631   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
4632   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4633     if ((!call_used_regs[regno])
4634        && (bitmap_bit_p (in, regno)
4635            || bitmap_bit_p (gen, regno)
4636            || bitmap_bit_p (kill, regno)))
4637       {
4638         unsigned regno2, offset, offset2;
4639         bitmap_set_bit (components, regno);
4640
4641         /* If there is a callee-save at an adjacent offset, add it too
4642            to increase the use of LDP/STP.  */
4643         offset = cfun->machine->frame.reg_offset[regno];
4644         regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4645
4646         if (regno2 <= LAST_SAVED_REGNUM)
4647           {
4648             offset2 = cfun->machine->frame.reg_offset[regno2];
4649             if ((offset & ~8) == (offset2 & ~8))
4650               bitmap_set_bit (components, regno2);
4651           }
4652       }
4653
4654   return components;
4655 }
4656
4657 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4658    Nothing to do for aarch64.  */
4659
4660 static void
4661 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4662 {
4663 }
4664
4665 /* Return the next set bit in BMP from START onwards.  Return the total number
4666    of bits in BMP if no set bit is found at or after START.  */
4667
4668 static unsigned int
4669 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4670 {
4671   unsigned int nbits = SBITMAP_SIZE (bmp);
4672   if (start == nbits)
4673     return start;
4674
4675   gcc_assert (start < nbits);
4676   for (unsigned int i = start; i < nbits; i++)
4677     if (bitmap_bit_p (bmp, i))
4678       return i;
4679
4680   return nbits;
4681 }
4682
4683 /* Do the work for aarch64_emit_prologue_components and
4684    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
4685    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4686    for these components or the epilogue sequence.  That is, it determines
4687    whether we should emit stores or loads and what kind of CFA notes to attach
4688    to the insns.  Otherwise the logic for the two sequences is very
4689    similar.  */
4690
4691 static void
4692 aarch64_process_components (sbitmap components, bool prologue_p)
4693 {
4694   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4695                              ? HARD_FRAME_POINTER_REGNUM
4696                              : STACK_POINTER_REGNUM);
4697
4698   unsigned last_regno = SBITMAP_SIZE (components);
4699   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4700   rtx_insn *insn = NULL;
4701
4702   while (regno != last_regno)
4703     {
4704       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4705          so DFmode for the vector registers is enough.  */
4706       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4707       rtx reg = gen_rtx_REG (mode, regno);
4708       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4709       if (!frame_pointer_needed)
4710         offset += cfun->machine->frame.frame_size
4711                   - cfun->machine->frame.hard_fp_offset;
4712       rtx addr = plus_constant (Pmode, ptr_reg, offset);
4713       rtx mem = gen_frame_mem (mode, addr);
4714
4715       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4716       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4717       /* No more registers to handle after REGNO.
4718          Emit a single save/restore and exit.  */
4719       if (regno2 == last_regno)
4720         {
4721           insn = emit_insn (set);
4722           RTX_FRAME_RELATED_P (insn) = 1;
4723           if (prologue_p)
4724             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4725           else
4726             add_reg_note (insn, REG_CFA_RESTORE, reg);
4727           break;
4728         }
4729
4730       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4731       /* The next register is not of the same class or its offset is not
4732          mergeable with the current one into a pair.  */
4733       if (!satisfies_constraint_Ump (mem)
4734           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4735           || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4736                        GET_MODE_SIZE (mode)))
4737         {
4738           insn = emit_insn (set);
4739           RTX_FRAME_RELATED_P (insn) = 1;
4740           if (prologue_p)
4741             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4742           else
4743             add_reg_note (insn, REG_CFA_RESTORE, reg);
4744
4745           regno = regno2;
4746           continue;
4747         }
4748
4749       /* REGNO2 can be saved/restored in a pair with REGNO.  */
4750       rtx reg2 = gen_rtx_REG (mode, regno2);
4751       if (!frame_pointer_needed)
4752         offset2 += cfun->machine->frame.frame_size
4753                   - cfun->machine->frame.hard_fp_offset;
4754       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4755       rtx mem2 = gen_frame_mem (mode, addr2);
4756       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4757                              : gen_rtx_SET (reg2, mem2);
4758
4759       if (prologue_p)
4760         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4761       else
4762         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4763
4764       RTX_FRAME_RELATED_P (insn) = 1;
4765       if (prologue_p)
4766         {
4767           add_reg_note (insn, REG_CFA_OFFSET, set);
4768           add_reg_note (insn, REG_CFA_OFFSET, set2);
4769         }
4770       else
4771         {
4772           add_reg_note (insn, REG_CFA_RESTORE, reg);
4773           add_reg_note (insn, REG_CFA_RESTORE, reg2);
4774         }
4775
4776       regno = aarch64_get_next_set_bit (components, regno2 + 1);
4777     }
4778 }
4779
4780 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
4781
4782 static void
4783 aarch64_emit_prologue_components (sbitmap components)
4784 {
4785   aarch64_process_components (components, true);
4786 }
4787
4788 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
4789
4790 static void
4791 aarch64_emit_epilogue_components (sbitmap components)
4792 {
4793   aarch64_process_components (components, false);
4794 }
4795
4796 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
4797
4798 static void
4799 aarch64_set_handled_components (sbitmap components)
4800 {
4801   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4802     if (bitmap_bit_p (components, regno))
4803       cfun->machine->reg_is_wrapped_separately[regno] = true;
4804 }
4805
4806 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4807    is saved at BASE + OFFSET.  */
4808
4809 static void
4810 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4811                             rtx base, poly_int64 offset)
4812 {
4813   rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4814   add_reg_note (insn, REG_CFA_EXPRESSION,
4815                 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4816 }
4817
4818 /* AArch64 stack frames generated by this compiler look like:
4819
4820         +-------------------------------+
4821         |                               |
4822         |  incoming stack arguments     |
4823         |                               |
4824         +-------------------------------+
4825         |                               | <-- incoming stack pointer (aligned)
4826         |  callee-allocated save area   |
4827         |  for register varargs         |
4828         |                               |
4829         +-------------------------------+
4830         |  local variables              | <-- frame_pointer_rtx
4831         |                               |
4832         +-------------------------------+
4833         |  padding0                     | \
4834         +-------------------------------+  |
4835         |  callee-saved registers       |  | frame.saved_regs_size
4836         +-------------------------------+  |
4837         |  LR'                          |  |
4838         +-------------------------------+  |
4839         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
4840         +-------------------------------+
4841         |  dynamic allocation           |
4842         +-------------------------------+
4843         |  padding                      |
4844         +-------------------------------+
4845         |  outgoing stack arguments     | <-- arg_pointer
4846         |                               |
4847         +-------------------------------+
4848         |                               | <-- stack_pointer_rtx (aligned)
4849
4850    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4851    but leave frame_pointer_rtx and hard_frame_pointer_rtx
4852    unchanged.  */
4853
4854 /* Generate the prologue instructions for entry into a function.
4855    Establish the stack frame by decreasing the stack pointer with a
4856    properly calculated size and, if necessary, create a frame record
4857    filled with the values of LR and previous frame pointer.  The
4858    current FP is also set up if it is in use.  */
4859
4860 void
4861 aarch64_expand_prologue (void)
4862 {
4863   aarch64_layout_frame ();
4864
4865   poly_int64 frame_size = cfun->machine->frame.frame_size;
4866   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4867   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4868   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4869   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4870   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4871   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4872   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4873   rtx_insn *insn;
4874
4875   /* Sign return address for functions.  */
4876   if (aarch64_return_address_signing_enabled ())
4877     {
4878       insn = emit_insn (gen_pacisp ());
4879       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4880       RTX_FRAME_RELATED_P (insn) = 1;
4881     }
4882
4883   if (flag_stack_usage_info)
4884     current_function_static_stack_size = constant_lower_bound (frame_size);
4885
4886   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4887     {
4888       if (crtl->is_leaf && !cfun->calls_alloca)
4889         {
4890           if (maybe_gt (frame_size, PROBE_INTERVAL)
4891               && maybe_gt (frame_size, get_stack_check_protect ()))
4892             aarch64_emit_probe_stack_range (get_stack_check_protect (),
4893                                             (frame_size
4894                                              - get_stack_check_protect ()));
4895         }
4896       else if (maybe_gt (frame_size, 0))
4897         aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4898     }
4899
4900   rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4901   rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4902
4903   aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4904
4905   if (callee_adjust != 0)
4906     aarch64_push_regs (reg1, reg2, callee_adjust);
4907
4908   if (emit_frame_chain)
4909     {
4910       poly_int64 reg_offset = callee_adjust;
4911       if (callee_adjust == 0)
4912         {
4913           reg1 = R29_REGNUM;
4914           reg2 = R30_REGNUM;
4915           reg_offset = callee_offset;
4916           aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4917         }
4918       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4919                           stack_pointer_rtx, callee_offset,
4920                           ip1_rtx, ip0_rtx, frame_pointer_needed);
4921       if (frame_pointer_needed && !frame_size.is_constant ())
4922         {
4923           /* Variable-sized frames need to describe the save slot
4924              address using DW_CFA_expression rather than DW_CFA_offset.
4925              This means that, without taking further action, the
4926              locations of the registers that we've already saved would
4927              remain based on the stack pointer even after we redefine
4928              the CFA based on the frame pointer.  We therefore need new
4929              DW_CFA_expressions to re-express the save slots with addresses
4930              based on the frame pointer.  */
4931           rtx_insn *insn = get_last_insn ();
4932           gcc_assert (RTX_FRAME_RELATED_P (insn));
4933
4934           /* Add an explicit CFA definition if this was previously
4935              implicit.  */
4936           if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4937             {
4938               rtx src = plus_constant (Pmode, stack_pointer_rtx,
4939                                        callee_offset);
4940               add_reg_note (insn, REG_CFA_ADJUST_CFA,
4941                             gen_rtx_SET (hard_frame_pointer_rtx, src));
4942             }
4943
4944           /* Change the save slot expressions for the registers that
4945              we've already saved.  */
4946           reg_offset -= callee_offset;
4947           aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4948                                       reg_offset + UNITS_PER_WORD);
4949           aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4950                                       reg_offset);
4951         }
4952       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4953     }
4954
4955   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4956                              callee_adjust != 0 || emit_frame_chain);
4957   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4958                              callee_adjust != 0 || emit_frame_chain);
4959   aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4960 }
4961
4962 /* Return TRUE if we can use a simple_return insn.
4963
4964    This function checks whether the callee saved stack is empty, which
4965    means no restore actions are need. The pro_and_epilogue will use
4966    this to check whether shrink-wrapping opt is feasible.  */
4967
4968 bool
4969 aarch64_use_return_insn_p (void)
4970 {
4971   if (!reload_completed)
4972     return false;
4973
4974   if (crtl->profile)
4975     return false;
4976
4977   aarch64_layout_frame ();
4978
4979   return known_eq (cfun->machine->frame.frame_size, 0);
4980 }
4981
4982 /* Generate the epilogue instructions for returning from a function.
4983    This is almost exactly the reverse of the prolog sequence, except
4984    that we need to insert barriers to avoid scheduling loads that read
4985    from a deallocated stack, and we optimize the unwind records by
4986    emitting them all together if possible.  */
4987 void
4988 aarch64_expand_epilogue (bool for_sibcall)
4989 {
4990   aarch64_layout_frame ();
4991
4992   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4993   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4994   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4995   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4996   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4997   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4998   rtx cfi_ops = NULL;
4999   rtx_insn *insn;
5000   /* A stack clash protection prologue may not have left IP0_REGNUM or
5001      IP1_REGNUM in a usable state.  The same is true for allocations
5002      with an SVE component, since we then need both temporary registers
5003      for each allocation.  */
5004   bool can_inherit_p = (initial_adjust.is_constant ()
5005                         && final_adjust.is_constant ()
5006                         && !flag_stack_clash_protection);
5007
5008   /* We need to add memory barrier to prevent read from deallocated stack.  */
5009   bool need_barrier_p
5010     = maybe_ne (get_frame_size ()
5011                 + cfun->machine->frame.saved_varargs_size, 0);
5012
5013   /* Emit a barrier to prevent loads from a deallocated stack.  */
5014   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5015       || cfun->calls_alloca
5016       || crtl->calls_eh_return)
5017     {
5018       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5019       need_barrier_p = false;
5020     }
5021
5022   /* Restore the stack pointer from the frame pointer if it may not
5023      be the same as the stack pointer.  */
5024   rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5025   rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5026   if (frame_pointer_needed
5027       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5028     /* If writeback is used when restoring callee-saves, the CFA
5029        is restored on the instruction doing the writeback.  */
5030     aarch64_add_offset (Pmode, stack_pointer_rtx,
5031                         hard_frame_pointer_rtx, -callee_offset,
5032                         ip1_rtx, ip0_rtx, callee_adjust == 0);
5033   else
5034     aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5035                     !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5036
5037   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5038                                 callee_adjust != 0, &cfi_ops);
5039   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5040                                 callee_adjust != 0, &cfi_ops);
5041
5042   if (need_barrier_p)
5043     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5044
5045   if (callee_adjust != 0)
5046     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5047
5048   if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5049     {
5050       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
5051       insn = get_last_insn ();
5052       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5053       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5054       RTX_FRAME_RELATED_P (insn) = 1;
5055       cfi_ops = NULL;
5056     }
5057
5058   aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5059                   !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5060
5061   if (cfi_ops)
5062     {
5063       /* Emit delayed restores and reset the CFA to be SP.  */
5064       insn = get_last_insn ();
5065       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5066       REG_NOTES (insn) = cfi_ops;
5067       RTX_FRAME_RELATED_P (insn) = 1;
5068     }
5069
5070   /* We prefer to emit the combined return/authenticate instruction RETAA,
5071      however there are three cases in which we must instead emit an explicit
5072      authentication instruction.
5073
5074         1) Sibcalls don't return in a normal way, so if we're about to call one
5075            we must authenticate.
5076
5077         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5078            generating code for !TARGET_ARMV8_3 we can't use it and must
5079            explicitly authenticate.
5080
5081         3) On an eh_return path we make extra stack adjustments to update the
5082            canonical frame address to be the exception handler's CFA.  We want
5083            to authenticate using the CFA of the function which calls eh_return.
5084     */
5085   if (aarch64_return_address_signing_enabled ()
5086       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5087     {
5088       insn = emit_insn (gen_autisp ());
5089       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5090       RTX_FRAME_RELATED_P (insn) = 1;
5091     }
5092
5093   /* Stack adjustment for exception handler.  */
5094   if (crtl->calls_eh_return)
5095     {
5096       /* We need to unwind the stack by the offset computed by
5097          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
5098          to be SP; letting the CFA move during this adjustment
5099          is just as correct as retaining the CFA from the body
5100          of the function.  Therefore, do nothing special.  */
5101       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5102     }
5103
5104   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5105   if (!for_sibcall)
5106     emit_jump_insn (ret_rtx);
5107 }
5108
5109 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
5110    normally or return to a previous frame after unwinding.
5111
5112    An EH return uses a single shared return sequence.  The epilogue is
5113    exactly like a normal epilogue except that it has an extra input
5114    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5115    that must be applied after the frame has been destroyed.  An extra label
5116    is inserted before the epilogue which initializes this register to zero,
5117    and this is the entry point for a normal return.
5118
5119    An actual EH return updates the return address, initializes the stack
5120    adjustment and jumps directly into the epilogue (bypassing the zeroing
5121    of the adjustment).  Since the return address is typically saved on the
5122    stack when a function makes a call, the saved LR must be updated outside
5123    the epilogue.
5124
5125    This poses problems as the store is generated well before the epilogue,
5126    so the offset of LR is not known yet.  Also optimizations will remove the
5127    store as it appears dead, even after the epilogue is generated (as the
5128    base or offset for loading LR is different in many cases).
5129
5130    To avoid these problems this implementation forces the frame pointer
5131    in eh_return functions so that the location of LR is fixed and known early.
5132    It also marks the store volatile, so no optimization is permitted to
5133    remove the store.  */
5134 rtx
5135 aarch64_eh_return_handler_rtx (void)
5136 {
5137   rtx tmp = gen_frame_mem (Pmode,
5138     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5139
5140   /* Mark the store volatile, so no optimization is permitted to remove it.  */
5141   MEM_VOLATILE_P (tmp) = true;
5142   return tmp;
5143 }
5144
5145 /* Output code to add DELTA to the first argument, and then jump
5146    to FUNCTION.  Used for C++ multiple inheritance.  */
5147 static void
5148 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5149                          HOST_WIDE_INT delta,
5150                          HOST_WIDE_INT vcall_offset,
5151                          tree function)
5152 {
5153   /* The this pointer is always in x0.  Note that this differs from
5154      Arm where the this pointer maybe bumped to r1 if r0 is required
5155      to return a pointer to an aggregate.  On AArch64 a result value
5156      pointer will be in x8.  */
5157   int this_regno = R0_REGNUM;
5158   rtx this_rtx, temp0, temp1, addr, funexp;
5159   rtx_insn *insn;
5160
5161   reload_completed = 1;
5162   emit_note (NOTE_INSN_PROLOGUE_END);
5163
5164   this_rtx = gen_rtx_REG (Pmode, this_regno);
5165   temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5166   temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5167
5168   if (vcall_offset == 0)
5169     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5170   else
5171     {
5172       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5173
5174       addr = this_rtx;
5175       if (delta != 0)
5176         {
5177           if (delta >= -256 && delta < 256)
5178             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5179                                        plus_constant (Pmode, this_rtx, delta));
5180           else
5181             aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5182                                 temp1, temp0, false);
5183         }
5184
5185       if (Pmode == ptr_mode)
5186         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5187       else
5188         aarch64_emit_move (temp0,
5189                            gen_rtx_ZERO_EXTEND (Pmode,
5190                                                 gen_rtx_MEM (ptr_mode, addr)));
5191
5192       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5193           addr = plus_constant (Pmode, temp0, vcall_offset);
5194       else
5195         {
5196           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5197                                           Pmode);
5198           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5199         }
5200
5201       if (Pmode == ptr_mode)
5202         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5203       else
5204         aarch64_emit_move (temp1,
5205                            gen_rtx_SIGN_EXTEND (Pmode,
5206                                                 gen_rtx_MEM (ptr_mode, addr)));
5207
5208       emit_insn (gen_add2_insn (this_rtx, temp1));
5209     }
5210
5211   /* Generate a tail call to the target function.  */
5212   if (!TREE_USED (function))
5213     {
5214       assemble_external (function);
5215       TREE_USED (function) = 1;
5216     }
5217   funexp = XEXP (DECL_RTL (function), 0);
5218   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5219   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5220   SIBLING_CALL_P (insn) = 1;
5221
5222   insn = get_insns ();
5223   shorten_branches (insn);
5224   final_start_function (insn, file, 1);
5225   final (insn, file, 1);
5226   final_end_function ();
5227
5228   /* Stop pretending to be a post-reload pass.  */
5229   reload_completed = 0;
5230 }
5231
5232 static bool
5233 aarch64_tls_referenced_p (rtx x)
5234 {
5235   if (!TARGET_HAVE_TLS)
5236     return false;
5237   subrtx_iterator::array_type array;
5238   FOR_EACH_SUBRTX (iter, array, x, ALL)
5239     {
5240       const_rtx x = *iter;
5241       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5242         return true;
5243       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5244          TLS offsets, not real symbol references.  */
5245       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5246         iter.skip_subrtxes ();
5247     }
5248   return false;
5249 }
5250
5251
5252 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5253    a left shift of 0 or 12 bits.  */
5254 bool
5255 aarch64_uimm12_shift (HOST_WIDE_INT val)
5256 {
5257   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5258           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5259           );
5260 }
5261
5262
5263 /* Return true if val is an immediate that can be loaded into a
5264    register by a MOVZ instruction.  */
5265 static bool
5266 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5267 {
5268   if (GET_MODE_SIZE (mode) > 4)
5269     {
5270       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5271           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5272         return 1;
5273     }
5274   else
5275     {
5276       /* Ignore sign extension.  */
5277       val &= (HOST_WIDE_INT) 0xffffffff;
5278     }
5279   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5280           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5281 }
5282
5283 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
5284    64-bit (DImode) integer.  */
5285
5286 static unsigned HOST_WIDE_INT
5287 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5288 {
5289   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5290   while (size < 64)
5291     {
5292       val &= (HOST_WIDE_INT_1U << size) - 1;
5293       val |= val << size;
5294       size *= 2;
5295     }
5296   return val;
5297 }
5298
5299 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
5300
5301 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5302   {
5303     0x0000000100000001ull,
5304     0x0001000100010001ull,
5305     0x0101010101010101ull,
5306     0x1111111111111111ull,
5307     0x5555555555555555ull,
5308   };
5309
5310
5311 /* Return true if val is a valid bitmask immediate.  */
5312
5313 bool
5314 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5315 {
5316   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5317   int bits;
5318
5319   /* Check for a single sequence of one bits and return quickly if so.
5320      The special cases of all ones and all zeroes returns false.  */
5321   val = aarch64_replicate_bitmask_imm (val_in, mode);
5322   tmp = val + (val & -val);
5323
5324   if (tmp == (tmp & -tmp))
5325     return (val + 1) > 1;
5326
5327   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
5328   if (mode == SImode)
5329     val = (val << 32) | (val & 0xffffffff);
5330
5331   /* Invert if the immediate doesn't start with a zero bit - this means we
5332      only need to search for sequences of one bits.  */
5333   if (val & 1)
5334     val = ~val;
5335
5336   /* Find the first set bit and set tmp to val with the first sequence of one
5337      bits removed.  Return success if there is a single sequence of ones.  */
5338   first_one = val & -val;
5339   tmp = val & (val + first_one);
5340
5341   if (tmp == 0)
5342     return true;
5343
5344   /* Find the next set bit and compute the difference in bit position.  */
5345   next_one = tmp & -tmp;
5346   bits = clz_hwi (first_one) - clz_hwi (next_one);
5347   mask = val ^ tmp;
5348
5349   /* Check the bit position difference is a power of 2, and that the first
5350      sequence of one bits fits within 'bits' bits.  */
5351   if ((mask >> bits) != 0 || bits != (bits & -bits))
5352     return false;
5353
5354   /* Check the sequence of one bits is repeated 64/bits times.  */
5355   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5356 }
5357
5358 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5359    Assumed precondition: VAL_IN Is not zero.  */
5360
5361 unsigned HOST_WIDE_INT
5362 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5363 {
5364   int lowest_bit_set = ctz_hwi (val_in);
5365   int highest_bit_set = floor_log2 (val_in);
5366   gcc_assert (val_in != 0);
5367
5368   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5369           (HOST_WIDE_INT_1U << lowest_bit_set));
5370 }
5371
5372 /* Create constant where bits outside of lowest bit set to highest bit set
5373    are set to 1.  */
5374
5375 unsigned HOST_WIDE_INT
5376 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5377 {
5378   return val_in | ~aarch64_and_split_imm1 (val_in);
5379 }
5380
5381 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
5382
5383 bool
5384 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5385 {
5386   scalar_int_mode int_mode;
5387   if (!is_a <scalar_int_mode> (mode, &int_mode))
5388     return false;
5389
5390   if (aarch64_bitmask_imm (val_in, int_mode))
5391     return false;
5392
5393   if (aarch64_move_imm (val_in, int_mode))
5394     return false;
5395
5396   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5397
5398   return aarch64_bitmask_imm (imm2, int_mode);
5399 }
5400
5401 /* Return true if val is an immediate that can be loaded into a
5402    register in a single instruction.  */
5403 bool
5404 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5405 {
5406   scalar_int_mode int_mode;
5407   if (!is_a <scalar_int_mode> (mode, &int_mode))
5408     return false;
5409
5410   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5411     return 1;
5412   return aarch64_bitmask_imm (val, int_mode);
5413 }
5414
5415 static bool
5416 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5417 {
5418   rtx base, offset;
5419
5420   if (GET_CODE (x) == HIGH)
5421     return true;
5422
5423   /* There's no way to calculate VL-based values using relocations.  */
5424   subrtx_iterator::array_type array;
5425   FOR_EACH_SUBRTX (iter, array, x, ALL)
5426     if (GET_CODE (*iter) == CONST_POLY_INT)
5427       return true;
5428
5429   split_const (x, &base, &offset);
5430   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5431     {
5432       if (aarch64_classify_symbol (base, INTVAL (offset))
5433           != SYMBOL_FORCE_TO_MEM)
5434         return true;
5435       else
5436         /* Avoid generating a 64-bit relocation in ILP32; leave
5437            to aarch64_expand_mov_immediate to handle it properly.  */
5438         return mode != ptr_mode;
5439     }
5440
5441   return aarch64_tls_referenced_p (x);
5442 }
5443
5444 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5445    The expansion for a table switch is quite expensive due to the number
5446    of instructions, the table lookup and hard to predict indirect jump.
5447    When optimizing for speed, and -O3 enabled, use the per-core tuning if
5448    set, otherwise use tables for > 16 cases as a tradeoff between size and
5449    performance.  When optimizing for size, use the default setting.  */
5450
5451 static unsigned int
5452 aarch64_case_values_threshold (void)
5453 {
5454   /* Use the specified limit for the number of cases before using jump
5455      tables at higher optimization levels.  */
5456   if (optimize > 2
5457       && selected_cpu->tune->max_case_values != 0)
5458     return selected_cpu->tune->max_case_values;
5459   else
5460     return optimize_size ? default_case_values_threshold () : 17;
5461 }
5462
5463 /* Return true if register REGNO is a valid index register.
5464    STRICT_P is true if REG_OK_STRICT is in effect.  */
5465
5466 bool
5467 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5468 {
5469   if (!HARD_REGISTER_NUM_P (regno))
5470     {
5471       if (!strict_p)
5472         return true;
5473
5474       if (!reg_renumber)
5475         return false;
5476
5477       regno = reg_renumber[regno];
5478     }
5479   return GP_REGNUM_P (regno);
5480 }
5481
5482 /* Return true if register REGNO is a valid base register for mode MODE.
5483    STRICT_P is true if REG_OK_STRICT is in effect.  */
5484
5485 bool
5486 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5487 {
5488   if (!HARD_REGISTER_NUM_P (regno))
5489     {
5490       if (!strict_p)
5491         return true;
5492
5493       if (!reg_renumber)
5494         return false;
5495
5496       regno = reg_renumber[regno];
5497     }
5498
5499   /* The fake registers will be eliminated to either the stack or
5500      hard frame pointer, both of which are usually valid base registers.
5501      Reload deals with the cases where the eliminated form isn't valid.  */
5502   return (GP_REGNUM_P (regno)
5503           || regno == SP_REGNUM
5504           || regno == FRAME_POINTER_REGNUM
5505           || regno == ARG_POINTER_REGNUM);
5506 }
5507
5508 /* Return true if X is a valid base register for mode MODE.
5509    STRICT_P is true if REG_OK_STRICT is in effect.  */
5510
5511 static bool
5512 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5513 {
5514   if (!strict_p
5515       && GET_CODE (x) == SUBREG
5516       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5517     x = SUBREG_REG (x);
5518
5519   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5520 }
5521
5522 /* Return true if address offset is a valid index.  If it is, fill in INFO
5523    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
5524
5525 static bool
5526 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5527                         machine_mode mode, bool strict_p)
5528 {
5529   enum aarch64_address_type type;
5530   rtx index;
5531   int shift;
5532
5533   /* (reg:P) */
5534   if ((REG_P (x) || GET_CODE (x) == SUBREG)
5535       && GET_MODE (x) == Pmode)
5536     {
5537       type = ADDRESS_REG_REG;
5538       index = x;
5539       shift = 0;
5540     }
5541   /* (sign_extend:DI (reg:SI)) */
5542   else if ((GET_CODE (x) == SIGN_EXTEND
5543             || GET_CODE (x) == ZERO_EXTEND)
5544            && GET_MODE (x) == DImode
5545            && GET_MODE (XEXP (x, 0)) == SImode)
5546     {
5547       type = (GET_CODE (x) == SIGN_EXTEND)
5548         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5549       index = XEXP (x, 0);
5550       shift = 0;
5551     }
5552   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5553   else if (GET_CODE (x) == MULT
5554            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5555                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5556            && GET_MODE (XEXP (x, 0)) == DImode
5557            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5558            && CONST_INT_P (XEXP (x, 1)))
5559     {
5560       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5561         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5562       index = XEXP (XEXP (x, 0), 0);
5563       shift = exact_log2 (INTVAL (XEXP (x, 1)));
5564     }
5565   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5566   else if (GET_CODE (x) == ASHIFT
5567            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5568                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5569            && GET_MODE (XEXP (x, 0)) == DImode
5570            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5571            && CONST_INT_P (XEXP (x, 1)))
5572     {
5573       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5574         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5575       index = XEXP (XEXP (x, 0), 0);
5576       shift = INTVAL (XEXP (x, 1));
5577     }
5578   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5579   else if ((GET_CODE (x) == SIGN_EXTRACT
5580             || GET_CODE (x) == ZERO_EXTRACT)
5581            && GET_MODE (x) == DImode
5582            && GET_CODE (XEXP (x, 0)) == MULT
5583            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5584            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5585     {
5586       type = (GET_CODE (x) == SIGN_EXTRACT)
5587         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5588       index = XEXP (XEXP (x, 0), 0);
5589       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5590       if (INTVAL (XEXP (x, 1)) != 32 + shift
5591           || INTVAL (XEXP (x, 2)) != 0)
5592         shift = -1;
5593     }
5594   /* (and:DI (mult:DI (reg:DI) (const_int scale))
5595      (const_int 0xffffffff<<shift)) */
5596   else if (GET_CODE (x) == AND
5597            && GET_MODE (x) == DImode
5598            && GET_CODE (XEXP (x, 0)) == MULT
5599            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5600            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5601            && CONST_INT_P (XEXP (x, 1)))
5602     {
5603       type = ADDRESS_REG_UXTW;
5604       index = XEXP (XEXP (x, 0), 0);
5605       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5606       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5607         shift = -1;
5608     }
5609   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5610   else if ((GET_CODE (x) == SIGN_EXTRACT
5611             || GET_CODE (x) == ZERO_EXTRACT)
5612            && GET_MODE (x) == DImode
5613            && GET_CODE (XEXP (x, 0)) == ASHIFT
5614            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5615            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5616     {
5617       type = (GET_CODE (x) == SIGN_EXTRACT)
5618         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5619       index = XEXP (XEXP (x, 0), 0);
5620       shift = INTVAL (XEXP (XEXP (x, 0), 1));
5621       if (INTVAL (XEXP (x, 1)) != 32 + shift
5622           || INTVAL (XEXP (x, 2)) != 0)
5623         shift = -1;
5624     }
5625   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5626      (const_int 0xffffffff<<shift)) */
5627   else if (GET_CODE (x) == AND
5628            && GET_MODE (x) == DImode
5629            && GET_CODE (XEXP (x, 0)) == ASHIFT
5630            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5631            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5632            && CONST_INT_P (XEXP (x, 1)))
5633     {
5634       type = ADDRESS_REG_UXTW;
5635       index = XEXP (XEXP (x, 0), 0);
5636       shift = INTVAL (XEXP (XEXP (x, 0), 1));
5637       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5638         shift = -1;
5639     }
5640   /* (mult:P (reg:P) (const_int scale)) */
5641   else if (GET_CODE (x) == MULT
5642            && GET_MODE (x) == Pmode
5643            && GET_MODE (XEXP (x, 0)) == Pmode
5644            && CONST_INT_P (XEXP (x, 1)))
5645     {
5646       type = ADDRESS_REG_REG;
5647       index = XEXP (x, 0);
5648       shift = exact_log2 (INTVAL (XEXP (x, 1)));
5649     }
5650   /* (ashift:P (reg:P) (const_int shift)) */
5651   else if (GET_CODE (x) == ASHIFT
5652            && GET_MODE (x) == Pmode
5653            && GET_MODE (XEXP (x, 0)) == Pmode
5654            && CONST_INT_P (XEXP (x, 1)))
5655     {
5656       type = ADDRESS_REG_REG;
5657       index = XEXP (x, 0);
5658       shift = INTVAL (XEXP (x, 1));
5659     }
5660   else
5661     return false;
5662
5663   if (!strict_p
5664       && GET_CODE (index) == SUBREG
5665       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5666     index = SUBREG_REG (index);
5667
5668   if (aarch64_sve_data_mode_p (mode))
5669     {
5670       if (type != ADDRESS_REG_REG
5671           || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5672         return false;
5673     }
5674   else
5675     {
5676       if (shift != 0
5677           && !(IN_RANGE (shift, 1, 3)
5678                && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5679         return false;
5680     }
5681
5682   if (REG_P (index)
5683       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5684     {
5685       info->type = type;
5686       info->offset = index;
5687       info->shift = shift;
5688       return true;
5689     }
5690
5691   return false;
5692 }
5693
5694 /* Return true if MODE is one of the modes for which we
5695    support LDP/STP operations.  */
5696
5697 static bool
5698 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5699 {
5700   return mode == SImode || mode == DImode
5701          || mode == SFmode || mode == DFmode
5702          || (aarch64_vector_mode_supported_p (mode)
5703              && (known_eq (GET_MODE_SIZE (mode), 8)
5704                  || (known_eq (GET_MODE_SIZE (mode), 16)
5705                     && (aarch64_tune_params.extra_tuning_flags
5706                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5707 }
5708
5709 /* Return true if REGNO is a virtual pointer register, or an eliminable
5710    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
5711    include stack_pointer or hard_frame_pointer.  */
5712 static bool
5713 virt_or_elim_regno_p (unsigned regno)
5714 {
5715   return ((regno >= FIRST_VIRTUAL_REGISTER
5716            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5717           || regno == FRAME_POINTER_REGNUM
5718           || regno == ARG_POINTER_REGNUM);
5719 }
5720
5721 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5722    If it is, fill in INFO appropriately.  STRICT_P is true if
5723    REG_OK_STRICT is in effect.  */
5724
5725 static bool
5726 aarch64_classify_address (struct aarch64_address_info *info,
5727                           rtx x, machine_mode mode, bool strict_p,
5728                           aarch64_addr_query_type type = ADDR_QUERY_M)
5729 {
5730   enum rtx_code code = GET_CODE (x);
5731   rtx op0, op1;
5732   poly_int64 offset;
5733
5734   HOST_WIDE_INT const_size;
5735
5736   /* On BE, we use load/store pair for all large int mode load/stores.
5737      TI/TFmode may also use a load/store pair.  */
5738   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5739   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5740   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5741                             || mode == TImode
5742                             || mode == TFmode
5743                             || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5744
5745   bool allow_reg_index_p = (!load_store_pair_p
5746                             && (known_lt (GET_MODE_SIZE (mode), 16)
5747                                 || vec_flags == VEC_ADVSIMD
5748                                 || vec_flags == VEC_SVE_DATA));
5749
5750   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5751      [Rn, #offset, MUL VL].  */
5752   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5753       && (code != REG && code != PLUS))
5754     return false;
5755
5756   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5757      REG addressing.  */
5758   if (advsimd_struct_p
5759       && !BYTES_BIG_ENDIAN
5760       && (code != POST_INC && code != REG))
5761     return false;
5762
5763   gcc_checking_assert (GET_MODE (x) == VOIDmode
5764                        || SCALAR_INT_MODE_P (GET_MODE (x)));
5765
5766   switch (code)
5767     {
5768     case REG:
5769     case SUBREG:
5770       info->type = ADDRESS_REG_IMM;
5771       info->base = x;
5772       info->offset = const0_rtx;
5773       info->const_offset = 0;
5774       return aarch64_base_register_rtx_p (x, strict_p);
5775
5776     case PLUS:
5777       op0 = XEXP (x, 0);
5778       op1 = XEXP (x, 1);
5779
5780       if (! strict_p
5781           && REG_P (op0)
5782           && virt_or_elim_regno_p (REGNO (op0))
5783           && poly_int_rtx_p (op1, &offset))
5784         {
5785           info->type = ADDRESS_REG_IMM;
5786           info->base = op0;
5787           info->offset = op1;
5788           info->const_offset = offset;
5789
5790           return true;
5791         }
5792
5793       if (maybe_ne (GET_MODE_SIZE (mode), 0)
5794           && aarch64_base_register_rtx_p (op0, strict_p)
5795           && poly_int_rtx_p (op1, &offset))
5796         {
5797           info->type = ADDRESS_REG_IMM;
5798           info->base = op0;
5799           info->offset = op1;
5800           info->const_offset = offset;
5801
5802           /* TImode and TFmode values are allowed in both pairs of X
5803              registers and individual Q registers.  The available
5804              address modes are:
5805              X,X: 7-bit signed scaled offset
5806              Q:   9-bit signed offset
5807              We conservatively require an offset representable in either mode.
5808              When performing the check for pairs of X registers i.e.  LDP/STP
5809              pass down DImode since that is the natural size of the LDP/STP
5810              instruction memory accesses.  */
5811           if (mode == TImode || mode == TFmode)
5812             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5813                     && (offset_9bit_signed_unscaled_p (mode, offset)
5814                         || offset_12bit_unsigned_scaled_p (mode, offset)));
5815
5816           /* A 7bit offset check because OImode will emit a ldp/stp
5817              instruction (only big endian will get here).
5818              For ldp/stp instructions, the offset is scaled for the size of a
5819              single element of the pair.  */
5820           if (mode == OImode)
5821             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5822
5823           /* Three 9/12 bit offsets checks because CImode will emit three
5824              ldr/str instructions (only big endian will get here).  */
5825           if (mode == CImode)
5826             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5827                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5828                         || offset_12bit_unsigned_scaled_p (V16QImode,
5829                                                            offset + 32)));
5830
5831           /* Two 7bit offsets checks because XImode will emit two ldp/stp
5832              instructions (only big endian will get here).  */
5833           if (mode == XImode)
5834             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5835                     && aarch64_offset_7bit_signed_scaled_p (TImode,
5836                                                             offset + 32));
5837
5838           /* Make "m" use the LD1 offset range for SVE data modes, so
5839              that pre-RTL optimizers like ivopts will work to that
5840              instead of the wider LDR/STR range.  */
5841           if (vec_flags == VEC_SVE_DATA)
5842             return (type == ADDR_QUERY_M
5843                     ? offset_4bit_signed_scaled_p (mode, offset)
5844                     : offset_9bit_signed_scaled_p (mode, offset));
5845
5846           if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5847             {
5848               poly_int64 end_offset = (offset
5849                                        + GET_MODE_SIZE (mode)
5850                                        - BYTES_PER_SVE_VECTOR);
5851               return (type == ADDR_QUERY_M
5852                       ? offset_4bit_signed_scaled_p (mode, offset)
5853                       : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5854                          && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5855                                                          end_offset)));
5856             }
5857
5858           if (vec_flags == VEC_SVE_PRED)
5859             return offset_9bit_signed_scaled_p (mode, offset);
5860
5861           if (load_store_pair_p)
5862             return ((known_eq (GET_MODE_SIZE (mode), 4)
5863                      || known_eq (GET_MODE_SIZE (mode), 8)
5864                      || known_eq (GET_MODE_SIZE (mode), 16))
5865                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5866           else
5867             return (offset_9bit_signed_unscaled_p (mode, offset)
5868                     || offset_12bit_unsigned_scaled_p (mode, offset));
5869         }
5870
5871       if (allow_reg_index_p)
5872         {
5873           /* Look for base + (scaled/extended) index register.  */
5874           if (aarch64_base_register_rtx_p (op0, strict_p)
5875               && aarch64_classify_index (info, op1, mode, strict_p))
5876             {
5877               info->base = op0;
5878               return true;
5879             }
5880           if (aarch64_base_register_rtx_p (op1, strict_p)
5881               && aarch64_classify_index (info, op0, mode, strict_p))
5882             {
5883               info->base = op1;
5884               return true;
5885             }
5886         }
5887
5888       return false;
5889
5890     case POST_INC:
5891     case POST_DEC:
5892     case PRE_INC:
5893     case PRE_DEC:
5894       info->type = ADDRESS_REG_WB;
5895       info->base = XEXP (x, 0);
5896       info->offset = NULL_RTX;
5897       return aarch64_base_register_rtx_p (info->base, strict_p);
5898
5899     case POST_MODIFY:
5900     case PRE_MODIFY:
5901       info->type = ADDRESS_REG_WB;
5902       info->base = XEXP (x, 0);
5903       if (GET_CODE (XEXP (x, 1)) == PLUS
5904           && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5905           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5906           && aarch64_base_register_rtx_p (info->base, strict_p))
5907         {
5908           info->offset = XEXP (XEXP (x, 1), 1);
5909           info->const_offset = offset;
5910
5911           /* TImode and TFmode values are allowed in both pairs of X
5912              registers and individual Q registers.  The available
5913              address modes are:
5914              X,X: 7-bit signed scaled offset
5915              Q:   9-bit signed offset
5916              We conservatively require an offset representable in either mode.
5917            */
5918           if (mode == TImode || mode == TFmode)
5919             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5920                     && offset_9bit_signed_unscaled_p (mode, offset));
5921
5922           if (load_store_pair_p)
5923             return ((known_eq (GET_MODE_SIZE (mode), 4)
5924                      || known_eq (GET_MODE_SIZE (mode), 8)
5925                      || known_eq (GET_MODE_SIZE (mode), 16))
5926                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5927           else
5928             return offset_9bit_signed_unscaled_p (mode, offset);
5929         }
5930       return false;
5931
5932     case CONST:
5933     case SYMBOL_REF:
5934     case LABEL_REF:
5935       /* load literal: pc-relative constant pool entry.  Only supported
5936          for SI mode or larger.  */
5937       info->type = ADDRESS_SYMBOLIC;
5938
5939       if (!load_store_pair_p
5940           && GET_MODE_SIZE (mode).is_constant (&const_size)
5941           && const_size >= 4)
5942         {
5943           rtx sym, addend;
5944
5945           split_const (x, &sym, &addend);
5946           return ((GET_CODE (sym) == LABEL_REF
5947                    || (GET_CODE (sym) == SYMBOL_REF
5948                        && CONSTANT_POOL_ADDRESS_P (sym)
5949                        && aarch64_pcrelative_literal_loads)));
5950         }
5951       return false;
5952
5953     case LO_SUM:
5954       info->type = ADDRESS_LO_SUM;
5955       info->base = XEXP (x, 0);
5956       info->offset = XEXP (x, 1);
5957       if (allow_reg_index_p
5958           && aarch64_base_register_rtx_p (info->base, strict_p))
5959         {
5960           rtx sym, offs;
5961           split_const (info->offset, &sym, &offs);
5962           if (GET_CODE (sym) == SYMBOL_REF
5963               && (aarch64_classify_symbol (sym, INTVAL (offs))
5964                   == SYMBOL_SMALL_ABSOLUTE))
5965             {
5966               /* The symbol and offset must be aligned to the access size.  */
5967               unsigned int align;
5968
5969               if (CONSTANT_POOL_ADDRESS_P (sym))
5970                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5971               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5972                 {
5973                   tree exp = SYMBOL_REF_DECL (sym);
5974                   align = TYPE_ALIGN (TREE_TYPE (exp));
5975                   align = aarch64_constant_alignment (exp, align);
5976                 }
5977               else if (SYMBOL_REF_DECL (sym))
5978                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5979               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5980                        && SYMBOL_REF_BLOCK (sym) != NULL)
5981                 align = SYMBOL_REF_BLOCK (sym)->alignment;
5982               else
5983                 align = BITS_PER_UNIT;
5984
5985               poly_int64 ref_size = GET_MODE_SIZE (mode);
5986               if (known_eq (ref_size, 0))
5987                 ref_size = GET_MODE_SIZE (DImode);
5988
5989               return (multiple_p (INTVAL (offs), ref_size)
5990                       && multiple_p (align / BITS_PER_UNIT, ref_size));
5991             }
5992         }
5993       return false;
5994
5995     default:
5996       return false;
5997     }
5998 }
5999
6000 /* Return true if the address X is valid for a PRFM instruction.
6001    STRICT_P is true if we should do strict checking with
6002    aarch64_classify_address.  */
6003
6004 bool
6005 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6006 {
6007   struct aarch64_address_info addr;
6008
6009   /* PRFM accepts the same addresses as DImode...  */
6010   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6011   if (!res)
6012     return false;
6013
6014   /* ... except writeback forms.  */
6015   return addr.type != ADDRESS_REG_WB;
6016 }
6017
6018 bool
6019 aarch64_symbolic_address_p (rtx x)
6020 {
6021   rtx offset;
6022
6023   split_const (x, &x, &offset);
6024   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6025 }
6026
6027 /* Classify the base of symbolic expression X.  */
6028
6029 enum aarch64_symbol_type
6030 aarch64_classify_symbolic_expression (rtx x)
6031 {
6032   rtx offset;
6033
6034   split_const (x, &x, &offset);
6035   return aarch64_classify_symbol (x, INTVAL (offset));
6036 }
6037
6038
6039 /* Return TRUE if X is a legitimate address for accessing memory in
6040    mode MODE.  */
6041 static bool
6042 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6043 {
6044   struct aarch64_address_info addr;
6045
6046   return aarch64_classify_address (&addr, x, mode, strict_p);
6047 }
6048
6049 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6050    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
6051 bool
6052 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6053                               aarch64_addr_query_type type)
6054 {
6055   struct aarch64_address_info addr;
6056
6057   return aarch64_classify_address (&addr, x, mode, strict_p, type);
6058 }
6059
6060 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
6061
6062 static bool
6063 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6064                                          poly_int64 orig_offset,
6065                                          machine_mode mode)
6066 {
6067   HOST_WIDE_INT size;
6068   if (GET_MODE_SIZE (mode).is_constant (&size))
6069     {
6070       HOST_WIDE_INT const_offset, second_offset;
6071
6072       /* A general SVE offset is A * VQ + B.  Remove the A component from
6073          coefficient 0 in order to get the constant B.  */
6074       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6075
6076       /* Split an out-of-range address displacement into a base and
6077          offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
6078          range otherwise to increase opportunities for sharing the base
6079          address of different sizes.  Unaligned accesses use the signed
6080          9-bit range, TImode/TFmode use the intersection of signed
6081          scaled 7-bit and signed 9-bit offset.  */
6082       if (mode == TImode || mode == TFmode)
6083         second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6084       else if ((const_offset & (size - 1)) != 0)
6085         second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6086       else
6087         second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6088
6089       if (second_offset == 0 || known_eq (orig_offset, second_offset))
6090         return false;
6091
6092       /* Split the offset into second_offset and the rest.  */
6093       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6094       *offset2 = gen_int_mode (second_offset, Pmode);
6095       return true;
6096     }
6097   else
6098     {
6099       /* Get the mode we should use as the basis of the range.  For structure
6100          modes this is the mode of one vector.  */
6101       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6102       machine_mode step_mode
6103         = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6104
6105       /* Get the "mul vl" multiplier we'd like to use.  */
6106       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6107       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6108       if (vec_flags & VEC_SVE_DATA)
6109         /* LDR supports a 9-bit range, but the move patterns for
6110            structure modes require all vectors to be in range of the
6111            same base.  The simplest way of accomodating that while still
6112            promoting reuse of anchor points between different modes is
6113            to use an 8-bit range unconditionally.  */
6114         vnum = ((vnum + 128) & 255) - 128;
6115       else
6116         /* Predicates are only handled singly, so we might as well use
6117            the full range.  */
6118         vnum = ((vnum + 256) & 511) - 256;
6119       if (vnum == 0)
6120         return false;
6121
6122       /* Convert the "mul vl" multiplier into a byte offset.  */
6123       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6124       if (known_eq (second_offset, orig_offset))
6125         return false;
6126
6127       /* Split the offset into second_offset and the rest.  */
6128       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6129       *offset2 = gen_int_mode (second_offset, Pmode);
6130       return true;
6131     }
6132 }
6133
6134 /* Return the binary representation of floating point constant VALUE in INTVAL.
6135    If the value cannot be converted, return false without setting INTVAL.
6136    The conversion is done in the given MODE.  */
6137 bool
6138 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6139 {
6140
6141   /* We make a general exception for 0.  */
6142   if (aarch64_float_const_zero_rtx_p (value))
6143     {
6144       *intval = 0;
6145       return true;
6146     }
6147
6148   scalar_float_mode mode;
6149   if (GET_CODE (value) != CONST_DOUBLE
6150       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6151       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6152       /* Only support up to DF mode.  */
6153       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6154     return false;
6155
6156   unsigned HOST_WIDE_INT ival = 0;
6157
6158   long res[2];
6159   real_to_target (res,
6160                   CONST_DOUBLE_REAL_VALUE (value),
6161                   REAL_MODE_FORMAT (mode));
6162
6163   if (mode == DFmode)
6164     {
6165       int order = BYTES_BIG_ENDIAN ? 1 : 0;
6166       ival = zext_hwi (res[order], 32);
6167       ival |= (zext_hwi (res[1 - order], 32) << 32);
6168     }
6169   else
6170       ival = zext_hwi (res[0], 32);
6171
6172   *intval = ival;
6173   return true;
6174 }
6175
6176 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6177    single MOV(+MOVK) followed by an FMOV.  */
6178 bool
6179 aarch64_float_const_rtx_p (rtx x)
6180 {
6181   machine_mode mode = GET_MODE (x);
6182   if (mode == VOIDmode)
6183     return false;
6184
6185   /* Determine whether it's cheaper to write float constants as
6186      mov/movk pairs over ldr/adrp pairs.  */
6187   unsigned HOST_WIDE_INT ival;
6188
6189   if (GET_CODE (x) == CONST_DOUBLE
6190       && SCALAR_FLOAT_MODE_P (mode)
6191       && aarch64_reinterpret_float_as_int (x, &ival))
6192     {
6193       scalar_int_mode imode = (mode == HFmode
6194                                ? SImode
6195                                : int_mode_for_mode (mode).require ());
6196       int num_instr = aarch64_internal_mov_immediate
6197                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6198       return num_instr < 3;
6199     }
6200
6201   return false;
6202 }
6203
6204 /* Return TRUE if rtx X is immediate constant 0.0 */
6205 bool
6206 aarch64_float_const_zero_rtx_p (rtx x)
6207 {
6208   if (GET_MODE (x) == VOIDmode)
6209     return false;
6210
6211   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6212     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6213   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6214 }
6215
6216 /* Return TRUE if rtx X is immediate constant that fits in a single
6217    MOVI immediate operation.  */
6218 bool
6219 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6220 {
6221   if (!TARGET_SIMD)
6222      return false;
6223
6224   machine_mode vmode;
6225   scalar_int_mode imode;
6226   unsigned HOST_WIDE_INT ival;
6227
6228   if (GET_CODE (x) == CONST_DOUBLE
6229       && SCALAR_FLOAT_MODE_P (mode))
6230     {
6231       if (!aarch64_reinterpret_float_as_int (x, &ival))
6232         return false;
6233
6234       /* We make a general exception for 0.  */
6235       if (aarch64_float_const_zero_rtx_p (x))
6236         return true;
6237
6238       imode = int_mode_for_mode (mode).require ();
6239     }
6240   else if (GET_CODE (x) == CONST_INT
6241            && is_a <scalar_int_mode> (mode, &imode))
6242     ival = INTVAL (x);
6243   else
6244     return false;
6245
6246    /* use a 64 bit mode for everything except for DI/DF mode, where we use
6247      a 128 bit vector mode.  */
6248   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6249
6250   vmode = aarch64_simd_container_mode (imode, width);
6251   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6252
6253   return aarch64_simd_valid_immediate (v_op, NULL);
6254 }
6255
6256
6257 /* Return the fixed registers used for condition codes.  */
6258
6259 static bool
6260 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6261 {
6262   *p1 = CC_REGNUM;
6263   *p2 = INVALID_REGNUM;
6264   return true;
6265 }
6266
6267 /* This function is used by the call expanders of the machine description.
6268    RESULT is the register in which the result is returned.  It's NULL for
6269    "call" and "sibcall".
6270    MEM is the location of the function call.
6271    SIBCALL indicates whether this function call is normal call or sibling call.
6272    It will generate different pattern accordingly.  */
6273
6274 void
6275 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6276 {
6277   rtx call, callee, tmp;
6278   rtvec vec;
6279   machine_mode mode;
6280
6281   gcc_assert (MEM_P (mem));
6282   callee = XEXP (mem, 0);
6283   mode = GET_MODE (callee);
6284   gcc_assert (mode == Pmode);
6285
6286   /* Decide if we should generate indirect calls by loading the
6287      address of the callee into a register before performing
6288      the branch-and-link.  */
6289   if (SYMBOL_REF_P (callee)
6290       ? (aarch64_is_long_call_p (callee)
6291          || aarch64_is_noplt_call_p (callee))
6292       : !REG_P (callee))
6293     XEXP (mem, 0) = force_reg (mode, callee);
6294
6295   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6296
6297   if (result != NULL_RTX)
6298     call = gen_rtx_SET (result, call);
6299
6300   if (sibcall)
6301     tmp = ret_rtx;
6302   else
6303     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6304
6305   vec = gen_rtvec (2, call, tmp);
6306   call = gen_rtx_PARALLEL (VOIDmode, vec);
6307
6308   aarch64_emit_call_insn (call);
6309 }
6310
6311 /* Emit call insn with PAT and do aarch64-specific handling.  */
6312
6313 void
6314 aarch64_emit_call_insn (rtx pat)
6315 {
6316   rtx insn = emit_call_insn (pat);
6317
6318   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6319   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6320   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6321 }
6322
6323 machine_mode
6324 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6325 {
6326   /* All floating point compares return CCFP if it is an equality
6327      comparison, and CCFPE otherwise.  */
6328   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6329     {
6330       switch (code)
6331         {
6332         case EQ:
6333         case NE:
6334         case UNORDERED:
6335         case ORDERED:
6336         case UNLT:
6337         case UNLE:
6338         case UNGT:
6339         case UNGE:
6340         case UNEQ:
6341           return CCFPmode;
6342
6343         case LT:
6344         case LE:
6345         case GT:
6346         case GE:
6347         case LTGT:
6348           return CCFPEmode;
6349
6350         default:
6351           gcc_unreachable ();
6352         }
6353     }
6354
6355   /* Equality comparisons of short modes against zero can be performed
6356      using the TST instruction with the appropriate bitmask.  */
6357   if (y == const0_rtx && REG_P (x)
6358       && (code == EQ || code == NE)
6359       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6360     return CC_NZmode;
6361
6362   /* Similarly, comparisons of zero_extends from shorter modes can
6363      be performed using an ANDS with an immediate mask.  */
6364   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6365       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6366       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6367       && (code == EQ || code == NE))
6368     return CC_NZmode;
6369
6370   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6371       && y == const0_rtx
6372       && (code == EQ || code == NE || code == LT || code == GE)
6373       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6374           || GET_CODE (x) == NEG
6375           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6376               && CONST_INT_P (XEXP (x, 2)))))
6377     return CC_NZmode;
6378
6379   /* A compare with a shifted operand.  Because of canonicalization,
6380      the comparison will have to be swapped when we emit the assembly
6381      code.  */
6382   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6383       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6384       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6385           || GET_CODE (x) == LSHIFTRT
6386           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6387     return CC_SWPmode;
6388
6389   /* Similarly for a negated operand, but we can only do this for
6390      equalities.  */
6391   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6392       && (REG_P (y) || GET_CODE (y) == SUBREG)
6393       && (code == EQ || code == NE)
6394       && GET_CODE (x) == NEG)
6395     return CC_Zmode;
6396
6397   /* A test for unsigned overflow.  */
6398   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6399       && code == NE
6400       && GET_CODE (x) == PLUS
6401       && GET_CODE (y) == ZERO_EXTEND)
6402     return CC_Cmode;
6403
6404   /* For everything else, return CCmode.  */
6405   return CCmode;
6406 }
6407
6408 static int
6409 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6410
6411 int
6412 aarch64_get_condition_code (rtx x)
6413 {
6414   machine_mode mode = GET_MODE (XEXP (x, 0));
6415   enum rtx_code comp_code = GET_CODE (x);
6416
6417   if (GET_MODE_CLASS (mode) != MODE_CC)
6418     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6419   return aarch64_get_condition_code_1 (mode, comp_code);
6420 }
6421
6422 static int
6423 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6424 {
6425   switch (mode)
6426     {
6427     case E_CCFPmode:
6428     case E_CCFPEmode:
6429       switch (comp_code)
6430         {
6431         case GE: return AARCH64_GE;
6432         case GT: return AARCH64_GT;
6433         case LE: return AARCH64_LS;
6434         case LT: return AARCH64_MI;
6435         case NE: return AARCH64_NE;
6436         case EQ: return AARCH64_EQ;
6437         case ORDERED: return AARCH64_VC;
6438         case UNORDERED: return AARCH64_VS;
6439         case UNLT: return AARCH64_LT;
6440         case UNLE: return AARCH64_LE;
6441         case UNGT: return AARCH64_HI;
6442         case UNGE: return AARCH64_PL;
6443         default: return -1;
6444         }
6445       break;
6446
6447     case E_CCmode:
6448       switch (comp_code)
6449         {
6450         case NE: return AARCH64_NE;
6451         case EQ: return AARCH64_EQ;
6452         case GE: return AARCH64_GE;
6453         case GT: return AARCH64_GT;
6454         case LE: return AARCH64_LE;
6455         case LT: return AARCH64_LT;
6456         case GEU: return AARCH64_CS;
6457         case GTU: return AARCH64_HI;
6458         case LEU: return AARCH64_LS;
6459         case LTU: return AARCH64_CC;
6460         default: return -1;
6461         }
6462       break;
6463
6464     case E_CC_SWPmode:
6465       switch (comp_code)
6466         {
6467         case NE: return AARCH64_NE;
6468         case EQ: return AARCH64_EQ;
6469         case GE: return AARCH64_LE;
6470         case GT: return AARCH64_LT;
6471         case LE: return AARCH64_GE;
6472         case LT: return AARCH64_GT;
6473         case GEU: return AARCH64_LS;
6474         case GTU: return AARCH64_CC;
6475         case LEU: return AARCH64_CS;
6476         case LTU: return AARCH64_HI;
6477         default: return -1;
6478         }
6479       break;
6480
6481     case E_CC_NZmode:
6482       switch (comp_code)
6483         {
6484         case NE: return AARCH64_NE;
6485         case EQ: return AARCH64_EQ;
6486         case GE: return AARCH64_PL;
6487         case LT: return AARCH64_MI;
6488         default: return -1;
6489         }
6490       break;
6491
6492     case E_CC_Zmode:
6493       switch (comp_code)
6494         {
6495         case NE: return AARCH64_NE;
6496         case EQ: return AARCH64_EQ;
6497         default: return -1;
6498         }
6499       break;
6500
6501     case E_CC_Cmode:
6502       switch (comp_code)
6503         {
6504         case NE: return AARCH64_CS;
6505         case EQ: return AARCH64_CC;
6506         default: return -1;
6507         }
6508       break;
6509
6510     default:
6511       return -1;
6512     }
6513
6514   return -1;
6515 }
6516
6517 bool
6518 aarch64_const_vec_all_same_in_range_p (rtx x,
6519                                        HOST_WIDE_INT minval,
6520                                        HOST_WIDE_INT maxval)
6521 {
6522   rtx elt;
6523   return (const_vec_duplicate_p (x, &elt)
6524           && CONST_INT_P (elt)
6525           && IN_RANGE (INTVAL (elt), minval, maxval));
6526 }
6527
6528 bool
6529 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6530 {
6531   return aarch64_const_vec_all_same_in_range_p (x, val, val);
6532 }
6533
6534 /* Return true if VEC is a constant in which every element is in the range
6535    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
6536
6537 static bool
6538 aarch64_const_vec_all_in_range_p (rtx vec,
6539                                   HOST_WIDE_INT minval,
6540                                   HOST_WIDE_INT maxval)
6541 {
6542   if (GET_CODE (vec) != CONST_VECTOR
6543       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6544     return false;
6545
6546   int nunits;
6547   if (!CONST_VECTOR_STEPPED_P (vec))
6548     nunits = const_vector_encoded_nelts (vec);
6549   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6550     return false;
6551
6552   for (int i = 0; i < nunits; i++)
6553     {
6554       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6555       if (!CONST_INT_P (vec_elem)
6556           || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6557         return false;
6558     }
6559   return true;
6560 }
6561
6562 /* N Z C V.  */
6563 #define AARCH64_CC_V 1
6564 #define AARCH64_CC_C (1 << 1)
6565 #define AARCH64_CC_Z (1 << 2)
6566 #define AARCH64_CC_N (1 << 3)
6567
6568 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
6569 static const int aarch64_nzcv_codes[] =
6570 {
6571   0,            /* EQ, Z == 1.  */
6572   AARCH64_CC_Z, /* NE, Z == 0.  */
6573   0,            /* CS, C == 1.  */
6574   AARCH64_CC_C, /* CC, C == 0.  */
6575   0,            /* MI, N == 1.  */
6576   AARCH64_CC_N, /* PL, N == 0.  */
6577   0,            /* VS, V == 1.  */
6578   AARCH64_CC_V, /* VC, V == 0.  */
6579   0,            /* HI, C ==1 && Z == 0.  */
6580   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
6581   AARCH64_CC_V, /* GE, N == V.  */
6582   0,            /* LT, N != V.  */
6583   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
6584   0,            /* LE, !(Z == 0 && N == V).  */
6585   0,            /* AL, Any.  */
6586   0             /* NV, Any.  */
6587 };
6588
6589 /* Print floating-point vector immediate operand X to F, negating it
6590    first if NEGATE is true.  Return true on success, false if it isn't
6591    a constant we can handle.  */
6592
6593 static bool
6594 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6595 {
6596   rtx elt;
6597
6598   if (!const_vec_duplicate_p (x, &elt))
6599     return false;
6600
6601   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6602   if (negate)
6603     r = real_value_negate (&r);
6604
6605   /* We only handle the SVE single-bit immediates here.  */
6606   if (real_equal (&r, &dconst0))
6607     asm_fprintf (f, "0.0");
6608   else if (real_equal (&r, &dconst1))
6609     asm_fprintf (f, "1.0");
6610   else if (real_equal (&r, &dconsthalf))
6611     asm_fprintf (f, "0.5");
6612   else
6613     return false;
6614
6615   return true;
6616 }
6617
6618 /* Return the equivalent letter for size.  */
6619 static char
6620 sizetochar (int size)
6621 {
6622   switch (size)
6623     {
6624     case 64: return 'd';
6625     case 32: return 's';
6626     case 16: return 'h';
6627     case 8 : return 'b';
6628     default: gcc_unreachable ();
6629     }
6630 }
6631
6632 /* Print operand X to file F in a target specific manner according to CODE.
6633    The acceptable formatting commands given by CODE are:
6634      'c':               An integer or symbol address without a preceding #
6635                         sign.
6636      'C':               Take the duplicated element in a vector constant
6637                         and print it in hex.
6638      'D':               Take the duplicated element in a vector constant
6639                         and print it as an unsigned integer, in decimal.
6640      'e':               Print the sign/zero-extend size as a character 8->b,
6641                         16->h, 32->w.
6642      'p':               Prints N such that 2^N == X (X must be power of 2 and
6643                         const int).
6644      'P':               Print the number of non-zero bits in X (a const_int).
6645      'H':               Print the higher numbered register of a pair (TImode)
6646                         of regs.
6647      'm':               Print a condition (eq, ne, etc).
6648      'M':               Same as 'm', but invert condition.
6649      'N':               Take the duplicated element in a vector constant
6650                         and print the negative of it in decimal.
6651      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
6652      'S/T/U/V':         Print a FP/SIMD register name for a register list.
6653                         The register printed is the FP/SIMD register name
6654                         of X + 0/1/2/3 for S/T/U/V.
6655      'R':               Print a scalar FP/SIMD register name + 1.
6656      'X':               Print bottom 16 bits of integer constant in hex.
6657      'w/x':             Print a general register name or the zero register
6658                         (32-bit or 64-bit).
6659      '0':               Print a normal operand, if it's a general register,
6660                         then we assume DImode.
6661      'k':               Print NZCV for conditional compare instructions.
6662      'A':               Output address constant representing the first
6663                         argument of X, specifying a relocation offset
6664                         if appropriate.
6665      'L':               Output constant address specified by X
6666                         with a relocation offset if appropriate.
6667      'G':               Prints address of X, specifying a PC relative
6668                         relocation mode if appropriate.
6669      'y':               Output address of LDP or STP - this is used for
6670                         some LDP/STPs which don't use a PARALLEL in their
6671                         pattern (so the mode needs to be adjusted).
6672      'z':               Output address of a typical LDP or STP.  */
6673
6674 static void
6675 aarch64_print_operand (FILE *f, rtx x, int code)
6676 {
6677   rtx elt;
6678   switch (code)
6679     {
6680     case 'c':
6681       switch (GET_CODE (x))
6682         {
6683         case CONST_INT:
6684           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6685           break;
6686
6687         case SYMBOL_REF:
6688           output_addr_const (f, x);
6689           break;
6690
6691         case CONST:
6692           if (GET_CODE (XEXP (x, 0)) == PLUS
6693               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6694             {
6695               output_addr_const (f, x);
6696               break;
6697             }
6698           /* Fall through.  */
6699
6700         default:
6701           output_operand_lossage ("unsupported operand for code '%c'", code);
6702         }
6703       break;
6704
6705     case 'e':
6706       {
6707         int n;
6708
6709         if (!CONST_INT_P (x)
6710             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6711           {
6712             output_operand_lossage ("invalid operand for '%%%c'", code);
6713             return;
6714           }
6715
6716         switch (n)
6717           {
6718           case 3:
6719             fputc ('b', f);
6720             break;
6721           case 4:
6722             fputc ('h', f);
6723             break;
6724           case 5:
6725             fputc ('w', f);
6726             break;
6727           default:
6728             output_operand_lossage ("invalid operand for '%%%c'", code);
6729             return;
6730           }
6731       }
6732       break;
6733
6734     case 'p':
6735       {
6736         int n;
6737
6738         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6739           {
6740             output_operand_lossage ("invalid operand for '%%%c'", code);
6741             return;
6742           }
6743
6744         asm_fprintf (f, "%d", n);
6745       }
6746       break;
6747
6748     case 'P':
6749       if (!CONST_INT_P (x))
6750         {
6751           output_operand_lossage ("invalid operand for '%%%c'", code);
6752           return;
6753         }
6754
6755       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6756       break;
6757
6758     case 'H':
6759       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6760         {
6761           output_operand_lossage ("invalid operand for '%%%c'", code);
6762           return;
6763         }
6764
6765       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6766       break;
6767
6768     case 'M':
6769     case 'm':
6770       {
6771         int cond_code;
6772         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
6773         if (x == const_true_rtx)
6774           {
6775             if (code == 'M')
6776               fputs ("nv", f);
6777             return;
6778           }
6779
6780         if (!COMPARISON_P (x))
6781           {
6782             output_operand_lossage ("invalid operand for '%%%c'", code);
6783             return;
6784           }
6785
6786         cond_code = aarch64_get_condition_code (x);
6787         gcc_assert (cond_code >= 0);
6788         if (code == 'M')
6789           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6790         fputs (aarch64_condition_codes[cond_code], f);
6791       }
6792       break;
6793
6794     case 'N':
6795       if (!const_vec_duplicate_p (x, &elt))
6796         {
6797           output_operand_lossage ("invalid vector constant");
6798           return;
6799         }
6800
6801       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6802         asm_fprintf (f, "%wd", -INTVAL (elt));
6803       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6804                && aarch64_print_vector_float_operand (f, x, true))
6805         ;
6806       else
6807         {
6808           output_operand_lossage ("invalid vector constant");
6809           return;
6810         }
6811       break;
6812
6813     case 'b':
6814     case 'h':
6815     case 's':
6816     case 'd':
6817     case 'q':
6818       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6819         {
6820           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6821           return;
6822         }
6823       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6824       break;
6825
6826     case 'S':
6827     case 'T':
6828     case 'U':
6829     case 'V':
6830       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6831         {
6832           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6833           return;
6834         }
6835       asm_fprintf (f, "%c%d",
6836                    aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6837                    REGNO (x) - V0_REGNUM + (code - 'S'));
6838       break;
6839
6840     case 'R':
6841       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6842         {
6843           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6844           return;
6845         }
6846       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6847       break;
6848
6849     case 'X':
6850       if (!CONST_INT_P (x))
6851         {
6852           output_operand_lossage ("invalid operand for '%%%c'", code);
6853           return;
6854         }
6855       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6856       break;
6857
6858     case 'C':
6859       {
6860         /* Print a replicated constant in hex.  */
6861         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6862           {
6863             output_operand_lossage ("invalid operand for '%%%c'", code);
6864             return;
6865           }
6866         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6867         asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6868       }
6869       break;
6870
6871     case 'D':
6872       {
6873         /* Print a replicated constant in decimal, treating it as
6874            unsigned.  */
6875         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6876           {
6877             output_operand_lossage ("invalid operand for '%%%c'", code);
6878             return;
6879           }
6880         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6881         asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6882       }
6883       break;
6884
6885     case 'w':
6886     case 'x':
6887       if (x == const0_rtx
6888           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6889         {
6890           asm_fprintf (f, "%czr", code);
6891           break;
6892         }
6893
6894       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6895         {
6896           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6897           break;
6898         }
6899
6900       if (REG_P (x) && REGNO (x) == SP_REGNUM)
6901         {
6902           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6903           break;
6904         }
6905
6906       /* Fall through */
6907
6908     case 0:
6909       if (x == NULL)
6910         {
6911           output_operand_lossage ("missing operand");
6912           return;
6913         }
6914
6915       switch (GET_CODE (x))
6916         {
6917         case REG:
6918           if (aarch64_sve_data_mode_p (GET_MODE (x)))
6919             {
6920               if (REG_NREGS (x) == 1)
6921                 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6922               else
6923                 {
6924                   char suffix
6925                     = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6926                   asm_fprintf (f, "{z%d.%c - z%d.%c}",
6927                                REGNO (x) - V0_REGNUM, suffix,
6928                                END_REGNO (x) - V0_REGNUM - 1, suffix);
6929                 }
6930             }
6931           else
6932             asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6933           break;
6934
6935         case MEM:
6936           output_address (GET_MODE (x), XEXP (x, 0));
6937           break;
6938
6939         case LABEL_REF:
6940         case SYMBOL_REF:
6941           output_addr_const (asm_out_file, x);
6942           break;
6943
6944         case CONST_INT:
6945           asm_fprintf (f, "%wd", INTVAL (x));
6946           break;
6947
6948         case CONST:
6949           if (!VECTOR_MODE_P (GET_MODE (x)))
6950             {
6951               output_addr_const (asm_out_file, x);
6952               break;
6953             }
6954           /* fall through */
6955
6956         case CONST_VECTOR:
6957           if (!const_vec_duplicate_p (x, &elt))
6958             {
6959               output_operand_lossage ("invalid vector constant");
6960               return;
6961             }
6962
6963           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6964             asm_fprintf (f, "%wd", INTVAL (elt));
6965           else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6966                    && aarch64_print_vector_float_operand (f, x, false))
6967             ;
6968           else
6969             {
6970               output_operand_lossage ("invalid vector constant");
6971               return;
6972             }
6973           break;
6974
6975         case CONST_DOUBLE:
6976           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6977              be getting CONST_DOUBLEs holding integers.  */
6978           gcc_assert (GET_MODE (x) != VOIDmode);
6979           if (aarch64_float_const_zero_rtx_p (x))
6980             {
6981               fputc ('0', f);
6982               break;
6983             }
6984           else if (aarch64_float_const_representable_p (x))
6985             {
6986 #define buf_size 20
6987               char float_buf[buf_size] = {'\0'};
6988               real_to_decimal_for_mode (float_buf,
6989                                         CONST_DOUBLE_REAL_VALUE (x),
6990                                         buf_size, buf_size,
6991                                         1, GET_MODE (x));
6992               asm_fprintf (asm_out_file, "%s", float_buf);
6993               break;
6994 #undef buf_size
6995             }
6996           output_operand_lossage ("invalid constant");
6997           return;
6998         default:
6999           output_operand_lossage ("invalid operand");
7000           return;
7001         }
7002       break;
7003
7004     case 'A':
7005       if (GET_CODE (x) == HIGH)
7006         x = XEXP (x, 0);
7007
7008       switch (aarch64_classify_symbolic_expression (x))
7009         {
7010         case SYMBOL_SMALL_GOT_4G:
7011           asm_fprintf (asm_out_file, ":got:");
7012           break;
7013
7014         case SYMBOL_SMALL_TLSGD:
7015           asm_fprintf (asm_out_file, ":tlsgd:");
7016           break;
7017
7018         case SYMBOL_SMALL_TLSDESC:
7019           asm_fprintf (asm_out_file, ":tlsdesc:");
7020           break;
7021
7022         case SYMBOL_SMALL_TLSIE:
7023           asm_fprintf (asm_out_file, ":gottprel:");
7024           break;
7025
7026         case SYMBOL_TLSLE24:
7027           asm_fprintf (asm_out_file, ":tprel:");
7028           break;
7029
7030         case SYMBOL_TINY_GOT:
7031           gcc_unreachable ();
7032           break;
7033
7034         default:
7035           break;
7036         }
7037       output_addr_const (asm_out_file, x);
7038       break;
7039
7040     case 'L':
7041       switch (aarch64_classify_symbolic_expression (x))
7042         {
7043         case SYMBOL_SMALL_GOT_4G:
7044           asm_fprintf (asm_out_file, ":lo12:");
7045           break;
7046
7047         case SYMBOL_SMALL_TLSGD:
7048           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7049           break;
7050
7051         case SYMBOL_SMALL_TLSDESC:
7052           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7053           break;
7054
7055         case SYMBOL_SMALL_TLSIE:
7056           asm_fprintf (asm_out_file, ":gottprel_lo12:");
7057           break;
7058
7059         case SYMBOL_TLSLE12:
7060           asm_fprintf (asm_out_file, ":tprel_lo12:");
7061           break;
7062
7063         case SYMBOL_TLSLE24:
7064           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7065           break;
7066
7067         case SYMBOL_TINY_GOT:
7068           asm_fprintf (asm_out_file, ":got:");
7069           break;
7070
7071         case SYMBOL_TINY_TLSIE:
7072           asm_fprintf (asm_out_file, ":gottprel:");
7073           break;
7074
7075         default:
7076           break;
7077         }
7078       output_addr_const (asm_out_file, x);
7079       break;
7080
7081     case 'G':
7082       switch (aarch64_classify_symbolic_expression (x))
7083         {
7084         case SYMBOL_TLSLE24:
7085           asm_fprintf (asm_out_file, ":tprel_hi12:");
7086           break;
7087         default:
7088           break;
7089         }
7090       output_addr_const (asm_out_file, x);
7091       break;
7092
7093     case 'k':
7094       {
7095         HOST_WIDE_INT cond_code;
7096
7097         if (!CONST_INT_P (x))
7098           {
7099             output_operand_lossage ("invalid operand for '%%%c'", code);
7100             return;
7101           }
7102
7103         cond_code = INTVAL (x);
7104         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7105         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7106       }
7107       break;
7108
7109     case 'y':
7110     case 'z':
7111       {
7112         machine_mode mode = GET_MODE (x);
7113
7114         if (GET_CODE (x) != MEM
7115             || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7116           {
7117             output_operand_lossage ("invalid operand for '%%%c'", code);
7118             return;
7119           }
7120
7121         if (code == 'y')
7122           /* LDP/STP which uses a single double-width memory operand.
7123              Adjust the mode to appear like a typical LDP/STP.
7124              Currently this is supported for 16-byte accesses only.  */
7125           mode = DFmode;
7126
7127         if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7128           output_operand_lossage ("invalid operand prefix '%%%c'", code);
7129       }
7130       break;
7131
7132     default:
7133       output_operand_lossage ("invalid operand prefix '%%%c'", code);
7134       return;
7135     }
7136 }
7137
7138 /* Print address 'x' of a memory access with mode 'mode'.
7139    'op' is the context required by aarch64_classify_address.  It can either be
7140    MEM for a normal memory access or PARALLEL for LDP/STP.  */
7141 static bool
7142 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7143                                 aarch64_addr_query_type type)
7144 {
7145   struct aarch64_address_info addr;
7146   unsigned int size;
7147
7148   /* Check all addresses are Pmode - including ILP32.  */
7149   if (GET_MODE (x) != Pmode)
7150     output_operand_lossage ("invalid address mode");
7151
7152   if (aarch64_classify_address (&addr, x, mode, true, type))
7153     switch (addr.type)
7154       {
7155       case ADDRESS_REG_IMM:
7156         if (known_eq (addr.const_offset, 0))
7157           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7158         else if (aarch64_sve_data_mode_p (mode))
7159           {
7160             HOST_WIDE_INT vnum
7161               = exact_div (addr.const_offset,
7162                            BYTES_PER_SVE_VECTOR).to_constant ();
7163             asm_fprintf (f, "[%s, #%wd, mul vl]",
7164                          reg_names[REGNO (addr.base)], vnum);
7165           }
7166         else if (aarch64_sve_pred_mode_p (mode))
7167           {
7168             HOST_WIDE_INT vnum
7169               = exact_div (addr.const_offset,
7170                            BYTES_PER_SVE_PRED).to_constant ();
7171             asm_fprintf (f, "[%s, #%wd, mul vl]",
7172                          reg_names[REGNO (addr.base)], vnum);
7173           }
7174         else
7175           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7176                        INTVAL (addr.offset));
7177         return true;
7178
7179       case ADDRESS_REG_REG:
7180         if (addr.shift == 0)
7181           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7182                        reg_names [REGNO (addr.offset)]);
7183         else
7184           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7185                        reg_names [REGNO (addr.offset)], addr.shift);
7186         return true;
7187
7188       case ADDRESS_REG_UXTW:
7189         if (addr.shift == 0)
7190           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7191                        REGNO (addr.offset) - R0_REGNUM);
7192         else
7193           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7194                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
7195         return true;
7196
7197       case ADDRESS_REG_SXTW:
7198         if (addr.shift == 0)
7199           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7200                        REGNO (addr.offset) - R0_REGNUM);
7201         else
7202           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7203                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
7204         return true;
7205
7206       case ADDRESS_REG_WB:
7207         /* Writeback is only supported for fixed-width modes.  */
7208         size = GET_MODE_SIZE (mode).to_constant ();
7209         switch (GET_CODE (x))
7210           {
7211           case PRE_INC:
7212             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7213             return true;
7214           case POST_INC:
7215             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7216             return true;
7217           case PRE_DEC:
7218             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7219             return true;
7220           case POST_DEC:
7221             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7222             return true;
7223           case PRE_MODIFY:
7224             asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7225                          INTVAL (addr.offset));
7226             return true;
7227           case POST_MODIFY:
7228             asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7229                          INTVAL (addr.offset));
7230             return true;
7231           default:
7232             break;
7233           }
7234         break;
7235
7236       case ADDRESS_LO_SUM:
7237         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7238         output_addr_const (f, addr.offset);
7239         asm_fprintf (f, "]");
7240         return true;
7241
7242       case ADDRESS_SYMBOLIC:
7243         output_addr_const (f, x);
7244         return true;
7245       }
7246
7247   return false;
7248 }
7249
7250 /* Print address 'x' of a LDP/STP with mode 'mode'.  */
7251 static bool
7252 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7253 {
7254   return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7255 }
7256
7257 /* Print address 'x' of a memory access with mode 'mode'.  */
7258 static void
7259 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7260 {
7261   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7262     output_addr_const (f, x);
7263 }
7264
7265 bool
7266 aarch64_label_mentioned_p (rtx x)
7267 {
7268   const char *fmt;
7269   int i;
7270
7271   if (GET_CODE (x) == LABEL_REF)
7272     return true;
7273
7274   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7275      referencing instruction, but they are constant offsets, not
7276      symbols.  */
7277   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7278     return false;
7279
7280   fmt = GET_RTX_FORMAT (GET_CODE (x));
7281   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7282     {
7283       if (fmt[i] == 'E')
7284         {
7285           int j;
7286
7287           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7288             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7289               return 1;
7290         }
7291       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7292         return 1;
7293     }
7294
7295   return 0;
7296 }
7297
7298 /* Implement REGNO_REG_CLASS.  */
7299
7300 enum reg_class
7301 aarch64_regno_regclass (unsigned regno)
7302 {
7303   if (GP_REGNUM_P (regno))
7304     return GENERAL_REGS;
7305
7306   if (regno == SP_REGNUM)
7307     return STACK_REG;
7308
7309   if (regno == FRAME_POINTER_REGNUM
7310       || regno == ARG_POINTER_REGNUM)
7311     return POINTER_REGS;
7312
7313   if (FP_REGNUM_P (regno))
7314     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
7315
7316   if (PR_REGNUM_P (regno))
7317     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7318
7319   return NO_REGS;
7320 }
7321
7322 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7323    If OFFSET is out of range, return an offset of an anchor point
7324    that is in range.  Return 0 otherwise.  */
7325
7326 static HOST_WIDE_INT
7327 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7328                        machine_mode mode)
7329 {
7330   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
7331   if (size > 16)
7332     return (offset + 0x400) & ~0x7f0;
7333
7334   /* For offsets that aren't a multiple of the access size, the limit is
7335      -256...255.  */
7336   if (offset & (size - 1))
7337     {
7338       /* BLKmode typically uses LDP of X-registers.  */
7339       if (mode == BLKmode)
7340         return (offset + 512) & ~0x3ff;
7341       return (offset + 0x100) & ~0x1ff;
7342     }
7343
7344   /* Small negative offsets are supported.  */
7345   if (IN_RANGE (offset, -256, 0))
7346     return 0;
7347
7348   if (mode == TImode || mode == TFmode)
7349     return (offset + 0x100) & ~0x1ff;
7350
7351   /* Use 12-bit offset by access size.  */
7352   return offset & (~0xfff * size);
7353 }
7354
7355 static rtx
7356 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
7357 {
7358   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7359      where mask is selected by alignment and size of the offset.
7360      We try to pick as large a range for the offset as possible to
7361      maximize the chance of a CSE.  However, for aligned addresses
7362      we limit the range to 4k so that structures with different sized
7363      elements are likely to use the same base.  We need to be careful
7364      not to split a CONST for some forms of address expression, otherwise
7365      it will generate sub-optimal code.  */
7366
7367   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7368     {
7369       rtx base = XEXP (x, 0);
7370       rtx offset_rtx = XEXP (x, 1);
7371       HOST_WIDE_INT offset = INTVAL (offset_rtx);
7372
7373       if (GET_CODE (base) == PLUS)
7374         {
7375           rtx op0 = XEXP (base, 0);
7376           rtx op1 = XEXP (base, 1);
7377
7378           /* Force any scaling into a temp for CSE.  */
7379           op0 = force_reg (Pmode, op0);
7380           op1 = force_reg (Pmode, op1);
7381
7382           /* Let the pointer register be in op0.  */
7383           if (REG_POINTER (op1))
7384             std::swap (op0, op1);
7385
7386           /* If the pointer is virtual or frame related, then we know that
7387              virtual register instantiation or register elimination is going
7388              to apply a second constant.  We want the two constants folded
7389              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
7390           if (virt_or_elim_regno_p (REGNO (op0)))
7391             {
7392               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7393                                    NULL_RTX, true, OPTAB_DIRECT);
7394               return gen_rtx_PLUS (Pmode, base, op1);
7395             }
7396
7397           /* Otherwise, in order to encourage CSE (and thence loop strength
7398              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
7399           base = expand_binop (Pmode, add_optab, op0, op1,
7400                                NULL_RTX, true, OPTAB_DIRECT);
7401           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7402         }
7403
7404       HOST_WIDE_INT size;
7405       if (GET_MODE_SIZE (mode).is_constant (&size))
7406         {
7407           HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7408                                                              mode);
7409           if (base_offset != 0)
7410             {
7411               base = plus_constant (Pmode, base, base_offset);
7412               base = force_operand (base, NULL_RTX);
7413               return plus_constant (Pmode, base, offset - base_offset);
7414             }
7415         }
7416     }
7417
7418   return x;
7419 }
7420
7421 /* Return the reload icode required for a constant pool in mode.  */
7422 static enum insn_code
7423 aarch64_constant_pool_reload_icode (machine_mode mode)
7424 {
7425   switch (mode)
7426     {
7427     case E_SFmode:
7428       return CODE_FOR_aarch64_reload_movcpsfdi;
7429
7430     case E_DFmode:
7431       return CODE_FOR_aarch64_reload_movcpdfdi;
7432
7433     case E_TFmode:
7434       return CODE_FOR_aarch64_reload_movcptfdi;
7435
7436     case E_V8QImode:
7437       return CODE_FOR_aarch64_reload_movcpv8qidi;
7438
7439     case E_V16QImode:
7440       return CODE_FOR_aarch64_reload_movcpv16qidi;
7441
7442     case E_V4HImode:
7443       return CODE_FOR_aarch64_reload_movcpv4hidi;
7444
7445     case E_V8HImode:
7446       return CODE_FOR_aarch64_reload_movcpv8hidi;
7447
7448     case E_V2SImode:
7449       return CODE_FOR_aarch64_reload_movcpv2sidi;
7450
7451     case E_V4SImode:
7452       return CODE_FOR_aarch64_reload_movcpv4sidi;
7453
7454     case E_V2DImode:
7455       return CODE_FOR_aarch64_reload_movcpv2didi;
7456
7457     case E_V2DFmode:
7458       return CODE_FOR_aarch64_reload_movcpv2dfdi;
7459
7460     default:
7461       gcc_unreachable ();
7462     }
7463
7464   gcc_unreachable ();
7465 }
7466 static reg_class_t
7467 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7468                           reg_class_t rclass,
7469                           machine_mode mode,
7470                           secondary_reload_info *sri)
7471 {
7472   /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7473      directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
7474      comment at the head of aarch64-sve.md for more details about the
7475      big-endian handling.  */
7476   if (BYTES_BIG_ENDIAN
7477       && reg_class_subset_p (rclass, FP_REGS)
7478       && !((REG_P (x) && HARD_REGISTER_P (x))
7479            || aarch64_simd_valid_immediate (x, NULL))
7480       && aarch64_sve_data_mode_p (mode))
7481     {
7482       sri->icode = CODE_FOR_aarch64_sve_reload_be;
7483       return NO_REGS;
7484     }
7485
7486   /* If we have to disable direct literal pool loads and stores because the
7487      function is too big, then we need a scratch register.  */
7488   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7489       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7490           || targetm.vector_mode_supported_p (GET_MODE (x)))
7491       && !aarch64_pcrelative_literal_loads)
7492     {
7493       sri->icode = aarch64_constant_pool_reload_icode (mode);
7494       return NO_REGS;
7495     }
7496
7497   /* Without the TARGET_SIMD instructions we cannot move a Q register
7498      to a Q register directly.  We need a scratch.  */
7499   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7500       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7501       && reg_class_subset_p (rclass, FP_REGS))
7502     {
7503       if (mode == TFmode)
7504         sri->icode = CODE_FOR_aarch64_reload_movtf;
7505       else if (mode == TImode)
7506         sri->icode = CODE_FOR_aarch64_reload_movti;
7507       return NO_REGS;
7508     }
7509
7510   /* A TFmode or TImode memory access should be handled via an FP_REGS
7511      because AArch64 has richer addressing modes for LDR/STR instructions
7512      than LDP/STP instructions.  */
7513   if (TARGET_FLOAT && rclass == GENERAL_REGS
7514       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7515     return FP_REGS;
7516
7517   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7518       return GENERAL_REGS;
7519
7520   return NO_REGS;
7521 }
7522
7523 static bool
7524 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7525 {
7526   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7527
7528   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7529      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
7530   if (frame_pointer_needed)
7531     return to == HARD_FRAME_POINTER_REGNUM;
7532   return true;
7533 }
7534
7535 poly_int64
7536 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7537 {
7538   aarch64_layout_frame ();
7539
7540   if (to == HARD_FRAME_POINTER_REGNUM)
7541     {
7542       if (from == ARG_POINTER_REGNUM)
7543         return cfun->machine->frame.hard_fp_offset;
7544
7545       if (from == FRAME_POINTER_REGNUM)
7546         return cfun->machine->frame.hard_fp_offset
7547                - cfun->machine->frame.locals_offset;
7548     }
7549
7550   if (to == STACK_POINTER_REGNUM)
7551     {
7552       if (from == FRAME_POINTER_REGNUM)
7553           return cfun->machine->frame.frame_size
7554                  - cfun->machine->frame.locals_offset;
7555     }
7556
7557   return cfun->machine->frame.frame_size;
7558 }
7559
7560 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
7561    previous frame.  */
7562
7563 rtx
7564 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7565 {
7566   if (count != 0)
7567     return const0_rtx;
7568   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7569 }
7570
7571
7572 static void
7573 aarch64_asm_trampoline_template (FILE *f)
7574 {
7575   if (TARGET_ILP32)
7576     {
7577       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7578       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7579     }
7580   else
7581     {
7582       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7583       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7584     }
7585   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7586   assemble_aligned_integer (4, const0_rtx);
7587   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7588   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7589 }
7590
7591 static void
7592 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7593 {
7594   rtx fnaddr, mem, a_tramp;
7595   const int tramp_code_sz = 16;
7596
7597   /* Don't need to copy the trailing D-words, we fill those in below.  */
7598   emit_block_move (m_tramp, assemble_trampoline_template (),
7599                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7600   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7601   fnaddr = XEXP (DECL_RTL (fndecl), 0);
7602   if (GET_MODE (fnaddr) != ptr_mode)
7603     fnaddr = convert_memory_address (ptr_mode, fnaddr);
7604   emit_move_insn (mem, fnaddr);
7605
7606   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7607   emit_move_insn (mem, chain_value);
7608
7609   /* XXX We should really define a "clear_cache" pattern and use
7610      gen_clear_cache().  */
7611   a_tramp = XEXP (m_tramp, 0);
7612   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7613                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7614                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7615                      ptr_mode);
7616 }
7617
7618 static unsigned char
7619 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7620 {
7621   /* ??? Logically we should only need to provide a value when
7622      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7623      can hold MODE, but at the moment we need to handle all modes.
7624      Just ignore any runtime parts for registers that can't store them.  */
7625   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7626   unsigned int nregs;
7627   switch (regclass)
7628     {
7629     case TAILCALL_ADDR_REGS:
7630     case POINTER_REGS:
7631     case GENERAL_REGS:
7632     case ALL_REGS:
7633     case POINTER_AND_FP_REGS:
7634     case FP_REGS:
7635     case FP_LO_REGS:
7636       if (aarch64_sve_data_mode_p (mode)
7637           && constant_multiple_p (GET_MODE_SIZE (mode),
7638                                   BYTES_PER_SVE_VECTOR, &nregs))
7639         return nregs;
7640       return (aarch64_vector_data_mode_p (mode)
7641               ? CEIL (lowest_size, UNITS_PER_VREG)
7642               : CEIL (lowest_size, UNITS_PER_WORD));
7643     case STACK_REG:
7644     case PR_REGS:
7645     case PR_LO_REGS:
7646     case PR_HI_REGS:
7647       return 1;
7648
7649     case NO_REGS:
7650       return 0;
7651
7652     default:
7653       break;
7654     }
7655   gcc_unreachable ();
7656 }
7657
7658 static reg_class_t
7659 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7660 {
7661   if (regclass == POINTER_REGS)
7662     return GENERAL_REGS;
7663
7664   if (regclass == STACK_REG)
7665     {
7666       if (REG_P(x)
7667           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7668           return regclass;
7669
7670       return NO_REGS;
7671     }
7672
7673   /* Register eliminiation can result in a request for
7674      SP+constant->FP_REGS.  We cannot support such operations which
7675      use SP as source and an FP_REG as destination, so reject out
7676      right now.  */
7677   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7678     {
7679       rtx lhs = XEXP (x, 0);
7680
7681       /* Look through a possible SUBREG introduced by ILP32.  */
7682       if (GET_CODE (lhs) == SUBREG)
7683         lhs = SUBREG_REG (lhs);
7684
7685       gcc_assert (REG_P (lhs));
7686       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7687                                       POINTER_REGS));
7688       return NO_REGS;
7689     }
7690
7691   return regclass;
7692 }
7693
7694 void
7695 aarch64_asm_output_labelref (FILE* f, const char *name)
7696 {
7697   asm_fprintf (f, "%U%s", name);
7698 }
7699
7700 static void
7701 aarch64_elf_asm_constructor (rtx symbol, int priority)
7702 {
7703   if (priority == DEFAULT_INIT_PRIORITY)
7704     default_ctor_section_asm_out_constructor (symbol, priority);
7705   else
7706     {
7707       section *s;
7708       /* While priority is known to be in range [0, 65535], so 18 bytes
7709          would be enough, the compiler might not know that.  To avoid
7710          -Wformat-truncation false positive, use a larger size.  */
7711       char buf[23];
7712       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7713       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7714       switch_to_section (s);
7715       assemble_align (POINTER_SIZE);
7716       assemble_aligned_integer (POINTER_BYTES, symbol);
7717     }
7718 }
7719
7720 static void
7721 aarch64_elf_asm_destructor (rtx symbol, int priority)
7722 {
7723   if (priority == DEFAULT_INIT_PRIORITY)
7724     default_dtor_section_asm_out_destructor (symbol, priority);
7725   else
7726     {
7727       section *s;
7728       /* While priority is known to be in range [0, 65535], so 18 bytes
7729          would be enough, the compiler might not know that.  To avoid
7730          -Wformat-truncation false positive, use a larger size.  */
7731       char buf[23];
7732       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7733       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7734       switch_to_section (s);
7735       assemble_align (POINTER_SIZE);
7736       assemble_aligned_integer (POINTER_BYTES, symbol);
7737     }
7738 }
7739
7740 const char*
7741 aarch64_output_casesi (rtx *operands)
7742 {
7743   char buf[100];
7744   char label[100];
7745   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7746   int index;
7747   static const char *const patterns[4][2] =
7748   {
7749     {
7750       "ldrb\t%w3, [%0,%w1,uxtw]",
7751       "add\t%3, %4, %w3, sxtb #2"
7752     },
7753     {
7754       "ldrh\t%w3, [%0,%w1,uxtw #1]",
7755       "add\t%3, %4, %w3, sxth #2"
7756     },
7757     {
7758       "ldr\t%w3, [%0,%w1,uxtw #2]",
7759       "add\t%3, %4, %w3, sxtw #2"
7760     },
7761     /* We assume that DImode is only generated when not optimizing and
7762        that we don't really need 64-bit address offsets.  That would
7763        imply an object file with 8GB of code in a single function!  */
7764     {
7765       "ldr\t%w3, [%0,%w1,uxtw #2]",
7766       "add\t%3, %4, %w3, sxtw #2"
7767     }
7768   };
7769
7770   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7771
7772   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7773   index = exact_log2 (GET_MODE_SIZE (mode));
7774
7775   gcc_assert (index >= 0 && index <= 3);
7776
7777   /* Need to implement table size reduction, by chaning the code below.  */
7778   output_asm_insn (patterns[index][0], operands);
7779   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7780   snprintf (buf, sizeof (buf),
7781             "adr\t%%4, %s", targetm.strip_name_encoding (label));
7782   output_asm_insn (buf, operands);
7783   output_asm_insn (patterns[index][1], operands);
7784   output_asm_insn ("br\t%3", operands);
7785   assemble_label (asm_out_file, label);
7786   return "";
7787 }
7788
7789
7790 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7791    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7792    operator.  */
7793
7794 int
7795 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7796 {
7797   if (shift >= 0 && shift <= 3)
7798     {
7799       int size;
7800       for (size = 8; size <= 32; size *= 2)
7801         {
7802           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7803           if (mask == bits << shift)
7804             return size;
7805         }
7806     }
7807   return 0;
7808 }
7809
7810 /* Constant pools are per function only when PC relative
7811    literal loads are true or we are in the large memory
7812    model.  */
7813
7814 static inline bool
7815 aarch64_can_use_per_function_literal_pools_p (void)
7816 {
7817   return (aarch64_pcrelative_literal_loads
7818           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7819 }
7820
7821 static bool
7822 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7823 {
7824   /* We can't use blocks for constants when we're using a per-function
7825      constant pool.  */
7826   return !aarch64_can_use_per_function_literal_pools_p ();
7827 }
7828
7829 /* Select appropriate section for constants depending
7830    on where we place literal pools.  */
7831
7832 static section *
7833 aarch64_select_rtx_section (machine_mode mode,
7834                             rtx x,
7835                             unsigned HOST_WIDE_INT align)
7836 {
7837   if (aarch64_can_use_per_function_literal_pools_p ())
7838     return function_section (current_function_decl);
7839
7840   return default_elf_select_rtx_section (mode, x, align);
7841 }
7842
7843 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
7844 void
7845 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7846                                   HOST_WIDE_INT offset)
7847 {
7848   /* When using per-function literal pools, we must ensure that any code
7849      section is aligned to the minimal instruction length, lest we get
7850      errors from the assembler re "unaligned instructions".  */
7851   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7852     ASM_OUTPUT_ALIGN (f, 2);
7853 }
7854
7855 /* Costs.  */
7856
7857 /* Helper function for rtx cost calculation.  Strip a shift expression
7858    from X.  Returns the inner operand if successful, or the original
7859    expression on failure.  */
7860 static rtx
7861 aarch64_strip_shift (rtx x)
7862 {
7863   rtx op = x;
7864
7865   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7866      we can convert both to ROR during final output.  */
7867   if ((GET_CODE (op) == ASHIFT
7868        || GET_CODE (op) == ASHIFTRT
7869        || GET_CODE (op) == LSHIFTRT
7870        || GET_CODE (op) == ROTATERT
7871        || GET_CODE (op) == ROTATE)
7872       && CONST_INT_P (XEXP (op, 1)))
7873     return XEXP (op, 0);
7874
7875   if (GET_CODE (op) == MULT
7876       && CONST_INT_P (XEXP (op, 1))
7877       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7878     return XEXP (op, 0);
7879
7880   return x;
7881 }
7882
7883 /* Helper function for rtx cost calculation.  Strip an extend
7884    expression from X.  Returns the inner operand if successful, or the
7885    original expression on failure.  We deal with a number of possible
7886    canonicalization variations here. If STRIP_SHIFT is true, then
7887    we can strip off a shift also.  */
7888 static rtx
7889 aarch64_strip_extend (rtx x, bool strip_shift)
7890 {
7891   scalar_int_mode mode;
7892   rtx op = x;
7893
7894   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7895     return op;
7896
7897   /* Zero and sign extraction of a widened value.  */
7898   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7899       && XEXP (op, 2) == const0_rtx
7900       && GET_CODE (XEXP (op, 0)) == MULT
7901       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7902                                          XEXP (op, 1)))
7903     return XEXP (XEXP (op, 0), 0);
7904
7905   /* It can also be represented (for zero-extend) as an AND with an
7906      immediate.  */
7907   if (GET_CODE (op) == AND
7908       && GET_CODE (XEXP (op, 0)) == MULT
7909       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7910       && CONST_INT_P (XEXP (op, 1))
7911       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7912                            INTVAL (XEXP (op, 1))) != 0)
7913     return XEXP (XEXP (op, 0), 0);
7914
7915   /* Now handle extended register, as this may also have an optional
7916      left shift by 1..4.  */
7917   if (strip_shift
7918       && GET_CODE (op) == ASHIFT
7919       && CONST_INT_P (XEXP (op, 1))
7920       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7921     op = XEXP (op, 0);
7922
7923   if (GET_CODE (op) == ZERO_EXTEND
7924       || GET_CODE (op) == SIGN_EXTEND)
7925     op = XEXP (op, 0);
7926
7927   if (op != x)
7928     return op;
7929
7930   return x;
7931 }
7932
7933 /* Return true iff CODE is a shift supported in combination
7934    with arithmetic instructions.  */
7935
7936 static bool
7937 aarch64_shift_p (enum rtx_code code)
7938 {
7939   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7940 }
7941
7942
7943 /* Return true iff X is a cheap shift without a sign extend. */
7944
7945 static bool
7946 aarch64_cheap_mult_shift_p (rtx x)
7947 {
7948   rtx op0, op1;
7949
7950   op0 = XEXP (x, 0);
7951   op1 = XEXP (x, 1);
7952
7953   if (!(aarch64_tune_params.extra_tuning_flags
7954                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7955     return false;
7956
7957   if (GET_CODE (op0) == SIGN_EXTEND)
7958     return false;
7959
7960   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7961       && UINTVAL (op1) <= 4)
7962     return true;
7963
7964   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7965     return false;
7966
7967   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7968
7969   if (l2 > 0 && l2 <= 4)
7970     return true;
7971
7972   return false;
7973 }
7974
7975 /* Helper function for rtx cost calculation.  Calculate the cost of
7976    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7977    Return the calculated cost of the expression, recursing manually in to
7978    operands where needed.  */
7979
7980 static int
7981 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7982 {
7983   rtx op0, op1;
7984   const struct cpu_cost_table *extra_cost
7985     = aarch64_tune_params.insn_extra_cost;
7986   int cost = 0;
7987   bool compound_p = (outer == PLUS || outer == MINUS);
7988   machine_mode mode = GET_MODE (x);
7989
7990   gcc_checking_assert (code == MULT);
7991
7992   op0 = XEXP (x, 0);
7993   op1 = XEXP (x, 1);
7994
7995   if (VECTOR_MODE_P (mode))
7996     mode = GET_MODE_INNER (mode);
7997
7998   /* Integer multiply/fma.  */
7999   if (GET_MODE_CLASS (mode) == MODE_INT)
8000     {
8001       /* The multiply will be canonicalized as a shift, cost it as such.  */
8002       if (aarch64_shift_p (GET_CODE (x))
8003           || (CONST_INT_P (op1)
8004               && exact_log2 (INTVAL (op1)) > 0))
8005         {
8006           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8007                            || GET_CODE (op0) == SIGN_EXTEND;
8008           if (speed)
8009             {
8010               if (compound_p)
8011                 {
8012                   /* If the shift is considered cheap,
8013                      then don't add any cost. */
8014                   if (aarch64_cheap_mult_shift_p (x))
8015                     ;
8016                   else if (REG_P (op1))
8017                     /* ARITH + shift-by-register.  */
8018                     cost += extra_cost->alu.arith_shift_reg;
8019                   else if (is_extend)
8020                     /* ARITH + extended register.  We don't have a cost field
8021                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
8022                     cost += extra_cost->alu.extend_arith;
8023                   else
8024                     /* ARITH + shift-by-immediate.  */
8025                     cost += extra_cost->alu.arith_shift;
8026                 }
8027               else
8028                 /* LSL (immediate).  */
8029                 cost += extra_cost->alu.shift;
8030
8031             }
8032           /* Strip extends as we will have costed them in the case above.  */
8033           if (is_extend)
8034             op0 = aarch64_strip_extend (op0, true);
8035
8036           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8037
8038           return cost;
8039         }
8040
8041       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
8042          compound and let the below cases handle it.  After all, MNEG is a
8043          special-case alias of MSUB.  */
8044       if (GET_CODE (op0) == NEG)
8045         {
8046           op0 = XEXP (op0, 0);
8047           compound_p = true;
8048         }
8049
8050       /* Integer multiplies or FMAs have zero/sign extending variants.  */
8051       if ((GET_CODE (op0) == ZERO_EXTEND
8052            && GET_CODE (op1) == ZERO_EXTEND)
8053           || (GET_CODE (op0) == SIGN_EXTEND
8054               && GET_CODE (op1) == SIGN_EXTEND))
8055         {
8056           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8057           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8058
8059           if (speed)
8060             {
8061               if (compound_p)
8062                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
8063                 cost += extra_cost->mult[0].extend_add;
8064               else
8065                 /* MUL/SMULL/UMULL.  */
8066                 cost += extra_cost->mult[0].extend;
8067             }
8068
8069           return cost;
8070         }
8071
8072       /* This is either an integer multiply or a MADD.  In both cases
8073          we want to recurse and cost the operands.  */
8074       cost += rtx_cost (op0, mode, MULT, 0, speed);
8075       cost += rtx_cost (op1, mode, MULT, 1, speed);
8076
8077       if (speed)
8078         {
8079           if (compound_p)
8080             /* MADD/MSUB.  */
8081             cost += extra_cost->mult[mode == DImode].add;
8082           else
8083             /* MUL.  */
8084             cost += extra_cost->mult[mode == DImode].simple;
8085         }
8086
8087       return cost;
8088     }
8089   else
8090     {
8091       if (speed)
8092         {
8093           /* Floating-point FMA/FMUL can also support negations of the
8094              operands, unless the rounding mode is upward or downward in
8095              which case FNMUL is different than FMUL with operand negation.  */
8096           bool neg0 = GET_CODE (op0) == NEG;
8097           bool neg1 = GET_CODE (op1) == NEG;
8098           if (compound_p || !flag_rounding_math || (neg0 && neg1))
8099             {
8100               if (neg0)
8101                 op0 = XEXP (op0, 0);
8102               if (neg1)
8103                 op1 = XEXP (op1, 0);
8104             }
8105
8106           if (compound_p)
8107             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
8108             cost += extra_cost->fp[mode == DFmode].fma;
8109           else
8110             /* FMUL/FNMUL.  */
8111             cost += extra_cost->fp[mode == DFmode].mult;
8112         }
8113
8114       cost += rtx_cost (op0, mode, MULT, 0, speed);
8115       cost += rtx_cost (op1, mode, MULT, 1, speed);
8116       return cost;
8117     }
8118 }
8119
8120 static int
8121 aarch64_address_cost (rtx x,
8122                       machine_mode mode,
8123                       addr_space_t as ATTRIBUTE_UNUSED,
8124                       bool speed)
8125 {
8126   enum rtx_code c = GET_CODE (x);
8127   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8128   struct aarch64_address_info info;
8129   int cost = 0;
8130   info.shift = 0;
8131
8132   if (!aarch64_classify_address (&info, x, mode, false))
8133     {
8134       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8135         {
8136           /* This is a CONST or SYMBOL ref which will be split
8137              in a different way depending on the code model in use.
8138              Cost it through the generic infrastructure.  */
8139           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8140           /* Divide through by the cost of one instruction to
8141              bring it to the same units as the address costs.  */
8142           cost_symbol_ref /= COSTS_N_INSNS (1);
8143           /* The cost is then the cost of preparing the address,
8144              followed by an immediate (possibly 0) offset.  */
8145           return cost_symbol_ref + addr_cost->imm_offset;
8146         }
8147       else
8148         {
8149           /* This is most likely a jump table from a case
8150              statement.  */
8151           return addr_cost->register_offset;
8152         }
8153     }
8154
8155   switch (info.type)
8156     {
8157       case ADDRESS_LO_SUM:
8158       case ADDRESS_SYMBOLIC:
8159       case ADDRESS_REG_IMM:
8160         cost += addr_cost->imm_offset;
8161         break;
8162
8163       case ADDRESS_REG_WB:
8164         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8165           cost += addr_cost->pre_modify;
8166         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8167           cost += addr_cost->post_modify;
8168         else
8169           gcc_unreachable ();
8170
8171         break;
8172
8173       case ADDRESS_REG_REG:
8174         cost += addr_cost->register_offset;
8175         break;
8176
8177       case ADDRESS_REG_SXTW:
8178         cost += addr_cost->register_sextend;
8179         break;
8180
8181       case ADDRESS_REG_UXTW:
8182         cost += addr_cost->register_zextend;
8183         break;
8184
8185       default:
8186         gcc_unreachable ();
8187     }
8188
8189
8190   if (info.shift > 0)
8191     {
8192       /* For the sake of calculating the cost of the shifted register
8193          component, we can treat same sized modes in the same way.  */
8194       if (known_eq (GET_MODE_BITSIZE (mode), 16))
8195         cost += addr_cost->addr_scale_costs.hi;
8196       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8197         cost += addr_cost->addr_scale_costs.si;
8198       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8199         cost += addr_cost->addr_scale_costs.di;
8200       else
8201         /* We can't tell, or this is a 128-bit vector.  */
8202         cost += addr_cost->addr_scale_costs.ti;
8203     }
8204
8205   return cost;
8206 }
8207
8208 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
8209    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
8210    to be taken.  */
8211
8212 int
8213 aarch64_branch_cost (bool speed_p, bool predictable_p)
8214 {
8215   /* When optimizing for speed, use the cost of unpredictable branches.  */
8216   const struct cpu_branch_cost *branch_costs =
8217     aarch64_tune_params.branch_costs;
8218
8219   if (!speed_p || predictable_p)
8220     return branch_costs->predictable;
8221   else
8222     return branch_costs->unpredictable;
8223 }
8224
8225 /* Return true if the RTX X in mode MODE is a zero or sign extract
8226    usable in an ADD or SUB (extended register) instruction.  */
8227 static bool
8228 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8229 {
8230   /* Catch add with a sign extract.
8231      This is add_<optab><mode>_multp2.  */
8232   if (GET_CODE (x) == SIGN_EXTRACT
8233       || GET_CODE (x) == ZERO_EXTRACT)
8234     {
8235       rtx op0 = XEXP (x, 0);
8236       rtx op1 = XEXP (x, 1);
8237       rtx op2 = XEXP (x, 2);
8238
8239       if (GET_CODE (op0) == MULT
8240           && CONST_INT_P (op1)
8241           && op2 == const0_rtx
8242           && CONST_INT_P (XEXP (op0, 1))
8243           && aarch64_is_extend_from_extract (mode,
8244                                              XEXP (op0, 1),
8245                                              op1))
8246         {
8247           return true;
8248         }
8249     }
8250   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8251      No shift.  */
8252   else if (GET_CODE (x) == SIGN_EXTEND
8253            || GET_CODE (x) == ZERO_EXTEND)
8254     return REG_P (XEXP (x, 0));
8255
8256   return false;
8257 }
8258
8259 static bool
8260 aarch64_frint_unspec_p (unsigned int u)
8261 {
8262   switch (u)
8263     {
8264       case UNSPEC_FRINTZ:
8265       case UNSPEC_FRINTP:
8266       case UNSPEC_FRINTM:
8267       case UNSPEC_FRINTA:
8268       case UNSPEC_FRINTN:
8269       case UNSPEC_FRINTX:
8270       case UNSPEC_FRINTI:
8271         return true;
8272
8273       default:
8274         return false;
8275     }
8276 }
8277
8278 /* Return true iff X is an rtx that will match an extr instruction
8279    i.e. as described in the *extr<mode>5_insn family of patterns.
8280    OP0 and OP1 will be set to the operands of the shifts involved
8281    on success and will be NULL_RTX otherwise.  */
8282
8283 static bool
8284 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8285 {
8286   rtx op0, op1;
8287   scalar_int_mode mode;
8288   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8289     return false;
8290
8291   *res_op0 = NULL_RTX;
8292   *res_op1 = NULL_RTX;
8293
8294   if (GET_CODE (x) != IOR)
8295     return false;
8296
8297   op0 = XEXP (x, 0);
8298   op1 = XEXP (x, 1);
8299
8300   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8301       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8302     {
8303      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
8304       if (GET_CODE (op1) == ASHIFT)
8305         std::swap (op0, op1);
8306
8307       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8308         return false;
8309
8310       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8311       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8312
8313       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8314           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8315         {
8316           *res_op0 = XEXP (op0, 0);
8317           *res_op1 = XEXP (op1, 0);
8318           return true;
8319         }
8320     }
8321
8322   return false;
8323 }
8324
8325 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8326    storing it in *COST.  Result is true if the total cost of the operation
8327    has now been calculated.  */
8328 static bool
8329 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8330 {
8331   rtx inner;
8332   rtx comparator;
8333   enum rtx_code cmpcode;
8334
8335   if (COMPARISON_P (op0))
8336     {
8337       inner = XEXP (op0, 0);
8338       comparator = XEXP (op0, 1);
8339       cmpcode = GET_CODE (op0);
8340     }
8341   else
8342     {
8343       inner = op0;
8344       comparator = const0_rtx;
8345       cmpcode = NE;
8346     }
8347
8348   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8349     {
8350       /* Conditional branch.  */
8351       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8352         return true;
8353       else
8354         {
8355           if (cmpcode == NE || cmpcode == EQ)
8356             {
8357               if (comparator == const0_rtx)
8358                 {
8359                   /* TBZ/TBNZ/CBZ/CBNZ.  */
8360                   if (GET_CODE (inner) == ZERO_EXTRACT)
8361                     /* TBZ/TBNZ.  */
8362                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8363                                        ZERO_EXTRACT, 0, speed);
8364                   else
8365                     /* CBZ/CBNZ.  */
8366                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8367
8368                 return true;
8369               }
8370             }
8371           else if (cmpcode == LT || cmpcode == GE)
8372             {
8373               /* TBZ/TBNZ.  */
8374               if (comparator == const0_rtx)
8375                 return true;
8376             }
8377         }
8378     }
8379   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8380     {
8381       /* CCMP.  */
8382       if (GET_CODE (op1) == COMPARE)
8383         {
8384           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
8385           if (XEXP (op1, 1) == const0_rtx)
8386             *cost += 1;
8387           if (speed)
8388             {
8389               machine_mode mode = GET_MODE (XEXP (op1, 0));
8390               const struct cpu_cost_table *extra_cost
8391                 = aarch64_tune_params.insn_extra_cost;
8392
8393               if (GET_MODE_CLASS (mode) == MODE_INT)
8394                 *cost += extra_cost->alu.arith;
8395               else
8396                 *cost += extra_cost->fp[mode == DFmode].compare;
8397             }
8398           return true;
8399         }
8400
8401       /* It's a conditional operation based on the status flags,
8402          so it must be some flavor of CSEL.  */
8403
8404       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
8405       if (GET_CODE (op1) == NEG
8406           || GET_CODE (op1) == NOT
8407           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8408         op1 = XEXP (op1, 0);
8409       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8410         {
8411           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
8412           op1 = XEXP (op1, 0);
8413           op2 = XEXP (op2, 0);
8414         }
8415
8416       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8417       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8418       return true;
8419     }
8420
8421   /* We don't know what this is, cost all operands.  */
8422   return false;
8423 }
8424
8425 /* Check whether X is a bitfield operation of the form shift + extend that
8426    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
8427    operand to which the bitfield operation is applied.  Otherwise return
8428    NULL_RTX.  */
8429
8430 static rtx
8431 aarch64_extend_bitfield_pattern_p (rtx x)
8432 {
8433   rtx_code outer_code = GET_CODE (x);
8434   machine_mode outer_mode = GET_MODE (x);
8435
8436   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8437       && outer_mode != SImode && outer_mode != DImode)
8438     return NULL_RTX;
8439
8440   rtx inner = XEXP (x, 0);
8441   rtx_code inner_code = GET_CODE (inner);
8442   machine_mode inner_mode = GET_MODE (inner);
8443   rtx op = NULL_RTX;
8444
8445   switch (inner_code)
8446     {
8447       case ASHIFT:
8448         if (CONST_INT_P (XEXP (inner, 1))
8449             && (inner_mode == QImode || inner_mode == HImode))
8450           op = XEXP (inner, 0);
8451         break;
8452       case LSHIFTRT:
8453         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8454             && (inner_mode == QImode || inner_mode == HImode))
8455           op = XEXP (inner, 0);
8456         break;
8457       case ASHIFTRT:
8458         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8459             && (inner_mode == QImode || inner_mode == HImode))
8460           op = XEXP (inner, 0);
8461         break;
8462       default:
8463         break;
8464     }
8465
8466   return op;
8467 }
8468
8469 /* Return true if the mask and a shift amount from an RTX of the form
8470    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8471    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
8472
8473 bool
8474 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8475                                     rtx shft_amnt)
8476 {
8477   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8478          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8479          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8480          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8481 }
8482
8483 /* Calculate the cost of calculating X, storing it in *COST.  Result
8484    is true if the total cost of the operation has now been calculated.  */
8485 static bool
8486 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8487                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8488 {
8489   rtx op0, op1, op2;
8490   const struct cpu_cost_table *extra_cost
8491     = aarch64_tune_params.insn_extra_cost;
8492   int code = GET_CODE (x);
8493   scalar_int_mode int_mode;
8494
8495   /* By default, assume that everything has equivalent cost to the
8496      cheapest instruction.  Any additional costs are applied as a delta
8497      above this default.  */
8498   *cost = COSTS_N_INSNS (1);
8499
8500   switch (code)
8501     {
8502     case SET:
8503       /* The cost depends entirely on the operands to SET.  */
8504       *cost = 0;
8505       op0 = SET_DEST (x);
8506       op1 = SET_SRC (x);
8507
8508       switch (GET_CODE (op0))
8509         {
8510         case MEM:
8511           if (speed)
8512             {
8513               rtx address = XEXP (op0, 0);
8514               if (VECTOR_MODE_P (mode))
8515                 *cost += extra_cost->ldst.storev;
8516               else if (GET_MODE_CLASS (mode) == MODE_INT)
8517                 *cost += extra_cost->ldst.store;
8518               else if (mode == SFmode)
8519                 *cost += extra_cost->ldst.storef;
8520               else if (mode == DFmode)
8521                 *cost += extra_cost->ldst.stored;
8522
8523               *cost +=
8524                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8525                                                      0, speed));
8526             }
8527
8528           *cost += rtx_cost (op1, mode, SET, 1, speed);
8529           return true;
8530
8531         case SUBREG:
8532           if (! REG_P (SUBREG_REG (op0)))
8533             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8534
8535           /* Fall through.  */
8536         case REG:
8537           /* The cost is one per vector-register copied.  */
8538           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8539             {
8540               int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8541               *cost = COSTS_N_INSNS (nregs);
8542             }
8543           /* const0_rtx is in general free, but we will use an
8544              instruction to set a register to 0.  */
8545           else if (REG_P (op1) || op1 == const0_rtx)
8546             {
8547               /* The cost is 1 per register copied.  */
8548               int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8549               *cost = COSTS_N_INSNS (nregs);
8550             }
8551           else
8552             /* Cost is just the cost of the RHS of the set.  */
8553             *cost += rtx_cost (op1, mode, SET, 1, speed);
8554           return true;
8555
8556         case ZERO_EXTRACT:
8557         case SIGN_EXTRACT:
8558           /* Bit-field insertion.  Strip any redundant widening of
8559              the RHS to meet the width of the target.  */
8560           if (GET_CODE (op1) == SUBREG)
8561             op1 = SUBREG_REG (op1);
8562           if ((GET_CODE (op1) == ZERO_EXTEND
8563                || GET_CODE (op1) == SIGN_EXTEND)
8564               && CONST_INT_P (XEXP (op0, 1))
8565               && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8566               && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8567             op1 = XEXP (op1, 0);
8568
8569           if (CONST_INT_P (op1))
8570             {
8571               /* MOV immediate is assumed to always be cheap.  */
8572               *cost = COSTS_N_INSNS (1);
8573             }
8574           else
8575             {
8576               /* BFM.  */
8577               if (speed)
8578                 *cost += extra_cost->alu.bfi;
8579               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8580             }
8581
8582           return true;
8583
8584         default:
8585           /* We can't make sense of this, assume default cost.  */
8586           *cost = COSTS_N_INSNS (1);
8587           return false;
8588         }
8589       return false;
8590
8591     case CONST_INT:
8592       /* If an instruction can incorporate a constant within the
8593          instruction, the instruction's expression avoids calling
8594          rtx_cost() on the constant.  If rtx_cost() is called on a
8595          constant, then it is usually because the constant must be
8596          moved into a register by one or more instructions.
8597
8598          The exception is constant 0, which can be expressed
8599          as XZR/WZR and is therefore free.  The exception to this is
8600          if we have (set (reg) (const0_rtx)) in which case we must cost
8601          the move.  However, we can catch that when we cost the SET, so
8602          we don't need to consider that here.  */
8603       if (x == const0_rtx)
8604         *cost = 0;
8605       else
8606         {
8607           /* To an approximation, building any other constant is
8608              proportionally expensive to the number of instructions
8609              required to build that constant.  This is true whether we
8610              are compiling for SPEED or otherwise.  */
8611           if (!is_a <scalar_int_mode> (mode, &int_mode))
8612             int_mode = word_mode;
8613           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8614                                  (NULL_RTX, x, false, int_mode));
8615         }
8616       return true;
8617
8618     case CONST_DOUBLE:
8619
8620       /* First determine number of instructions to do the move
8621           as an integer constant.  */
8622       if (!aarch64_float_const_representable_p (x)
8623            && !aarch64_can_const_movi_rtx_p (x, mode)
8624            && aarch64_float_const_rtx_p (x))
8625         {
8626           unsigned HOST_WIDE_INT ival;
8627           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8628           gcc_assert (succeed);
8629
8630           scalar_int_mode imode = (mode == HFmode
8631                                    ? SImode
8632                                    : int_mode_for_mode (mode).require ());
8633           int ncost = aarch64_internal_mov_immediate
8634                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8635           *cost += COSTS_N_INSNS (ncost);
8636           return true;
8637         }
8638
8639       if (speed)
8640         {
8641           /* mov[df,sf]_aarch64.  */
8642           if (aarch64_float_const_representable_p (x))
8643             /* FMOV (scalar immediate).  */
8644             *cost += extra_cost->fp[mode == DFmode].fpconst;
8645           else if (!aarch64_float_const_zero_rtx_p (x))
8646             {
8647               /* This will be a load from memory.  */
8648               if (mode == DFmode)
8649                 *cost += extra_cost->ldst.loadd;
8650               else
8651                 *cost += extra_cost->ldst.loadf;
8652             }
8653           else
8654             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
8655                or MOV v0.s[0], wzr - neither of which are modeled by the
8656                cost tables.  Just use the default cost.  */
8657             {
8658             }
8659         }
8660
8661       return true;
8662
8663     case MEM:
8664       if (speed)
8665         {
8666           /* For loads we want the base cost of a load, plus an
8667              approximation for the additional cost of the addressing
8668              mode.  */
8669           rtx address = XEXP (x, 0);
8670           if (VECTOR_MODE_P (mode))
8671             *cost += extra_cost->ldst.loadv;
8672           else if (GET_MODE_CLASS (mode) == MODE_INT)
8673             *cost += extra_cost->ldst.load;
8674           else if (mode == SFmode)
8675             *cost += extra_cost->ldst.loadf;
8676           else if (mode == DFmode)
8677             *cost += extra_cost->ldst.loadd;
8678
8679           *cost +=
8680                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8681                                                      0, speed));
8682         }
8683
8684       return true;
8685
8686     case NEG:
8687       op0 = XEXP (x, 0);
8688
8689       if (VECTOR_MODE_P (mode))
8690         {
8691           if (speed)
8692             {
8693               /* FNEG.  */
8694               *cost += extra_cost->vect.alu;
8695             }
8696           return false;
8697         }
8698
8699       if (GET_MODE_CLASS (mode) == MODE_INT)
8700         {
8701           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8702               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8703             {
8704               /* CSETM.  */
8705               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8706               return true;
8707             }
8708
8709           /* Cost this as SUB wzr, X.  */
8710           op0 = CONST0_RTX (mode);
8711           op1 = XEXP (x, 0);
8712           goto cost_minus;
8713         }
8714
8715       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8716         {
8717           /* Support (neg(fma...)) as a single instruction only if
8718              sign of zeros is unimportant.  This matches the decision
8719              making in aarch64.md.  */
8720           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8721             {
8722               /* FNMADD.  */
8723               *cost = rtx_cost (op0, mode, NEG, 0, speed);
8724               return true;
8725             }
8726           if (GET_CODE (op0) == MULT)
8727             {
8728               /* FNMUL.  */
8729               *cost = rtx_cost (op0, mode, NEG, 0, speed);
8730               return true;
8731             }
8732           if (speed)
8733             /* FNEG.  */
8734             *cost += extra_cost->fp[mode == DFmode].neg;
8735           return false;
8736         }
8737
8738       return false;
8739
8740     case CLRSB:
8741     case CLZ:
8742       if (speed)
8743         {
8744           if (VECTOR_MODE_P (mode))
8745             *cost += extra_cost->vect.alu;
8746           else
8747             *cost += extra_cost->alu.clz;
8748         }
8749
8750       return false;
8751
8752     case COMPARE:
8753       op0 = XEXP (x, 0);
8754       op1 = XEXP (x, 1);
8755
8756       if (op1 == const0_rtx
8757           && GET_CODE (op0) == AND)
8758         {
8759           x = op0;
8760           mode = GET_MODE (op0);
8761           goto cost_logic;
8762         }
8763
8764       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8765         {
8766           /* TODO: A write to the CC flags possibly costs extra, this
8767              needs encoding in the cost tables.  */
8768
8769           mode = GET_MODE (op0);
8770           /* ANDS.  */
8771           if (GET_CODE (op0) == AND)
8772             {
8773               x = op0;
8774               goto cost_logic;
8775             }
8776
8777           if (GET_CODE (op0) == PLUS)
8778             {
8779               /* ADDS (and CMN alias).  */
8780               x = op0;
8781               goto cost_plus;
8782             }
8783
8784           if (GET_CODE (op0) == MINUS)
8785             {
8786               /* SUBS.  */
8787               x = op0;
8788               goto cost_minus;
8789             }
8790
8791           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8792               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8793               && CONST_INT_P (XEXP (op0, 2)))
8794             {
8795               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8796                  Handle it here directly rather than going to cost_logic
8797                  since we know the immediate generated for the TST is valid
8798                  so we can avoid creating an intermediate rtx for it only
8799                  for costing purposes.  */
8800               if (speed)
8801                 *cost += extra_cost->alu.logical;
8802
8803               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8804                                  ZERO_EXTRACT, 0, speed);
8805               return true;
8806             }
8807
8808           if (GET_CODE (op1) == NEG)
8809             {
8810               /* CMN.  */
8811               if (speed)
8812                 *cost += extra_cost->alu.arith;
8813
8814               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8815               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8816               return true;
8817             }
8818
8819           /* CMP.
8820
8821              Compare can freely swap the order of operands, and
8822              canonicalization puts the more complex operation first.
8823              But the integer MINUS logic expects the shift/extend
8824              operation in op1.  */
8825           if (! (REG_P (op0)
8826                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8827           {
8828             op0 = XEXP (x, 1);
8829             op1 = XEXP (x, 0);
8830           }
8831           goto cost_minus;
8832         }
8833
8834       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8835         {
8836           /* FCMP.  */
8837           if (speed)
8838             *cost += extra_cost->fp[mode == DFmode].compare;
8839
8840           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8841             {
8842               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8843               /* FCMP supports constant 0.0 for no extra cost. */
8844               return true;
8845             }
8846           return false;
8847         }
8848
8849       if (VECTOR_MODE_P (mode))
8850         {
8851           /* Vector compare.  */
8852           if (speed)
8853             *cost += extra_cost->vect.alu;
8854
8855           if (aarch64_float_const_zero_rtx_p (op1))
8856             {
8857               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8858                  cost.  */
8859               return true;
8860             }
8861           return false;
8862         }
8863       return false;
8864
8865     case MINUS:
8866       {
8867         op0 = XEXP (x, 0);
8868         op1 = XEXP (x, 1);
8869
8870 cost_minus:
8871         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8872
8873         /* Detect valid immediates.  */
8874         if ((GET_MODE_CLASS (mode) == MODE_INT
8875              || (GET_MODE_CLASS (mode) == MODE_CC
8876                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8877             && CONST_INT_P (op1)
8878             && aarch64_uimm12_shift (INTVAL (op1)))
8879           {
8880             if (speed)
8881               /* SUB(S) (immediate).  */
8882               *cost += extra_cost->alu.arith;
8883             return true;
8884           }
8885
8886         /* Look for SUB (extended register).  */
8887         if (is_a <scalar_int_mode> (mode, &int_mode)
8888             && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8889           {
8890             if (speed)
8891               *cost += extra_cost->alu.extend_arith;
8892
8893             op1 = aarch64_strip_extend (op1, true);
8894             *cost += rtx_cost (op1, VOIDmode,
8895                                (enum rtx_code) GET_CODE (op1), 0, speed);
8896             return true;
8897           }
8898
8899         rtx new_op1 = aarch64_strip_extend (op1, false);
8900
8901         /* Cost this as an FMA-alike operation.  */
8902         if ((GET_CODE (new_op1) == MULT
8903              || aarch64_shift_p (GET_CODE (new_op1)))
8904             && code != COMPARE)
8905           {
8906             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8907                                             (enum rtx_code) code,
8908                                             speed);
8909             return true;
8910           }
8911
8912         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8913
8914         if (speed)
8915           {
8916             if (VECTOR_MODE_P (mode))
8917               {
8918                 /* Vector SUB.  */
8919                 *cost += extra_cost->vect.alu;
8920               }
8921             else if (GET_MODE_CLASS (mode) == MODE_INT)
8922               {
8923                 /* SUB(S).  */
8924                 *cost += extra_cost->alu.arith;
8925               }
8926             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8927               {
8928                 /* FSUB.  */
8929                 *cost += extra_cost->fp[mode == DFmode].addsub;
8930               }
8931           }
8932         return true;
8933       }
8934
8935     case PLUS:
8936       {
8937         rtx new_op0;
8938
8939         op0 = XEXP (x, 0);
8940         op1 = XEXP (x, 1);
8941
8942 cost_plus:
8943         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8944             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8945           {
8946             /* CSINC.  */
8947             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8948             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8949             return true;
8950           }
8951
8952         if (GET_MODE_CLASS (mode) == MODE_INT
8953             && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8954                 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8955           {
8956             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8957
8958             if (speed)
8959               /* ADD (immediate).  */
8960               *cost += extra_cost->alu.arith;
8961             return true;
8962           }
8963
8964         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8965
8966         /* Look for ADD (extended register).  */
8967         if (is_a <scalar_int_mode> (mode, &int_mode)
8968             && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8969           {
8970             if (speed)
8971               *cost += extra_cost->alu.extend_arith;
8972
8973             op0 = aarch64_strip_extend (op0, true);
8974             *cost += rtx_cost (op0, VOIDmode,
8975                                (enum rtx_code) GET_CODE (op0), 0, speed);
8976             return true;
8977           }
8978
8979         /* Strip any extend, leave shifts behind as we will
8980            cost them through mult_cost.  */
8981         new_op0 = aarch64_strip_extend (op0, false);
8982
8983         if (GET_CODE (new_op0) == MULT
8984             || aarch64_shift_p (GET_CODE (new_op0)))
8985           {
8986             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8987                                             speed);
8988             return true;
8989           }
8990
8991         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8992
8993         if (speed)
8994           {
8995             if (VECTOR_MODE_P (mode))
8996               {
8997                 /* Vector ADD.  */
8998                 *cost += extra_cost->vect.alu;
8999               }
9000             else if (GET_MODE_CLASS (mode) == MODE_INT)
9001               {
9002                 /* ADD.  */
9003                 *cost += extra_cost->alu.arith;
9004               }
9005             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9006               {
9007                 /* FADD.  */
9008                 *cost += extra_cost->fp[mode == DFmode].addsub;
9009               }
9010           }
9011         return true;
9012       }
9013
9014     case BSWAP:
9015       *cost = COSTS_N_INSNS (1);
9016
9017       if (speed)
9018         {
9019           if (VECTOR_MODE_P (mode))
9020             *cost += extra_cost->vect.alu;
9021           else
9022             *cost += extra_cost->alu.rev;
9023         }
9024       return false;
9025
9026     case IOR:
9027       if (aarch_rev16_p (x))
9028         {
9029           *cost = COSTS_N_INSNS (1);
9030
9031           if (speed)
9032             {
9033               if (VECTOR_MODE_P (mode))
9034                 *cost += extra_cost->vect.alu;
9035               else
9036                 *cost += extra_cost->alu.rev;
9037             }
9038           return true;
9039         }
9040
9041       if (aarch64_extr_rtx_p (x, &op0, &op1))
9042         {
9043           *cost += rtx_cost (op0, mode, IOR, 0, speed);
9044           *cost += rtx_cost (op1, mode, IOR, 1, speed);
9045           if (speed)
9046             *cost += extra_cost->alu.shift;
9047
9048           return true;
9049         }
9050     /* Fall through.  */
9051     case XOR:
9052     case AND:
9053     cost_logic:
9054       op0 = XEXP (x, 0);
9055       op1 = XEXP (x, 1);
9056
9057       if (VECTOR_MODE_P (mode))
9058         {
9059           if (speed)
9060             *cost += extra_cost->vect.alu;
9061           return true;
9062         }
9063
9064       if (code == AND
9065           && GET_CODE (op0) == MULT
9066           && CONST_INT_P (XEXP (op0, 1))
9067           && CONST_INT_P (op1)
9068           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9069                                INTVAL (op1)) != 0)
9070         {
9071           /* This is a UBFM/SBFM.  */
9072           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9073           if (speed)
9074             *cost += extra_cost->alu.bfx;
9075           return true;
9076         }
9077
9078       if (is_int_mode (mode, &int_mode))
9079         {
9080           if (CONST_INT_P (op1))
9081             {
9082               /* We have a mask + shift version of a UBFIZ
9083                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
9084               if (GET_CODE (op0) == ASHIFT
9085                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9086                                                          XEXP (op0, 1)))
9087                 {
9088                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
9089                                      (enum rtx_code) code, 0, speed);
9090                   if (speed)
9091                     *cost += extra_cost->alu.bfx;
9092
9093                   return true;
9094                 }
9095               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9096                 {
9097                 /* We possibly get the immediate for free, this is not
9098                    modelled.  */
9099                   *cost += rtx_cost (op0, int_mode,
9100                                      (enum rtx_code) code, 0, speed);
9101                   if (speed)
9102                     *cost += extra_cost->alu.logical;
9103
9104                   return true;
9105                 }
9106             }
9107           else
9108             {
9109               rtx new_op0 = op0;
9110
9111               /* Handle ORN, EON, or BIC.  */
9112               if (GET_CODE (op0) == NOT)
9113                 op0 = XEXP (op0, 0);
9114
9115               new_op0 = aarch64_strip_shift (op0);
9116
9117               /* If we had a shift on op0 then this is a logical-shift-
9118                  by-register/immediate operation.  Otherwise, this is just
9119                  a logical operation.  */
9120               if (speed)
9121                 {
9122                   if (new_op0 != op0)
9123                     {
9124                       /* Shift by immediate.  */
9125                       if (CONST_INT_P (XEXP (op0, 1)))
9126                         *cost += extra_cost->alu.log_shift;
9127                       else
9128                         *cost += extra_cost->alu.log_shift_reg;
9129                     }
9130                   else
9131                     *cost += extra_cost->alu.logical;
9132                 }
9133
9134               /* In both cases we want to cost both operands.  */
9135               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9136                                  0, speed);
9137               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9138                                  1, speed);
9139
9140               return true;
9141             }
9142         }
9143       return false;
9144
9145     case NOT:
9146       x = XEXP (x, 0);
9147       op0 = aarch64_strip_shift (x);
9148
9149       if (VECTOR_MODE_P (mode))
9150         {
9151           /* Vector NOT.  */
9152           *cost += extra_cost->vect.alu;
9153           return false;
9154         }
9155
9156       /* MVN-shifted-reg.  */
9157       if (op0 != x)
9158         {
9159           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9160
9161           if (speed)
9162             *cost += extra_cost->alu.log_shift;
9163
9164           return true;
9165         }
9166       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9167          Handle the second form here taking care that 'a' in the above can
9168          be a shift.  */
9169       else if (GET_CODE (op0) == XOR)
9170         {
9171           rtx newop0 = XEXP (op0, 0);
9172           rtx newop1 = XEXP (op0, 1);
9173           rtx op0_stripped = aarch64_strip_shift (newop0);
9174
9175           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9176           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9177
9178           if (speed)
9179             {
9180               if (op0_stripped != newop0)
9181                 *cost += extra_cost->alu.log_shift;
9182               else
9183                 *cost += extra_cost->alu.logical;
9184             }
9185
9186           return true;
9187         }
9188       /* MVN.  */
9189       if (speed)
9190         *cost += extra_cost->alu.logical;
9191
9192       return false;
9193
9194     case ZERO_EXTEND:
9195
9196       op0 = XEXP (x, 0);
9197       /* If a value is written in SI mode, then zero extended to DI
9198          mode, the operation will in general be free as a write to
9199          a 'w' register implicitly zeroes the upper bits of an 'x'
9200          register.  However, if this is
9201
9202            (set (reg) (zero_extend (reg)))
9203
9204          we must cost the explicit register move.  */
9205       if (mode == DImode
9206           && GET_MODE (op0) == SImode
9207           && outer == SET)
9208         {
9209           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9210
9211         /* If OP_COST is non-zero, then the cost of the zero extend
9212            is effectively the cost of the inner operation.  Otherwise
9213            we have a MOV instruction and we take the cost from the MOV
9214            itself.  This is true independently of whether we are
9215            optimizing for space or time.  */
9216           if (op_cost)
9217             *cost = op_cost;
9218
9219           return true;
9220         }
9221       else if (MEM_P (op0))
9222         {
9223           /* All loads can zero extend to any size for free.  */
9224           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9225           return true;
9226         }
9227
9228       op0 = aarch64_extend_bitfield_pattern_p (x);
9229       if (op0)
9230         {
9231           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9232           if (speed)
9233             *cost += extra_cost->alu.bfx;
9234           return true;
9235         }
9236
9237       if (speed)
9238         {
9239           if (VECTOR_MODE_P (mode))
9240             {
9241               /* UMOV.  */
9242               *cost += extra_cost->vect.alu;
9243             }
9244           else
9245             {
9246               /* We generate an AND instead of UXTB/UXTH.  */
9247               *cost += extra_cost->alu.logical;
9248             }
9249         }
9250       return false;
9251
9252     case SIGN_EXTEND:
9253       if (MEM_P (XEXP (x, 0)))
9254         {
9255           /* LDRSH.  */
9256           if (speed)
9257             {
9258               rtx address = XEXP (XEXP (x, 0), 0);
9259               *cost += extra_cost->ldst.load_sign_extend;
9260
9261               *cost +=
9262                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9263                                                      0, speed));
9264             }
9265           return true;
9266         }
9267
9268       op0 = aarch64_extend_bitfield_pattern_p (x);
9269       if (op0)
9270         {
9271           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9272           if (speed)
9273             *cost += extra_cost->alu.bfx;
9274           return true;
9275         }
9276
9277       if (speed)
9278         {
9279           if (VECTOR_MODE_P (mode))
9280             *cost += extra_cost->vect.alu;
9281           else
9282             *cost += extra_cost->alu.extend;
9283         }
9284       return false;
9285
9286     case ASHIFT:
9287       op0 = XEXP (x, 0);
9288       op1 = XEXP (x, 1);
9289
9290       if (CONST_INT_P (op1))
9291         {
9292           if (speed)
9293             {
9294               if (VECTOR_MODE_P (mode))
9295                 {
9296                   /* Vector shift (immediate).  */
9297                   *cost += extra_cost->vect.alu;
9298                 }
9299               else
9300                 {
9301                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
9302                      aliases.  */
9303                   *cost += extra_cost->alu.shift;
9304                 }
9305             }
9306
9307           /* We can incorporate zero/sign extend for free.  */
9308           if (GET_CODE (op0) == ZERO_EXTEND
9309               || GET_CODE (op0) == SIGN_EXTEND)
9310             op0 = XEXP (op0, 0);
9311
9312           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9313           return true;
9314         }
9315       else
9316         {
9317           if (VECTOR_MODE_P (mode))
9318             {
9319               if (speed)
9320                 /* Vector shift (register).  */
9321                 *cost += extra_cost->vect.alu;
9322             }
9323           else
9324             {
9325               if (speed)
9326                 /* LSLV.  */
9327                 *cost += extra_cost->alu.shift_reg;
9328
9329               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9330                   && CONST_INT_P (XEXP (op1, 1))
9331                   && known_eq (INTVAL (XEXP (op1, 1)),
9332                                GET_MODE_BITSIZE (mode) - 1))
9333                 {
9334                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9335                   /* We already demanded XEXP (op1, 0) to be REG_P, so
9336                      don't recurse into it.  */
9337                   return true;
9338                 }
9339             }
9340           return false;  /* All arguments need to be in registers.  */
9341         }
9342
9343     case ROTATE:
9344     case ROTATERT:
9345     case LSHIFTRT:
9346     case ASHIFTRT:
9347       op0 = XEXP (x, 0);
9348       op1 = XEXP (x, 1);
9349
9350       if (CONST_INT_P (op1))
9351         {
9352           /* ASR (immediate) and friends.  */
9353           if (speed)
9354             {
9355               if (VECTOR_MODE_P (mode))
9356                 *cost += extra_cost->vect.alu;
9357               else
9358                 *cost += extra_cost->alu.shift;
9359             }
9360
9361           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9362           return true;
9363         }
9364       else
9365         {
9366           if (VECTOR_MODE_P (mode))
9367             {
9368               if (speed)
9369                 /* Vector shift (register).  */
9370                 *cost += extra_cost->vect.alu;
9371             }
9372           else
9373             {
9374               if (speed)
9375                 /* ASR (register) and friends.  */
9376                 *cost += extra_cost->alu.shift_reg;
9377
9378               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9379                   && CONST_INT_P (XEXP (op1, 1))
9380                   && known_eq (INTVAL (XEXP (op1, 1)),
9381                                GET_MODE_BITSIZE (mode) - 1))
9382                 {
9383                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9384                   /* We already demanded XEXP (op1, 0) to be REG_P, so
9385                      don't recurse into it.  */
9386                   return true;
9387                 }
9388             }
9389           return false;  /* All arguments need to be in registers.  */
9390         }
9391
9392     case SYMBOL_REF:
9393
9394       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9395           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9396         {
9397           /* LDR.  */
9398           if (speed)
9399             *cost += extra_cost->ldst.load;
9400         }
9401       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9402                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9403         {
9404           /* ADRP, followed by ADD.  */
9405           *cost += COSTS_N_INSNS (1);
9406           if (speed)
9407             *cost += 2 * extra_cost->alu.arith;
9408         }
9409       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9410                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9411         {
9412           /* ADR.  */
9413           if (speed)
9414             *cost += extra_cost->alu.arith;
9415         }
9416
9417       if (flag_pic)
9418         {
9419           /* One extra load instruction, after accessing the GOT.  */
9420           *cost += COSTS_N_INSNS (1);
9421           if (speed)
9422             *cost += extra_cost->ldst.load;
9423         }
9424       return true;
9425
9426     case HIGH:
9427     case LO_SUM:
9428       /* ADRP/ADD (immediate).  */
9429       if (speed)
9430         *cost += extra_cost->alu.arith;
9431       return true;
9432
9433     case ZERO_EXTRACT:
9434     case SIGN_EXTRACT:
9435       /* UBFX/SBFX.  */
9436       if (speed)
9437         {
9438           if (VECTOR_MODE_P (mode))
9439             *cost += extra_cost->vect.alu;
9440           else
9441             *cost += extra_cost->alu.bfx;
9442         }
9443
9444       /* We can trust that the immediates used will be correct (there
9445          are no by-register forms), so we need only cost op0.  */
9446       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9447       return true;
9448
9449     case MULT:
9450       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9451       /* aarch64_rtx_mult_cost always handles recursion to its
9452          operands.  */
9453       return true;
9454
9455     case MOD:
9456     /* We can expand signed mod by power of 2 using a NEGS, two parallel
9457        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
9458        an unconditional negate.  This case should only ever be reached through
9459        the set_smod_pow2_cheap check in expmed.c.  */
9460       if (CONST_INT_P (XEXP (x, 1))
9461           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9462           && (mode == SImode || mode == DImode))
9463         {
9464           /* We expand to 4 instructions.  Reset the baseline.  */
9465           *cost = COSTS_N_INSNS (4);
9466
9467           if (speed)
9468             *cost += 2 * extra_cost->alu.logical
9469                      + 2 * extra_cost->alu.arith;
9470
9471           return true;
9472         }
9473
9474     /* Fall-through.  */
9475     case UMOD:
9476       if (speed)
9477         {
9478           /* Slighly prefer UMOD over SMOD.  */
9479           if (VECTOR_MODE_P (mode))
9480             *cost += extra_cost->vect.alu;
9481           else if (GET_MODE_CLASS (mode) == MODE_INT)
9482             *cost += (extra_cost->mult[mode == DImode].add
9483                       + extra_cost->mult[mode == DImode].idiv
9484                       + (code == MOD ? 1 : 0));
9485         }
9486       return false;  /* All arguments need to be in registers.  */
9487
9488     case DIV:
9489     case UDIV:
9490     case SQRT:
9491       if (speed)
9492         {
9493           if (VECTOR_MODE_P (mode))
9494             *cost += extra_cost->vect.alu;
9495           else if (GET_MODE_CLASS (mode) == MODE_INT)
9496             /* There is no integer SQRT, so only DIV and UDIV can get
9497                here.  */
9498             *cost += (extra_cost->mult[mode == DImode].idiv
9499                      /* Slighly prefer UDIV over SDIV.  */
9500                      + (code == DIV ? 1 : 0));
9501           else
9502             *cost += extra_cost->fp[mode == DFmode].div;
9503         }
9504       return false;  /* All arguments need to be in registers.  */
9505
9506     case IF_THEN_ELSE:
9507       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9508                                          XEXP (x, 2), cost, speed);
9509
9510     case EQ:
9511     case NE:
9512     case GT:
9513     case GTU:
9514     case LT:
9515     case LTU:
9516     case GE:
9517     case GEU:
9518     case LE:
9519     case LEU:
9520
9521       return false; /* All arguments must be in registers.  */
9522
9523     case FMA:
9524       op0 = XEXP (x, 0);
9525       op1 = XEXP (x, 1);
9526       op2 = XEXP (x, 2);
9527
9528       if (speed)
9529         {
9530           if (VECTOR_MODE_P (mode))
9531             *cost += extra_cost->vect.alu;
9532           else
9533             *cost += extra_cost->fp[mode == DFmode].fma;
9534         }
9535
9536       /* FMSUB, FNMADD, and FNMSUB are free.  */
9537       if (GET_CODE (op0) == NEG)
9538         op0 = XEXP (op0, 0);
9539
9540       if (GET_CODE (op2) == NEG)
9541         op2 = XEXP (op2, 0);
9542
9543       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9544          and the by-element operand as operand 0.  */
9545       if (GET_CODE (op1) == NEG)
9546         op1 = XEXP (op1, 0);
9547
9548       /* Catch vector-by-element operations.  The by-element operand can
9549          either be (vec_duplicate (vec_select (x))) or just
9550          (vec_select (x)), depending on whether we are multiplying by
9551          a vector or a scalar.
9552
9553          Canonicalization is not very good in these cases, FMA4 will put the
9554          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
9555       if (GET_CODE (op0) == VEC_DUPLICATE)
9556         op0 = XEXP (op0, 0);
9557       else if (GET_CODE (op1) == VEC_DUPLICATE)
9558         op1 = XEXP (op1, 0);
9559
9560       if (GET_CODE (op0) == VEC_SELECT)
9561         op0 = XEXP (op0, 0);
9562       else if (GET_CODE (op1) == VEC_SELECT)
9563         op1 = XEXP (op1, 0);
9564
9565       /* If the remaining parameters are not registers,
9566          get the cost to put them into registers.  */
9567       *cost += rtx_cost (op0, mode, FMA, 0, speed);
9568       *cost += rtx_cost (op1, mode, FMA, 1, speed);
9569       *cost += rtx_cost (op2, mode, FMA, 2, speed);
9570       return true;
9571
9572     case FLOAT:
9573     case UNSIGNED_FLOAT:
9574       if (speed)
9575         *cost += extra_cost->fp[mode == DFmode].fromint;
9576       return false;
9577
9578     case FLOAT_EXTEND:
9579       if (speed)
9580         {
9581           if (VECTOR_MODE_P (mode))
9582             {
9583               /*Vector truncate.  */
9584               *cost += extra_cost->vect.alu;
9585             }
9586           else
9587             *cost += extra_cost->fp[mode == DFmode].widen;
9588         }
9589       return false;
9590
9591     case FLOAT_TRUNCATE:
9592       if (speed)
9593         {
9594           if (VECTOR_MODE_P (mode))
9595             {
9596               /*Vector conversion.  */
9597               *cost += extra_cost->vect.alu;
9598             }
9599           else
9600             *cost += extra_cost->fp[mode == DFmode].narrow;
9601         }
9602       return false;
9603
9604     case FIX:
9605     case UNSIGNED_FIX:
9606       x = XEXP (x, 0);
9607       /* Strip the rounding part.  They will all be implemented
9608          by the fcvt* family of instructions anyway.  */
9609       if (GET_CODE (x) == UNSPEC)
9610         {
9611           unsigned int uns_code = XINT (x, 1);
9612
9613           if (uns_code == UNSPEC_FRINTA
9614               || uns_code == UNSPEC_FRINTM
9615               || uns_code == UNSPEC_FRINTN
9616               || uns_code == UNSPEC_FRINTP
9617               || uns_code == UNSPEC_FRINTZ)
9618             x = XVECEXP (x, 0, 0);
9619         }
9620
9621       if (speed)
9622         {
9623           if (VECTOR_MODE_P (mode))
9624             *cost += extra_cost->vect.alu;
9625           else
9626             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9627         }
9628
9629       /* We can combine fmul by a power of 2 followed by a fcvt into a single
9630          fixed-point fcvt.  */
9631       if (GET_CODE (x) == MULT
9632           && ((VECTOR_MODE_P (mode)
9633                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9634               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9635         {
9636           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9637                              0, speed);
9638           return true;
9639         }
9640
9641       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9642       return true;
9643
9644     case ABS:
9645       if (VECTOR_MODE_P (mode))
9646         {
9647           /* ABS (vector).  */
9648           if (speed)
9649             *cost += extra_cost->vect.alu;
9650         }
9651       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9652         {
9653           op0 = XEXP (x, 0);
9654
9655           /* FABD, which is analogous to FADD.  */
9656           if (GET_CODE (op0) == MINUS)
9657             {
9658               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9659               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9660               if (speed)
9661                 *cost += extra_cost->fp[mode == DFmode].addsub;
9662
9663               return true;
9664             }
9665           /* Simple FABS is analogous to FNEG.  */
9666           if (speed)
9667             *cost += extra_cost->fp[mode == DFmode].neg;
9668         }
9669       else
9670         {
9671           /* Integer ABS will either be split to
9672              two arithmetic instructions, or will be an ABS
9673              (scalar), which we don't model.  */
9674           *cost = COSTS_N_INSNS (2);
9675           if (speed)
9676             *cost += 2 * extra_cost->alu.arith;
9677         }
9678       return false;
9679
9680     case SMAX:
9681     case SMIN:
9682       if (speed)
9683         {
9684           if (VECTOR_MODE_P (mode))
9685             *cost += extra_cost->vect.alu;
9686           else
9687             {
9688               /* FMAXNM/FMINNM/FMAX/FMIN.
9689                  TODO: This may not be accurate for all implementations, but
9690                  we do not model this in the cost tables.  */
9691               *cost += extra_cost->fp[mode == DFmode].addsub;
9692             }
9693         }
9694       return false;
9695
9696     case UNSPEC:
9697       /* The floating point round to integer frint* instructions.  */
9698       if (aarch64_frint_unspec_p (XINT (x, 1)))
9699         {
9700           if (speed)
9701             *cost += extra_cost->fp[mode == DFmode].roundint;
9702
9703           return false;
9704         }
9705
9706       if (XINT (x, 1) == UNSPEC_RBIT)
9707         {
9708           if (speed)
9709             *cost += extra_cost->alu.rev;
9710
9711           return false;
9712         }
9713       break;
9714
9715     case TRUNCATE:
9716
9717       /* Decompose <su>muldi3_highpart.  */
9718       if (/* (truncate:DI  */
9719           mode == DImode
9720           /*   (lshiftrt:TI  */
9721           && GET_MODE (XEXP (x, 0)) == TImode
9722           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9723           /*      (mult:TI  */
9724           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9725           /*        (ANY_EXTEND:TI (reg:DI))
9726                     (ANY_EXTEND:TI (reg:DI)))  */
9727           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9728                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9729               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9730                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9731           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9732           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9733           /*     (const_int 64)  */
9734           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9735           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9736         {
9737           /* UMULH/SMULH.  */
9738           if (speed)
9739             *cost += extra_cost->mult[mode == DImode].extend;
9740           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9741                              mode, MULT, 0, speed);
9742           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9743                              mode, MULT, 1, speed);
9744           return true;
9745         }
9746
9747       /* Fall through.  */
9748     default:
9749       break;
9750     }
9751
9752   if (dump_file
9753       && flag_aarch64_verbose_cost)
9754     fprintf (dump_file,
9755       "\nFailed to cost RTX.  Assuming default cost.\n");
9756
9757   return true;
9758 }
9759
9760 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9761    calculated for X.  This cost is stored in *COST.  Returns true
9762    if the total cost of X was calculated.  */
9763 static bool
9764 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9765                    int param, int *cost, bool speed)
9766 {
9767   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9768
9769   if (dump_file
9770       && flag_aarch64_verbose_cost)
9771     {
9772       print_rtl_single (dump_file, x);
9773       fprintf (dump_file, "\n%s cost: %d (%s)\n",
9774                speed ? "Hot" : "Cold",
9775                *cost, result ? "final" : "partial");
9776     }
9777
9778   return result;
9779 }
9780
9781 static int
9782 aarch64_register_move_cost (machine_mode mode,
9783                             reg_class_t from_i, reg_class_t to_i)
9784 {
9785   enum reg_class from = (enum reg_class) from_i;
9786   enum reg_class to = (enum reg_class) to_i;
9787   const struct cpu_regmove_cost *regmove_cost
9788     = aarch64_tune_params.regmove_cost;
9789
9790   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
9791   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9792     to = GENERAL_REGS;
9793
9794   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9795     from = GENERAL_REGS;
9796
9797   /* Moving between GPR and stack cost is the same as GP2GP.  */
9798   if ((from == GENERAL_REGS && to == STACK_REG)
9799       || (to == GENERAL_REGS && from == STACK_REG))
9800     return regmove_cost->GP2GP;
9801
9802   /* To/From the stack register, we move via the gprs.  */
9803   if (to == STACK_REG || from == STACK_REG)
9804     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9805             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9806
9807   if (known_eq (GET_MODE_SIZE (mode), 16))
9808     {
9809       /* 128-bit operations on general registers require 2 instructions.  */
9810       if (from == GENERAL_REGS && to == GENERAL_REGS)
9811         return regmove_cost->GP2GP * 2;
9812       else if (from == GENERAL_REGS)
9813         return regmove_cost->GP2FP * 2;
9814       else if (to == GENERAL_REGS)
9815         return regmove_cost->FP2GP * 2;
9816
9817       /* When AdvSIMD instructions are disabled it is not possible to move
9818          a 128-bit value directly between Q registers.  This is handled in
9819          secondary reload.  A general register is used as a scratch to move
9820          the upper DI value and the lower DI value is moved directly,
9821          hence the cost is the sum of three moves. */
9822       if (! TARGET_SIMD)
9823         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9824
9825       return regmove_cost->FP2FP;
9826     }
9827
9828   if (from == GENERAL_REGS && to == GENERAL_REGS)
9829     return regmove_cost->GP2GP;
9830   else if (from == GENERAL_REGS)
9831     return regmove_cost->GP2FP;
9832   else if (to == GENERAL_REGS)
9833     return regmove_cost->FP2GP;
9834
9835   return regmove_cost->FP2FP;
9836 }
9837
9838 static int
9839 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9840                           reg_class_t rclass ATTRIBUTE_UNUSED,
9841                           bool in ATTRIBUTE_UNUSED)
9842 {
9843   return aarch64_tune_params.memmov_cost;
9844 }
9845
9846 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9847    to optimize 1.0/sqrt.  */
9848
9849 static bool
9850 use_rsqrt_p (machine_mode mode)
9851 {
9852   return (!flag_trapping_math
9853           && flag_unsafe_math_optimizations
9854           && ((aarch64_tune_params.approx_modes->recip_sqrt
9855                & AARCH64_APPROX_MODE (mode))
9856               || flag_mrecip_low_precision_sqrt));
9857 }
9858
9859 /* Function to decide when to use the approximate reciprocal square root
9860    builtin.  */
9861
9862 static tree
9863 aarch64_builtin_reciprocal (tree fndecl)
9864 {
9865   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9866
9867   if (!use_rsqrt_p (mode))
9868     return NULL_TREE;
9869   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9870 }
9871
9872 typedef rtx (*rsqrte_type) (rtx, rtx);
9873
9874 /* Select reciprocal square root initial estimate insn depending on machine
9875    mode.  */
9876
9877 static rsqrte_type
9878 get_rsqrte_type (machine_mode mode)
9879 {
9880   switch (mode)
9881   {
9882     case E_DFmode:   return gen_aarch64_rsqrtedf;
9883     case E_SFmode:   return gen_aarch64_rsqrtesf;
9884     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9885     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9886     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9887     default: gcc_unreachable ();
9888   }
9889 }
9890
9891 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9892
9893 /* Select reciprocal square root series step insn depending on machine mode.  */
9894
9895 static rsqrts_type
9896 get_rsqrts_type (machine_mode mode)
9897 {
9898   switch (mode)
9899   {
9900     case E_DFmode:   return gen_aarch64_rsqrtsdf;
9901     case E_SFmode:   return gen_aarch64_rsqrtssf;
9902     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9903     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9904     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9905     default: gcc_unreachable ();
9906   }
9907 }
9908
9909 /* Emit instruction sequence to compute either the approximate square root
9910    or its approximate reciprocal, depending on the flag RECP, and return
9911    whether the sequence was emitted or not.  */
9912
9913 bool
9914 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9915 {
9916   machine_mode mode = GET_MODE (dst);
9917
9918   if (GET_MODE_INNER (mode) == HFmode)
9919     {
9920       gcc_assert (!recp);
9921       return false;
9922     }
9923
9924   if (!recp)
9925     {
9926       if (!(flag_mlow_precision_sqrt
9927             || (aarch64_tune_params.approx_modes->sqrt
9928                 & AARCH64_APPROX_MODE (mode))))
9929         return false;
9930
9931       if (flag_finite_math_only
9932           || flag_trapping_math
9933           || !flag_unsafe_math_optimizations
9934           || optimize_function_for_size_p (cfun))
9935         return false;
9936     }
9937   else
9938     /* Caller assumes we cannot fail.  */
9939     gcc_assert (use_rsqrt_p (mode));
9940
9941   machine_mode mmsk = mode_for_int_vector (mode).require ();
9942   rtx xmsk = gen_reg_rtx (mmsk);
9943   if (!recp)
9944     /* When calculating the approximate square root, compare the
9945        argument with 0.0 and create a mask.  */
9946     emit_insn (gen_rtx_SET (xmsk,
9947                             gen_rtx_NEG (mmsk,
9948                                          gen_rtx_EQ (mmsk, src,
9949                                                      CONST0_RTX (mode)))));
9950
9951   /* Estimate the approximate reciprocal square root.  */
9952   rtx xdst = gen_reg_rtx (mode);
9953   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9954
9955   /* Iterate over the series twice for SF and thrice for DF.  */
9956   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9957
9958   /* Optionally iterate over the series once less for faster performance
9959      while sacrificing the accuracy.  */
9960   if ((recp && flag_mrecip_low_precision_sqrt)
9961       || (!recp && flag_mlow_precision_sqrt))
9962     iterations--;
9963
9964   /* Iterate over the series to calculate the approximate reciprocal square
9965      root.  */
9966   rtx x1 = gen_reg_rtx (mode);
9967   while (iterations--)
9968     {
9969       rtx x2 = gen_reg_rtx (mode);
9970       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9971
9972       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9973
9974       if (iterations > 0)
9975         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9976     }
9977
9978   if (!recp)
9979     {
9980       /* Qualify the approximate reciprocal square root when the argument is
9981          0.0 by squashing the intermediary result to 0.0.  */
9982       rtx xtmp = gen_reg_rtx (mmsk);
9983       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9984                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
9985       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9986
9987       /* Calculate the approximate square root.  */
9988       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9989     }
9990
9991   /* Finalize the approximation.  */
9992   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9993
9994   return true;
9995 }
9996
9997 typedef rtx (*recpe_type) (rtx, rtx);
9998
9999 /* Select reciprocal initial estimate insn depending on machine mode.  */
10000
10001 static recpe_type
10002 get_recpe_type (machine_mode mode)
10003 {
10004   switch (mode)
10005   {
10006     case E_SFmode:   return (gen_aarch64_frecpesf);
10007     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
10008     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
10009     case E_DFmode:   return (gen_aarch64_frecpedf);
10010     case E_V2DFmode: return (gen_aarch64_frecpev2df);
10011     default:         gcc_unreachable ();
10012   }
10013 }
10014
10015 typedef rtx (*recps_type) (rtx, rtx, rtx);
10016
10017 /* Select reciprocal series step insn depending on machine mode.  */
10018
10019 static recps_type
10020 get_recps_type (machine_mode mode)
10021 {
10022   switch (mode)
10023   {
10024     case E_SFmode:   return (gen_aarch64_frecpssf);
10025     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
10026     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
10027     case E_DFmode:   return (gen_aarch64_frecpsdf);
10028     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
10029     default:         gcc_unreachable ();
10030   }
10031 }
10032
10033 /* Emit the instruction sequence to compute the approximation for the division
10034    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
10035
10036 bool
10037 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10038 {
10039   machine_mode mode = GET_MODE (quo);
10040
10041   if (GET_MODE_INNER (mode) == HFmode)
10042     return false;
10043
10044   bool use_approx_division_p = (flag_mlow_precision_div
10045                                 || (aarch64_tune_params.approx_modes->division
10046                                     & AARCH64_APPROX_MODE (mode)));
10047
10048   if (!flag_finite_math_only
10049       || flag_trapping_math
10050       || !flag_unsafe_math_optimizations
10051       || optimize_function_for_size_p (cfun)
10052       || !use_approx_division_p)
10053     return false;
10054
10055   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10056     return false;
10057
10058   /* Estimate the approximate reciprocal.  */
10059   rtx xrcp = gen_reg_rtx (mode);
10060   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10061
10062   /* Iterate over the series twice for SF and thrice for DF.  */
10063   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10064
10065   /* Optionally iterate over the series once less for faster performance,
10066      while sacrificing the accuracy.  */
10067   if (flag_mlow_precision_div)
10068     iterations--;
10069
10070   /* Iterate over the series to calculate the approximate reciprocal.  */
10071   rtx xtmp = gen_reg_rtx (mode);
10072   while (iterations--)
10073     {
10074       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10075
10076       if (iterations > 0)
10077         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10078     }
10079
10080   if (num != CONST1_RTX (mode))
10081     {
10082       /* As the approximate reciprocal of DEN is already calculated, only
10083          calculate the approximate division when NUM is not 1.0.  */
10084       rtx xnum = force_reg (mode, num);
10085       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10086     }
10087
10088   /* Finalize the approximation.  */
10089   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10090   return true;
10091 }
10092
10093 /* Return the number of instructions that can be issued per cycle.  */
10094 static int
10095 aarch64_sched_issue_rate (void)
10096 {
10097   return aarch64_tune_params.issue_rate;
10098 }
10099
10100 static int
10101 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10102 {
10103   int issue_rate = aarch64_sched_issue_rate ();
10104
10105   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10106 }
10107
10108
10109 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10110    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
10111    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
10112
10113 static int
10114 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10115                                                     int ready_index)
10116 {
10117   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10118 }
10119
10120
10121 /* Vectorizer cost model target hooks.  */
10122
10123 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
10124 static int
10125 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10126                                     tree vectype,
10127                                     int misalign ATTRIBUTE_UNUSED)
10128 {
10129   unsigned elements;
10130   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10131   bool fp = false;
10132
10133   if (vectype != NULL)
10134     fp = FLOAT_TYPE_P (vectype);
10135
10136   switch (type_of_cost)
10137     {
10138       case scalar_stmt:
10139         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10140
10141       case scalar_load:
10142         return costs->scalar_load_cost;
10143
10144       case scalar_store:
10145         return costs->scalar_store_cost;
10146
10147       case vector_stmt:
10148         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10149
10150       case vector_load:
10151         return costs->vec_align_load_cost;
10152
10153       case vector_store:
10154         return costs->vec_store_cost;
10155
10156       case vec_to_scalar:
10157         return costs->vec_to_scalar_cost;
10158
10159       case scalar_to_vec:
10160         return costs->scalar_to_vec_cost;
10161
10162       case unaligned_load:
10163       case vector_gather_load:
10164         return costs->vec_unalign_load_cost;
10165
10166       case unaligned_store:
10167       case vector_scatter_store:
10168         return costs->vec_unalign_store_cost;
10169
10170       case cond_branch_taken:
10171         return costs->cond_taken_branch_cost;
10172
10173       case cond_branch_not_taken:
10174         return costs->cond_not_taken_branch_cost;
10175
10176       case vec_perm:
10177         return costs->vec_permute_cost;
10178
10179       case vec_promote_demote:
10180         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10181
10182       case vec_construct:
10183         elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10184         return elements / 2 + 1;
10185
10186       default:
10187         gcc_unreachable ();
10188     }
10189 }
10190
10191 /* Implement targetm.vectorize.add_stmt_cost.  */
10192 static unsigned
10193 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10194                        struct _stmt_vec_info *stmt_info, int misalign,
10195                        enum vect_cost_model_location where)
10196 {
10197   unsigned *cost = (unsigned *) data;
10198   unsigned retval = 0;
10199
10200   if (flag_vect_cost_model)
10201     {
10202       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10203       int stmt_cost =
10204             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10205
10206       /* Statements in an inner loop relative to the loop being
10207          vectorized are weighted more heavily.  The value here is
10208          arbitrary and could potentially be improved with analysis.  */
10209       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10210         count *= 50; /*  FIXME  */
10211
10212       retval = (unsigned) (count * stmt_cost);
10213       cost[where] += retval;
10214     }
10215
10216   return retval;
10217 }
10218
10219 static void initialize_aarch64_code_model (struct gcc_options *);
10220
10221 /* Parse the TO_PARSE string and put the architecture struct that it
10222    selects into RES and the architectural features into ISA_FLAGS.
10223    Return an aarch64_parse_opt_result describing the parse result.
10224    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
10225
10226 static enum aarch64_parse_opt_result
10227 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10228                     unsigned long *isa_flags)
10229 {
10230   char *ext;
10231   const struct processor *arch;
10232   char *str = (char *) alloca (strlen (to_parse) + 1);
10233   size_t len;
10234
10235   strcpy (str, to_parse);
10236
10237   ext = strchr (str, '+');
10238
10239   if (ext != NULL)
10240     len = ext - str;
10241   else
10242     len = strlen (str);
10243
10244   if (len == 0)
10245     return AARCH64_PARSE_MISSING_ARG;
10246
10247
10248   /* Loop through the list of supported ARCHes to find a match.  */
10249   for (arch = all_architectures; arch->name != NULL; arch++)
10250     {
10251       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10252         {
10253           unsigned long isa_temp = arch->flags;
10254
10255           if (ext != NULL)
10256             {
10257               /* TO_PARSE string contains at least one extension.  */
10258               enum aarch64_parse_opt_result ext_res
10259                 = aarch64_parse_extension (ext, &isa_temp);
10260
10261               if (ext_res != AARCH64_PARSE_OK)
10262                 return ext_res;
10263             }
10264           /* Extension parsing was successful.  Confirm the result
10265              arch and ISA flags.  */
10266           *res = arch;
10267           *isa_flags = isa_temp;
10268           return AARCH64_PARSE_OK;
10269         }
10270     }
10271
10272   /* ARCH name not found in list.  */
10273   return AARCH64_PARSE_INVALID_ARG;
10274 }
10275
10276 /* Parse the TO_PARSE string and put the result tuning in RES and the
10277    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
10278    describing the parse result.  If there is an error parsing, RES and
10279    ISA_FLAGS are left unchanged.  */
10280
10281 static enum aarch64_parse_opt_result
10282 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10283                    unsigned long *isa_flags)
10284 {
10285   char *ext;
10286   const struct processor *cpu;
10287   char *str = (char *) alloca (strlen (to_parse) + 1);
10288   size_t len;
10289
10290   strcpy (str, to_parse);
10291
10292   ext = strchr (str, '+');
10293
10294   if (ext != NULL)
10295     len = ext - str;
10296   else
10297     len = strlen (str);
10298
10299   if (len == 0)
10300     return AARCH64_PARSE_MISSING_ARG;
10301
10302
10303   /* Loop through the list of supported CPUs to find a match.  */
10304   for (cpu = all_cores; cpu->name != NULL; cpu++)
10305     {
10306       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10307         {
10308           unsigned long isa_temp = cpu->flags;
10309
10310
10311           if (ext != NULL)
10312             {
10313               /* TO_PARSE string contains at least one extension.  */
10314               enum aarch64_parse_opt_result ext_res
10315                 = aarch64_parse_extension (ext, &isa_temp);
10316
10317               if (ext_res != AARCH64_PARSE_OK)
10318                 return ext_res;
10319             }
10320           /* Extension parsing was successfull.  Confirm the result
10321              cpu and ISA flags.  */
10322           *res = cpu;
10323           *isa_flags = isa_temp;
10324           return AARCH64_PARSE_OK;
10325         }
10326     }
10327
10328   /* CPU name not found in list.  */
10329   return AARCH64_PARSE_INVALID_ARG;
10330 }
10331
10332 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10333    Return an aarch64_parse_opt_result describing the parse result.
10334    If the parsing fails the RES does not change.  */
10335
10336 static enum aarch64_parse_opt_result
10337 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10338 {
10339   const struct processor *cpu;
10340   char *str = (char *) alloca (strlen (to_parse) + 1);
10341
10342   strcpy (str, to_parse);
10343
10344   /* Loop through the list of supported CPUs to find a match.  */
10345   for (cpu = all_cores; cpu->name != NULL; cpu++)
10346     {
10347       if (strcmp (cpu->name, str) == 0)
10348         {
10349           *res = cpu;
10350           return AARCH64_PARSE_OK;
10351         }
10352     }
10353
10354   /* CPU name not found in list.  */
10355   return AARCH64_PARSE_INVALID_ARG;
10356 }
10357
10358 /* Parse TOKEN, which has length LENGTH to see if it is an option
10359    described in FLAG.  If it is, return the index bit for that fusion type.
10360    If not, error (printing OPTION_NAME) and return zero.  */
10361
10362 static unsigned int
10363 aarch64_parse_one_option_token (const char *token,
10364                                 size_t length,
10365                                 const struct aarch64_flag_desc *flag,
10366                                 const char *option_name)
10367 {
10368   for (; flag->name != NULL; flag++)
10369     {
10370       if (length == strlen (flag->name)
10371           && !strncmp (flag->name, token, length))
10372         return flag->flag;
10373     }
10374
10375   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10376   return 0;
10377 }
10378
10379 /* Parse OPTION which is a comma-separated list of flags to enable.
10380    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10381    default state we inherit from the CPU tuning structures.  OPTION_NAME
10382    gives the top-level option we are parsing in the -moverride string,
10383    for use in error messages.  */
10384
10385 static unsigned int
10386 aarch64_parse_boolean_options (const char *option,
10387                                const struct aarch64_flag_desc *flags,
10388                                unsigned int initial_state,
10389                                const char *option_name)
10390 {
10391   const char separator = '.';
10392   const char* specs = option;
10393   const char* ntoken = option;
10394   unsigned int found_flags = initial_state;
10395
10396   while ((ntoken = strchr (specs, separator)))
10397     {
10398       size_t token_length = ntoken - specs;
10399       unsigned token_ops = aarch64_parse_one_option_token (specs,
10400                                                            token_length,
10401                                                            flags,
10402                                                            option_name);
10403       /* If we find "none" (or, for simplicity's sake, an error) anywhere
10404          in the token stream, reset the supported operations.  So:
10405
10406            adrp+add.cmp+branch.none.adrp+add
10407
10408            would have the result of turning on only adrp+add fusion.  */
10409       if (!token_ops)
10410         found_flags = 0;
10411
10412       found_flags |= token_ops;
10413       specs = ++ntoken;
10414     }
10415
10416   /* We ended with a comma, print something.  */
10417   if (!(*specs))
10418     {
10419       error ("%s string ill-formed\n", option_name);
10420       return 0;
10421     }
10422
10423   /* We still have one more token to parse.  */
10424   size_t token_length = strlen (specs);
10425   unsigned token_ops = aarch64_parse_one_option_token (specs,
10426                                                        token_length,
10427                                                        flags,
10428                                                        option_name);
10429    if (!token_ops)
10430      found_flags = 0;
10431
10432   found_flags |= token_ops;
10433   return found_flags;
10434 }
10435
10436 /* Support for overriding instruction fusion.  */
10437
10438 static void
10439 aarch64_parse_fuse_string (const char *fuse_string,
10440                             struct tune_params *tune)
10441 {
10442   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10443                                                      aarch64_fusible_pairs,
10444                                                      tune->fusible_ops,
10445                                                      "fuse=");
10446 }
10447
10448 /* Support for overriding other tuning flags.  */
10449
10450 static void
10451 aarch64_parse_tune_string (const char *tune_string,
10452                             struct tune_params *tune)
10453 {
10454   tune->extra_tuning_flags
10455     = aarch64_parse_boolean_options (tune_string,
10456                                      aarch64_tuning_flags,
10457                                      tune->extra_tuning_flags,
10458                                      "tune=");
10459 }
10460
10461 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10462    we understand.  If it is, extract the option string and handoff to
10463    the appropriate function.  */
10464
10465 void
10466 aarch64_parse_one_override_token (const char* token,
10467                                   size_t length,
10468                                   struct tune_params *tune)
10469 {
10470   const struct aarch64_tuning_override_function *fn
10471     = aarch64_tuning_override_functions;
10472
10473   const char *option_part = strchr (token, '=');
10474   if (!option_part)
10475     {
10476       error ("tuning string missing in option (%s)", token);
10477       return;
10478     }
10479
10480   /* Get the length of the option name.  */
10481   length = option_part - token;
10482   /* Skip the '=' to get to the option string.  */
10483   option_part++;
10484
10485   for (; fn->name != NULL; fn++)
10486     {
10487       if (!strncmp (fn->name, token, length))
10488         {
10489           fn->parse_override (option_part, tune);
10490           return;
10491         }
10492     }
10493
10494   error ("unknown tuning option (%s)",token);
10495   return;
10496 }
10497
10498 /* A checking mechanism for the implementation of the tls size.  */
10499
10500 static void
10501 initialize_aarch64_tls_size (struct gcc_options *opts)
10502 {
10503   if (aarch64_tls_size == 0)
10504     aarch64_tls_size = 24;
10505
10506   switch (opts->x_aarch64_cmodel_var)
10507     {
10508     case AARCH64_CMODEL_TINY:
10509       /* Both the default and maximum TLS size allowed under tiny is 1M which
10510          needs two instructions to address, so we clamp the size to 24.  */
10511       if (aarch64_tls_size > 24)
10512         aarch64_tls_size = 24;
10513       break;
10514     case AARCH64_CMODEL_SMALL:
10515       /* The maximum TLS size allowed under small is 4G.  */
10516       if (aarch64_tls_size > 32)
10517         aarch64_tls_size = 32;
10518       break;
10519     case AARCH64_CMODEL_LARGE:
10520       /* The maximum TLS size allowed under large is 16E.
10521          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
10522       if (aarch64_tls_size > 48)
10523         aarch64_tls_size = 48;
10524       break;
10525     default:
10526       gcc_unreachable ();
10527     }
10528
10529   return;
10530 }
10531
10532 /* Parse STRING looking for options in the format:
10533      string     :: option:string
10534      option     :: name=substring
10535      name       :: {a-z}
10536      substring  :: defined by option.  */
10537
10538 static void
10539 aarch64_parse_override_string (const char* input_string,
10540                                struct tune_params* tune)
10541 {
10542   const char separator = ':';
10543   size_t string_length = strlen (input_string) + 1;
10544   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10545   char *string = string_root;
10546   strncpy (string, input_string, string_length);
10547   string[string_length - 1] = '\0';
10548
10549   char* ntoken = string;
10550
10551   while ((ntoken = strchr (string, separator)))
10552     {
10553       size_t token_length = ntoken - string;
10554       /* Make this substring look like a string.  */
10555       *ntoken = '\0';
10556       aarch64_parse_one_override_token (string, token_length, tune);
10557       string = ++ntoken;
10558     }
10559
10560   /* One last option to parse.  */
10561   aarch64_parse_one_override_token (string, strlen (string), tune);
10562   free (string_root);
10563 }
10564
10565
10566 static void
10567 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10568 {
10569   /* PR 70044: We have to be careful about being called multiple times for the
10570      same function.  This means all changes should be repeatable.  */
10571
10572   /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10573      Disable the frame pointer flag so the mid-end will not use a frame
10574      pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10575      Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10576      between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
10577   aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10578   if (opts->x_flag_omit_frame_pointer == 0)
10579     opts->x_flag_omit_frame_pointer = 2;
10580
10581   /* If not optimizing for size, set the default
10582      alignment to what the target wants.  */
10583   if (!opts->x_optimize_size)
10584     {
10585       if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10586         opts->x_str_align_loops = aarch64_tune_params.loop_align;
10587       if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10588         opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10589       if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10590         opts->x_str_align_functions = aarch64_tune_params.function_align;
10591     }
10592
10593   /* We default to no pc-relative literal loads.  */
10594
10595   aarch64_pcrelative_literal_loads = false;
10596
10597   /* If -mpc-relative-literal-loads is set on the command line, this
10598      implies that the user asked for PC relative literal loads.  */
10599   if (opts->x_pcrelative_literal_loads == 1)
10600     aarch64_pcrelative_literal_loads = true;
10601
10602   /* In the tiny memory model it makes no sense to disallow PC relative
10603      literal pool loads.  */
10604   if (aarch64_cmodel == AARCH64_CMODEL_TINY
10605       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10606     aarch64_pcrelative_literal_loads = true;
10607
10608   /* When enabling the lower precision Newton series for the square root, also
10609      enable it for the reciprocal square root, since the latter is an
10610      intermediary step for the former.  */
10611   if (flag_mlow_precision_sqrt)
10612     flag_mrecip_low_precision_sqrt = true;
10613 }
10614
10615 /* 'Unpack' up the internal tuning structs and update the options
10616     in OPTS.  The caller must have set up selected_tune and selected_arch
10617     as all the other target-specific codegen decisions are
10618     derived from them.  */
10619
10620 void
10621 aarch64_override_options_internal (struct gcc_options *opts)
10622 {
10623   aarch64_tune_flags = selected_tune->flags;
10624   aarch64_tune = selected_tune->sched_core;
10625   /* Make a copy of the tuning parameters attached to the core, which
10626      we may later overwrite.  */
10627   aarch64_tune_params = *(selected_tune->tune);
10628   aarch64_architecture_version = selected_arch->architecture_version;
10629
10630   if (opts->x_aarch64_override_tune_string)
10631     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10632                                   &aarch64_tune_params);
10633
10634   /* This target defaults to strict volatile bitfields.  */
10635   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10636     opts->x_flag_strict_volatile_bitfields = 1;
10637
10638   initialize_aarch64_code_model (opts);
10639   initialize_aarch64_tls_size (opts);
10640
10641   int queue_depth = 0;
10642   switch (aarch64_tune_params.autoprefetcher_model)
10643     {
10644       case tune_params::AUTOPREFETCHER_OFF:
10645         queue_depth = -1;
10646         break;
10647       case tune_params::AUTOPREFETCHER_WEAK:
10648         queue_depth = 0;
10649         break;
10650       case tune_params::AUTOPREFETCHER_STRONG:
10651         queue_depth = max_insn_queue_index + 1;
10652         break;
10653       default:
10654         gcc_unreachable ();
10655     }
10656
10657   /* We don't mind passing in global_options_set here as we don't use
10658      the *options_set structs anyway.  */
10659   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10660                          queue_depth,
10661                          opts->x_param_values,
10662                          global_options_set.x_param_values);
10663
10664   /* Set up parameters to be used in prefetching algorithm.  Do not
10665      override the defaults unless we are tuning for a core we have
10666      researched values for.  */
10667   if (aarch64_tune_params.prefetch->num_slots > 0)
10668     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10669                            aarch64_tune_params.prefetch->num_slots,
10670                            opts->x_param_values,
10671                            global_options_set.x_param_values);
10672   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10673     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10674                            aarch64_tune_params.prefetch->l1_cache_size,
10675                            opts->x_param_values,
10676                            global_options_set.x_param_values);
10677   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10678     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10679                            aarch64_tune_params.prefetch->l1_cache_line_size,
10680                            opts->x_param_values,
10681                            global_options_set.x_param_values);
10682   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10683     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10684                            aarch64_tune_params.prefetch->l2_cache_size,
10685                            opts->x_param_values,
10686                            global_options_set.x_param_values);
10687   if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10688     maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10689                            0,
10690                            opts->x_param_values,
10691                            global_options_set.x_param_values);
10692   if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10693     maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10694                            aarch64_tune_params.prefetch->minimum_stride,
10695                            opts->x_param_values,
10696                            global_options_set.x_param_values);
10697
10698   /* Use the alternative scheduling-pressure algorithm by default.  */
10699   maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10700                          opts->x_param_values,
10701                          global_options_set.x_param_values);
10702
10703   /* Enable sw prefetching at specified optimization level for
10704      CPUS that have prefetch.  Lower optimization level threshold by 1
10705      when profiling is enabled.  */
10706   if (opts->x_flag_prefetch_loop_arrays < 0
10707       && !opts->x_optimize_size
10708       && aarch64_tune_params.prefetch->default_opt_level >= 0
10709       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10710     opts->x_flag_prefetch_loop_arrays = 1;
10711
10712   aarch64_override_options_after_change_1 (opts);
10713 }
10714
10715 /* Print a hint with a suggestion for a core or architecture name that
10716    most closely resembles what the user passed in STR.  ARCH is true if
10717    the user is asking for an architecture name.  ARCH is false if the user
10718    is asking for a core name.  */
10719
10720 static void
10721 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10722 {
10723   auto_vec<const char *> candidates;
10724   const struct processor *entry = arch ? all_architectures : all_cores;
10725   for (; entry->name != NULL; entry++)
10726     candidates.safe_push (entry->name);
10727
10728 #ifdef HAVE_LOCAL_CPU_DETECT
10729   /* Add also "native" as possible value.  */
10730   if (arch)
10731     candidates.safe_push ("native");
10732 #endif
10733
10734   char *s;
10735   const char *hint = candidates_list_and_hint (str, s, candidates);
10736   if (hint)
10737     inform (input_location, "valid arguments are: %s;"
10738                              " did you mean %qs?", s, hint);
10739   else
10740     inform (input_location, "valid arguments are: %s", s);
10741
10742   XDELETEVEC (s);
10743 }
10744
10745 /* Print a hint with a suggestion for a core name that most closely resembles
10746    what the user passed in STR.  */
10747
10748 inline static void
10749 aarch64_print_hint_for_core (const char *str)
10750 {
10751   aarch64_print_hint_for_core_or_arch (str, false);
10752 }
10753
10754 /* Print a hint with a suggestion for an architecture name that most closely
10755    resembles what the user passed in STR.  */
10756
10757 inline static void
10758 aarch64_print_hint_for_arch (const char *str)
10759 {
10760   aarch64_print_hint_for_core_or_arch (str, true);
10761 }
10762
10763 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
10764    specified in STR and throw errors if appropriate.  Put the results if
10765    they are valid in RES and ISA_FLAGS.  Return whether the option is
10766    valid.  */
10767
10768 static bool
10769 aarch64_validate_mcpu (const char *str, const struct processor **res,
10770                        unsigned long *isa_flags)
10771 {
10772   enum aarch64_parse_opt_result parse_res
10773     = aarch64_parse_cpu (str, res, isa_flags);
10774
10775   if (parse_res == AARCH64_PARSE_OK)
10776     return true;
10777
10778   switch (parse_res)
10779     {
10780       case AARCH64_PARSE_MISSING_ARG:
10781         error ("missing cpu name in %<-mcpu=%s%>", str);
10782         break;
10783       case AARCH64_PARSE_INVALID_ARG:
10784         error ("unknown value %qs for -mcpu", str);
10785         aarch64_print_hint_for_core (str);
10786         break;
10787       case AARCH64_PARSE_INVALID_FEATURE:
10788         error ("invalid feature modifier in %<-mcpu=%s%>", str);
10789         break;
10790       default:
10791         gcc_unreachable ();
10792     }
10793
10794   return false;
10795 }
10796
10797 /* Validate a command-line -march option.  Parse the arch and extensions
10798    (if any) specified in STR and throw errors if appropriate.  Put the
10799    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
10800    option is valid.  */
10801
10802 static bool
10803 aarch64_validate_march (const char *str, const struct processor **res,
10804                          unsigned long *isa_flags)
10805 {
10806   enum aarch64_parse_opt_result parse_res
10807     = aarch64_parse_arch (str, res, isa_flags);
10808
10809   if (parse_res == AARCH64_PARSE_OK)
10810     return true;
10811
10812   switch (parse_res)
10813     {
10814       case AARCH64_PARSE_MISSING_ARG:
10815         error ("missing arch name in %<-march=%s%>", str);
10816         break;
10817       case AARCH64_PARSE_INVALID_ARG:
10818         error ("unknown value %qs for -march", str);
10819         aarch64_print_hint_for_arch (str);
10820         break;
10821       case AARCH64_PARSE_INVALID_FEATURE:
10822         error ("invalid feature modifier in %<-march=%s%>", str);
10823         break;
10824       default:
10825         gcc_unreachable ();
10826     }
10827
10828   return false;
10829 }
10830
10831 /* Validate a command-line -mtune option.  Parse the cpu
10832    specified in STR and throw errors if appropriate.  Put the
10833    result, if it is valid, in RES.  Return whether the option is
10834    valid.  */
10835
10836 static bool
10837 aarch64_validate_mtune (const char *str, const struct processor **res)
10838 {
10839   enum aarch64_parse_opt_result parse_res
10840     = aarch64_parse_tune (str, res);
10841
10842   if (parse_res == AARCH64_PARSE_OK)
10843     return true;
10844
10845   switch (parse_res)
10846     {
10847       case AARCH64_PARSE_MISSING_ARG:
10848         error ("missing cpu name in %<-mtune=%s%>", str);
10849         break;
10850       case AARCH64_PARSE_INVALID_ARG:
10851         error ("unknown value %qs for -mtune", str);
10852         aarch64_print_hint_for_core (str);
10853         break;
10854       default:
10855         gcc_unreachable ();
10856     }
10857   return false;
10858 }
10859
10860 /* Return the CPU corresponding to the enum CPU.
10861    If it doesn't specify a cpu, return the default.  */
10862
10863 static const struct processor *
10864 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10865 {
10866   if (cpu != aarch64_none)
10867     return &all_cores[cpu];
10868
10869   /* The & 0x3f is to extract the bottom 6 bits that encode the
10870      default cpu as selected by the --with-cpu GCC configure option
10871      in config.gcc.
10872      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10873      flags mechanism should be reworked to make it more sane.  */
10874   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10875 }
10876
10877 /* Return the architecture corresponding to the enum ARCH.
10878    If it doesn't specify a valid architecture, return the default.  */
10879
10880 static const struct processor *
10881 aarch64_get_arch (enum aarch64_arch arch)
10882 {
10883   if (arch != aarch64_no_arch)
10884     return &all_architectures[arch];
10885
10886   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10887
10888   return &all_architectures[cpu->arch];
10889 }
10890
10891 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
10892
10893 static poly_uint16
10894 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10895 {
10896   /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10897      This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10898      deciding which .md file patterns to use and when deciding whether
10899      something is a legitimate address or constant.  */
10900   if (value == SVE_SCALABLE || value == SVE_128)
10901     return poly_uint16 (2, 2);
10902   else
10903     return (int) value / 64;
10904 }
10905
10906 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
10907    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10908    tuning structs.  In particular it must set selected_tune and
10909    aarch64_isa_flags that define the available ISA features and tuning
10910    decisions.  It must also set selected_arch as this will be used to
10911    output the .arch asm tags for each function.  */
10912
10913 static void
10914 aarch64_override_options (void)
10915 {
10916   unsigned long cpu_isa = 0;
10917   unsigned long arch_isa = 0;
10918   aarch64_isa_flags = 0;
10919
10920   bool valid_cpu = true;
10921   bool valid_tune = true;
10922   bool valid_arch = true;
10923
10924   selected_cpu = NULL;
10925   selected_arch = NULL;
10926   selected_tune = NULL;
10927
10928   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10929      If either of -march or -mtune is given, they override their
10930      respective component of -mcpu.  */
10931   if (aarch64_cpu_string)
10932     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10933                                         &cpu_isa);
10934
10935   if (aarch64_arch_string)
10936     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10937                                           &arch_isa);
10938
10939   if (aarch64_tune_string)
10940     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10941
10942   /* If the user did not specify a processor, choose the default
10943      one for them.  This will be the CPU set during configuration using
10944      --with-cpu, otherwise it is "generic".  */
10945   if (!selected_cpu)
10946     {
10947       if (selected_arch)
10948         {
10949           selected_cpu = &all_cores[selected_arch->ident];
10950           aarch64_isa_flags = arch_isa;
10951           explicit_arch = selected_arch->arch;
10952         }
10953       else
10954         {
10955           /* Get default configure-time CPU.  */
10956           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10957           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10958         }
10959
10960       if (selected_tune)
10961         explicit_tune_core = selected_tune->ident;
10962     }
10963   /* If both -mcpu and -march are specified check that they are architecturally
10964      compatible, warn if they're not and prefer the -march ISA flags.  */
10965   else if (selected_arch)
10966     {
10967       if (selected_arch->arch != selected_cpu->arch)
10968         {
10969           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10970                        all_architectures[selected_cpu->arch].name,
10971                        selected_arch->name);
10972         }
10973       aarch64_isa_flags = arch_isa;
10974       explicit_arch = selected_arch->arch;
10975       explicit_tune_core = selected_tune ? selected_tune->ident
10976                                           : selected_cpu->ident;
10977     }
10978   else
10979     {
10980       /* -mcpu but no -march.  */
10981       aarch64_isa_flags = cpu_isa;
10982       explicit_tune_core = selected_tune ? selected_tune->ident
10983                                           : selected_cpu->ident;
10984       gcc_assert (selected_cpu);
10985       selected_arch = &all_architectures[selected_cpu->arch];
10986       explicit_arch = selected_arch->arch;
10987     }
10988
10989   /* Set the arch as well as we will need it when outputing
10990      the .arch directive in assembly.  */
10991   if (!selected_arch)
10992     {
10993       gcc_assert (selected_cpu);
10994       selected_arch = &all_architectures[selected_cpu->arch];
10995     }
10996
10997   if (!selected_tune)
10998     selected_tune = selected_cpu;
10999
11000 #ifndef HAVE_AS_MABI_OPTION
11001   /* The compiler may have been configured with 2.23.* binutils, which does
11002      not have support for ILP32.  */
11003   if (TARGET_ILP32)
11004     error ("assembler does not support -mabi=ilp32");
11005 #endif
11006
11007   /* Convert -msve-vector-bits to a VG count.  */
11008   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11009
11010   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11011     sorry ("return address signing is only supported for -mabi=lp64");
11012
11013   /* Make sure we properly set up the explicit options.  */
11014   if ((aarch64_cpu_string && valid_cpu)
11015        || (aarch64_tune_string && valid_tune))
11016     gcc_assert (explicit_tune_core != aarch64_none);
11017
11018   if ((aarch64_cpu_string && valid_cpu)
11019        || (aarch64_arch_string && valid_arch))
11020     gcc_assert (explicit_arch != aarch64_no_arch);
11021
11022   aarch64_override_options_internal (&global_options);
11023
11024   /* Save these options as the default ones in case we push and pop them later
11025      while processing functions with potential target attributes.  */
11026   target_option_default_node = target_option_current_node
11027       = build_target_option_node (&global_options);
11028 }
11029
11030 /* Implement targetm.override_options_after_change.  */
11031
11032 static void
11033 aarch64_override_options_after_change (void)
11034 {
11035   aarch64_override_options_after_change_1 (&global_options);
11036 }
11037
11038 static struct machine_function *
11039 aarch64_init_machine_status (void)
11040 {
11041   struct machine_function *machine;
11042   machine = ggc_cleared_alloc<machine_function> ();
11043   return machine;
11044 }
11045
11046 void
11047 aarch64_init_expanders (void)
11048 {
11049   init_machine_status = aarch64_init_machine_status;
11050 }
11051
11052 /* A checking mechanism for the implementation of the various code models.  */
11053 static void
11054 initialize_aarch64_code_model (struct gcc_options *opts)
11055 {
11056    if (opts->x_flag_pic)
11057      {
11058        switch (opts->x_aarch64_cmodel_var)
11059          {
11060          case AARCH64_CMODEL_TINY:
11061            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11062            break;
11063          case AARCH64_CMODEL_SMALL:
11064 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11065            aarch64_cmodel = (flag_pic == 2
11066                              ? AARCH64_CMODEL_SMALL_PIC
11067                              : AARCH64_CMODEL_SMALL_SPIC);
11068 #else
11069            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11070 #endif
11071            break;
11072          case AARCH64_CMODEL_LARGE:
11073            sorry ("code model %qs with -f%s", "large",
11074                   opts->x_flag_pic > 1 ? "PIC" : "pic");
11075            break;
11076          default:
11077            gcc_unreachable ();
11078          }
11079      }
11080    else
11081      aarch64_cmodel = opts->x_aarch64_cmodel_var;
11082 }
11083
11084 /* Implement TARGET_OPTION_SAVE.  */
11085
11086 static void
11087 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11088 {
11089   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11090 }
11091
11092 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
11093    using the information saved in PTR.  */
11094
11095 static void
11096 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11097 {
11098   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11099   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11100   opts->x_explicit_arch = ptr->x_explicit_arch;
11101   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11102   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11103
11104   aarch64_override_options_internal (opts);
11105 }
11106
11107 /* Implement TARGET_OPTION_PRINT.  */
11108
11109 static void
11110 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11111 {
11112   const struct processor *cpu
11113     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11114   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11115   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11116   std::string extension
11117     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11118
11119   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11120   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11121            arch->name, extension.c_str ());
11122 }
11123
11124 static GTY(()) tree aarch64_previous_fndecl;
11125
11126 void
11127 aarch64_reset_previous_fndecl (void)
11128 {
11129   aarch64_previous_fndecl = NULL;
11130 }
11131
11132 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11133    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11134    make sure optab availability predicates are recomputed when necessary.  */
11135
11136 void
11137 aarch64_save_restore_target_globals (tree new_tree)
11138 {
11139   if (TREE_TARGET_GLOBALS (new_tree))
11140     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11141   else if (new_tree == target_option_default_node)
11142     restore_target_globals (&default_target_globals);
11143   else
11144     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11145 }
11146
11147 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
11148    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11149    of the function, if such exists.  This function may be called multiple
11150    times on a single function so use aarch64_previous_fndecl to avoid
11151    setting up identical state.  */
11152
11153 static void
11154 aarch64_set_current_function (tree fndecl)
11155 {
11156   if (!fndecl || fndecl == aarch64_previous_fndecl)
11157     return;
11158
11159   tree old_tree = (aarch64_previous_fndecl
11160                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11161                    : NULL_TREE);
11162
11163   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11164
11165   /* If current function has no attributes but the previous one did,
11166      use the default node.  */
11167   if (!new_tree && old_tree)
11168     new_tree = target_option_default_node;
11169
11170   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
11171      the default have been handled by aarch64_save_restore_target_globals from
11172      aarch64_pragma_target_parse.  */
11173   if (old_tree == new_tree)
11174     return;
11175
11176   aarch64_previous_fndecl = fndecl;
11177
11178   /* First set the target options.  */
11179   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11180
11181   aarch64_save_restore_target_globals (new_tree);
11182 }
11183
11184 /* Enum describing the various ways we can handle attributes.
11185    In many cases we can reuse the generic option handling machinery.  */
11186
11187 enum aarch64_attr_opt_type
11188 {
11189   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
11190   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
11191   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
11192   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
11193 };
11194
11195 /* All the information needed to handle a target attribute.
11196    NAME is the name of the attribute.
11197    ATTR_TYPE specifies the type of behavior of the attribute as described
11198    in the definition of enum aarch64_attr_opt_type.
11199    ALLOW_NEG is true if the attribute supports a "no-" form.
11200    HANDLER is the function that takes the attribute string as an argument
11201    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11202    OPT_NUM is the enum specifying the option that the attribute modifies.
11203    This is needed for attributes that mirror the behavior of a command-line
11204    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11205    aarch64_attr_enum.  */
11206
11207 struct aarch64_attribute_info
11208 {
11209   const char *name;
11210   enum aarch64_attr_opt_type attr_type;
11211   bool allow_neg;
11212   bool (*handler) (const char *);
11213   enum opt_code opt_num;
11214 };
11215
11216 /* Handle the ARCH_STR argument to the arch= target attribute.  */
11217
11218 static bool
11219 aarch64_handle_attr_arch (const char *str)
11220 {
11221   const struct processor *tmp_arch = NULL;
11222   enum aarch64_parse_opt_result parse_res
11223     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11224
11225   if (parse_res == AARCH64_PARSE_OK)
11226     {
11227       gcc_assert (tmp_arch);
11228       selected_arch = tmp_arch;
11229       explicit_arch = selected_arch->arch;
11230       return true;
11231     }
11232
11233   switch (parse_res)
11234     {
11235       case AARCH64_PARSE_MISSING_ARG:
11236         error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11237         break;
11238       case AARCH64_PARSE_INVALID_ARG:
11239         error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11240         aarch64_print_hint_for_arch (str);
11241         break;
11242       case AARCH64_PARSE_INVALID_FEATURE:
11243         error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11244         break;
11245       default:
11246         gcc_unreachable ();
11247     }
11248
11249   return false;
11250 }
11251
11252 /* Handle the argument CPU_STR to the cpu= target attribute.  */
11253
11254 static bool
11255 aarch64_handle_attr_cpu (const char *str)
11256 {
11257   const struct processor *tmp_cpu = NULL;
11258   enum aarch64_parse_opt_result parse_res
11259     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11260
11261   if (parse_res == AARCH64_PARSE_OK)
11262     {
11263       gcc_assert (tmp_cpu);
11264       selected_tune = tmp_cpu;
11265       explicit_tune_core = selected_tune->ident;
11266
11267       selected_arch = &all_architectures[tmp_cpu->arch];
11268       explicit_arch = selected_arch->arch;
11269       return true;
11270     }
11271
11272   switch (parse_res)
11273     {
11274       case AARCH64_PARSE_MISSING_ARG:
11275         error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11276         break;
11277       case AARCH64_PARSE_INVALID_ARG:
11278         error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11279         aarch64_print_hint_for_core (str);
11280         break;
11281       case AARCH64_PARSE_INVALID_FEATURE:
11282         error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11283         break;
11284       default:
11285         gcc_unreachable ();
11286     }
11287
11288   return false;
11289 }
11290
11291 /* Handle the argument STR to the tune= target attribute.  */
11292
11293 static bool
11294 aarch64_handle_attr_tune (const char *str)
11295 {
11296   const struct processor *tmp_tune = NULL;
11297   enum aarch64_parse_opt_result parse_res
11298     = aarch64_parse_tune (str, &tmp_tune);
11299
11300   if (parse_res == AARCH64_PARSE_OK)
11301     {
11302       gcc_assert (tmp_tune);
11303       selected_tune = tmp_tune;
11304       explicit_tune_core = selected_tune->ident;
11305       return true;
11306     }
11307
11308   switch (parse_res)
11309     {
11310       case AARCH64_PARSE_INVALID_ARG:
11311         error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11312         aarch64_print_hint_for_core (str);
11313         break;
11314       default:
11315         gcc_unreachable ();
11316     }
11317
11318   return false;
11319 }
11320
11321 /* Parse an architecture extensions target attribute string specified in STR.
11322    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
11323    if successful.  Update aarch64_isa_flags to reflect the ISA features
11324    modified.  */
11325
11326 static bool
11327 aarch64_handle_attr_isa_flags (char *str)
11328 {
11329   enum aarch64_parse_opt_result parse_res;
11330   unsigned long isa_flags = aarch64_isa_flags;
11331
11332   /* We allow "+nothing" in the beginning to clear out all architectural
11333      features if the user wants to handpick specific features.  */
11334   if (strncmp ("+nothing", str, 8) == 0)
11335     {
11336       isa_flags = 0;
11337       str += 8;
11338     }
11339
11340   parse_res = aarch64_parse_extension (str, &isa_flags);
11341
11342   if (parse_res == AARCH64_PARSE_OK)
11343     {
11344       aarch64_isa_flags = isa_flags;
11345       return true;
11346     }
11347
11348   switch (parse_res)
11349     {
11350       case AARCH64_PARSE_MISSING_ARG:
11351         error ("missing value in %<target()%> pragma or attribute");
11352         break;
11353
11354       case AARCH64_PARSE_INVALID_FEATURE:
11355         error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11356         break;
11357
11358       default:
11359         gcc_unreachable ();
11360     }
11361
11362  return false;
11363 }
11364
11365 /* The target attributes that we support.  On top of these we also support just
11366    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
11367    handled explicitly in aarch64_process_one_target_attr.  */
11368
11369 static const struct aarch64_attribute_info aarch64_attributes[] =
11370 {
11371   { "general-regs-only", aarch64_attr_mask, false, NULL,
11372      OPT_mgeneral_regs_only },
11373   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11374      OPT_mfix_cortex_a53_835769 },
11375   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11376      OPT_mfix_cortex_a53_843419 },
11377   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11378   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11379   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11380      OPT_momit_leaf_frame_pointer },
11381   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11382   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11383      OPT_march_ },
11384   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11385   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11386      OPT_mtune_ },
11387   { "sign-return-address", aarch64_attr_enum, false, NULL,
11388      OPT_msign_return_address_ },
11389   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11390 };
11391
11392 /* Parse ARG_STR which contains the definition of one target attribute.
11393    Show appropriate errors if any or return true if the attribute is valid.  */
11394
11395 static bool
11396 aarch64_process_one_target_attr (char *arg_str)
11397 {
11398   bool invert = false;
11399
11400   size_t len = strlen (arg_str);
11401
11402   if (len == 0)
11403     {
11404       error ("malformed %<target()%> pragma or attribute");
11405       return false;
11406     }
11407
11408   char *str_to_check = (char *) alloca (len + 1);
11409   strcpy (str_to_check, arg_str);
11410
11411   /* Skip leading whitespace.  */
11412   while (*str_to_check == ' ' || *str_to_check == '\t')
11413     str_to_check++;
11414
11415   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11416      It is easier to detect and handle it explicitly here rather than going
11417      through the machinery for the rest of the target attributes in this
11418      function.  */
11419   if (*str_to_check == '+')
11420     return aarch64_handle_attr_isa_flags (str_to_check);
11421
11422   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11423     {
11424       invert = true;
11425       str_to_check += 3;
11426     }
11427   char *arg = strchr (str_to_check, '=');
11428
11429   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11430      and point ARG to "foo".  */
11431   if (arg)
11432     {
11433       *arg = '\0';
11434       arg++;
11435     }
11436   const struct aarch64_attribute_info *p_attr;
11437   bool found = false;
11438   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11439     {
11440       /* If the names don't match up, or the user has given an argument
11441          to an attribute that doesn't accept one, or didn't give an argument
11442          to an attribute that expects one, fail to match.  */
11443       if (strcmp (str_to_check, p_attr->name) != 0)
11444         continue;
11445
11446       found = true;
11447       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11448                               || p_attr->attr_type == aarch64_attr_enum;
11449
11450       if (attr_need_arg_p ^ (arg != NULL))
11451         {
11452           error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11453           return false;
11454         }
11455
11456       /* If the name matches but the attribute does not allow "no-" versions
11457          then we can't match.  */
11458       if (invert && !p_attr->allow_neg)
11459         {
11460           error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11461           return false;
11462         }
11463
11464       switch (p_attr->attr_type)
11465         {
11466         /* Has a custom handler registered.
11467            For example, cpu=, arch=, tune=.  */
11468           case aarch64_attr_custom:
11469             gcc_assert (p_attr->handler);
11470             if (!p_attr->handler (arg))
11471               return false;
11472             break;
11473
11474           /* Either set or unset a boolean option.  */
11475           case aarch64_attr_bool:
11476             {
11477               struct cl_decoded_option decoded;
11478
11479               generate_option (p_attr->opt_num, NULL, !invert,
11480                                CL_TARGET, &decoded);
11481               aarch64_handle_option (&global_options, &global_options_set,
11482                                       &decoded, input_location);
11483               break;
11484             }
11485           /* Set or unset a bit in the target_flags.  aarch64_handle_option
11486              should know what mask to apply given the option number.  */
11487           case aarch64_attr_mask:
11488             {
11489               struct cl_decoded_option decoded;
11490               /* We only need to specify the option number.
11491                  aarch64_handle_option will know which mask to apply.  */
11492               decoded.opt_index = p_attr->opt_num;
11493               decoded.value = !invert;
11494               aarch64_handle_option (&global_options, &global_options_set,
11495                                       &decoded, input_location);
11496               break;
11497             }
11498           /* Use the option setting machinery to set an option to an enum.  */
11499           case aarch64_attr_enum:
11500             {
11501               gcc_assert (arg);
11502               bool valid;
11503               int value;
11504               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11505                                               &value, CL_TARGET);
11506               if (valid)
11507                 {
11508                   set_option (&global_options, NULL, p_attr->opt_num, value,
11509                               NULL, DK_UNSPECIFIED, input_location,
11510                               global_dc);
11511                 }
11512               else
11513                 {
11514                   error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11515                 }
11516               break;
11517             }
11518           default:
11519             gcc_unreachable ();
11520         }
11521     }
11522
11523   /* If we reached here we either have found an attribute and validated
11524      it or didn't match any.  If we matched an attribute but its arguments
11525      were malformed we will have returned false already.  */
11526   return found;
11527 }
11528
11529 /* Count how many times the character C appears in
11530    NULL-terminated string STR.  */
11531
11532 static unsigned int
11533 num_occurences_in_str (char c, char *str)
11534 {
11535   unsigned int res = 0;
11536   while (*str != '\0')
11537     {
11538       if (*str == c)
11539         res++;
11540
11541       str++;
11542     }
11543
11544   return res;
11545 }
11546
11547 /* Parse the tree in ARGS that contains the target attribute information
11548    and update the global target options space.  */
11549
11550 bool
11551 aarch64_process_target_attr (tree args)
11552 {
11553   if (TREE_CODE (args) == TREE_LIST)
11554     {
11555       do
11556         {
11557           tree head = TREE_VALUE (args);
11558           if (head)
11559             {
11560               if (!aarch64_process_target_attr (head))
11561                 return false;
11562             }
11563           args = TREE_CHAIN (args);
11564         } while (args);
11565
11566       return true;
11567     }
11568
11569   if (TREE_CODE (args) != STRING_CST)
11570     {
11571       error ("attribute %<target%> argument not a string");
11572       return false;
11573     }
11574
11575   size_t len = strlen (TREE_STRING_POINTER (args));
11576   char *str_to_check = (char *) alloca (len + 1);
11577   strcpy (str_to_check, TREE_STRING_POINTER (args));
11578
11579   if (len == 0)
11580     {
11581       error ("malformed %<target()%> pragma or attribute");
11582       return false;
11583     }
11584
11585   /* Used to catch empty spaces between commas i.e.
11586      attribute ((target ("attr1,,attr2"))).  */
11587   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11588
11589   /* Handle multiple target attributes separated by ','.  */
11590   char *token = strtok (str_to_check, ",");
11591
11592   unsigned int num_attrs = 0;
11593   while (token)
11594     {
11595       num_attrs++;
11596       if (!aarch64_process_one_target_attr (token))
11597         {
11598           error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11599           return false;
11600         }
11601
11602       token = strtok (NULL, ",");
11603     }
11604
11605   if (num_attrs != num_commas + 1)
11606     {
11607       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11608       return false;
11609     }
11610
11611   return true;
11612 }
11613
11614 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
11615    process attribute ((target ("..."))).  */
11616
11617 static bool
11618 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11619 {
11620   struct cl_target_option cur_target;
11621   bool ret;
11622   tree old_optimize;
11623   tree new_target, new_optimize;
11624   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11625
11626   /* If what we're processing is the current pragma string then the
11627      target option node is already stored in target_option_current_node
11628      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
11629      having to re-parse the string.  This is especially useful to keep
11630      arm_neon.h compile times down since that header contains a lot
11631      of intrinsics enclosed in pragmas.  */
11632   if (!existing_target && args == current_target_pragma)
11633     {
11634       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11635       return true;
11636     }
11637   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11638
11639   old_optimize = build_optimization_node (&global_options);
11640   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11641
11642   /* If the function changed the optimization levels as well as setting
11643      target options, start with the optimizations specified.  */
11644   if (func_optimize && func_optimize != old_optimize)
11645     cl_optimization_restore (&global_options,
11646                              TREE_OPTIMIZATION (func_optimize));
11647
11648   /* Save the current target options to restore at the end.  */
11649   cl_target_option_save (&cur_target, &global_options);
11650
11651   /* If fndecl already has some target attributes applied to it, unpack
11652      them so that we add this attribute on top of them, rather than
11653      overwriting them.  */
11654   if (existing_target)
11655     {
11656       struct cl_target_option *existing_options
11657         = TREE_TARGET_OPTION (existing_target);
11658
11659       if (existing_options)
11660         cl_target_option_restore (&global_options, existing_options);
11661     }
11662   else
11663     cl_target_option_restore (&global_options,
11664                         TREE_TARGET_OPTION (target_option_current_node));
11665
11666   ret = aarch64_process_target_attr (args);
11667
11668   /* Set up any additional state.  */
11669   if (ret)
11670     {
11671       aarch64_override_options_internal (&global_options);
11672       /* Initialize SIMD builtins if we haven't already.
11673          Set current_target_pragma to NULL for the duration so that
11674          the builtin initialization code doesn't try to tag the functions
11675          being built with the attributes specified by any current pragma, thus
11676          going into an infinite recursion.  */
11677       if (TARGET_SIMD)
11678         {
11679           tree saved_current_target_pragma = current_target_pragma;
11680           current_target_pragma = NULL;
11681           aarch64_init_simd_builtins ();
11682           current_target_pragma = saved_current_target_pragma;
11683         }
11684       new_target = build_target_option_node (&global_options);
11685     }
11686   else
11687     new_target = NULL;
11688
11689   new_optimize = build_optimization_node (&global_options);
11690
11691   if (fndecl && ret)
11692     {
11693       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11694
11695       if (old_optimize != new_optimize)
11696         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11697     }
11698
11699   cl_target_option_restore (&global_options, &cur_target);
11700
11701   if (old_optimize != new_optimize)
11702     cl_optimization_restore (&global_options,
11703                              TREE_OPTIMIZATION (old_optimize));
11704   return ret;
11705 }
11706
11707 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
11708    tri-bool options (yes, no, don't care) and the default value is
11709    DEF, determine whether to reject inlining.  */
11710
11711 static bool
11712 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11713                                      int dont_care, int def)
11714 {
11715   /* If the callee doesn't care, always allow inlining.  */
11716   if (callee == dont_care)
11717     return true;
11718
11719   /* If the caller doesn't care, always allow inlining.  */
11720   if (caller == dont_care)
11721     return true;
11722
11723   /* Otherwise, allow inlining if either the callee and caller values
11724      agree, or if the callee is using the default value.  */
11725   return (callee == caller || callee == def);
11726 }
11727
11728 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
11729    to inline CALLEE into CALLER based on target-specific info.
11730    Make sure that the caller and callee have compatible architectural
11731    features.  Then go through the other possible target attributes
11732    and see if they can block inlining.  Try not to reject always_inline
11733    callees unless they are incompatible architecturally.  */
11734
11735 static bool
11736 aarch64_can_inline_p (tree caller, tree callee)
11737 {
11738   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11739   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11740
11741   struct cl_target_option *caller_opts
11742         = TREE_TARGET_OPTION (caller_tree ? caller_tree
11743                                            : target_option_default_node);
11744
11745   struct cl_target_option *callee_opts
11746         = TREE_TARGET_OPTION (callee_tree ? callee_tree
11747                                            : target_option_default_node);
11748
11749   /* Callee's ISA flags should be a subset of the caller's.  */
11750   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11751        != callee_opts->x_aarch64_isa_flags)
11752     return false;
11753
11754   /* Allow non-strict aligned functions inlining into strict
11755      aligned ones.  */
11756   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11757        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11758       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11759            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11760     return false;
11761
11762   bool always_inline = lookup_attribute ("always_inline",
11763                                           DECL_ATTRIBUTES (callee));
11764
11765   /* If the architectural features match up and the callee is always_inline
11766      then the other attributes don't matter.  */
11767   if (always_inline)
11768     return true;
11769
11770   if (caller_opts->x_aarch64_cmodel_var
11771       != callee_opts->x_aarch64_cmodel_var)
11772     return false;
11773
11774   if (caller_opts->x_aarch64_tls_dialect
11775       != callee_opts->x_aarch64_tls_dialect)
11776     return false;
11777
11778   /* Honour explicit requests to workaround errata.  */
11779   if (!aarch64_tribools_ok_for_inlining_p (
11780           caller_opts->x_aarch64_fix_a53_err835769,
11781           callee_opts->x_aarch64_fix_a53_err835769,
11782           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11783     return false;
11784
11785   if (!aarch64_tribools_ok_for_inlining_p (
11786           caller_opts->x_aarch64_fix_a53_err843419,
11787           callee_opts->x_aarch64_fix_a53_err843419,
11788           2, TARGET_FIX_ERR_A53_843419))
11789     return false;
11790
11791   /* If the user explicitly specified -momit-leaf-frame-pointer for the
11792      caller and calle and they don't match up, reject inlining.  */
11793   if (!aarch64_tribools_ok_for_inlining_p (
11794           caller_opts->x_flag_omit_leaf_frame_pointer,
11795           callee_opts->x_flag_omit_leaf_frame_pointer,
11796           2, 1))
11797     return false;
11798
11799   /* If the callee has specific tuning overrides, respect them.  */
11800   if (callee_opts->x_aarch64_override_tune_string != NULL
11801       && caller_opts->x_aarch64_override_tune_string == NULL)
11802     return false;
11803
11804   /* If the user specified tuning override strings for the
11805      caller and callee and they don't match up, reject inlining.
11806      We just do a string compare here, we don't analyze the meaning
11807      of the string, as it would be too costly for little gain.  */
11808   if (callee_opts->x_aarch64_override_tune_string
11809       && caller_opts->x_aarch64_override_tune_string
11810       && (strcmp (callee_opts->x_aarch64_override_tune_string,
11811                   caller_opts->x_aarch64_override_tune_string) != 0))
11812     return false;
11813
11814   return true;
11815 }
11816
11817 /* Return true if SYMBOL_REF X binds locally.  */
11818
11819 static bool
11820 aarch64_symbol_binds_local_p (const_rtx x)
11821 {
11822   return (SYMBOL_REF_DECL (x)
11823           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11824           : SYMBOL_REF_LOCAL_P (x));
11825 }
11826
11827 /* Return true if SYMBOL_REF X is thread local */
11828 static bool
11829 aarch64_tls_symbol_p (rtx x)
11830 {
11831   if (! TARGET_HAVE_TLS)
11832     return false;
11833
11834   if (GET_CODE (x) != SYMBOL_REF)
11835     return false;
11836
11837   return SYMBOL_REF_TLS_MODEL (x) != 0;
11838 }
11839
11840 /* Classify a TLS symbol into one of the TLS kinds.  */
11841 enum aarch64_symbol_type
11842 aarch64_classify_tls_symbol (rtx x)
11843 {
11844   enum tls_model tls_kind = tls_symbolic_operand_type (x);
11845
11846   switch (tls_kind)
11847     {
11848     case TLS_MODEL_GLOBAL_DYNAMIC:
11849     case TLS_MODEL_LOCAL_DYNAMIC:
11850       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11851
11852     case TLS_MODEL_INITIAL_EXEC:
11853       switch (aarch64_cmodel)
11854         {
11855         case AARCH64_CMODEL_TINY:
11856         case AARCH64_CMODEL_TINY_PIC:
11857           return SYMBOL_TINY_TLSIE;
11858         default:
11859           return SYMBOL_SMALL_TLSIE;
11860         }
11861
11862     case TLS_MODEL_LOCAL_EXEC:
11863       if (aarch64_tls_size == 12)
11864         return SYMBOL_TLSLE12;
11865       else if (aarch64_tls_size == 24)
11866         return SYMBOL_TLSLE24;
11867       else if (aarch64_tls_size == 32)
11868         return SYMBOL_TLSLE32;
11869       else if (aarch64_tls_size == 48)
11870         return SYMBOL_TLSLE48;
11871       else
11872         gcc_unreachable ();
11873
11874     case TLS_MODEL_EMULATED:
11875     case TLS_MODEL_NONE:
11876       return SYMBOL_FORCE_TO_MEM;
11877
11878     default:
11879       gcc_unreachable ();
11880     }
11881 }
11882
11883 /* Return the correct method for accessing X + OFFSET, where X is either
11884    a SYMBOL_REF or LABEL_REF.  */
11885
11886 enum aarch64_symbol_type
11887 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11888 {
11889   if (GET_CODE (x) == LABEL_REF)
11890     {
11891       switch (aarch64_cmodel)
11892         {
11893         case AARCH64_CMODEL_LARGE:
11894           return SYMBOL_FORCE_TO_MEM;
11895
11896         case AARCH64_CMODEL_TINY_PIC:
11897         case AARCH64_CMODEL_TINY:
11898           return SYMBOL_TINY_ABSOLUTE;
11899
11900         case AARCH64_CMODEL_SMALL_SPIC:
11901         case AARCH64_CMODEL_SMALL_PIC:
11902         case AARCH64_CMODEL_SMALL:
11903           return SYMBOL_SMALL_ABSOLUTE;
11904
11905         default:
11906           gcc_unreachable ();
11907         }
11908     }
11909
11910   if (GET_CODE (x) == SYMBOL_REF)
11911     {
11912       if (aarch64_tls_symbol_p (x))
11913         return aarch64_classify_tls_symbol (x);
11914
11915       switch (aarch64_cmodel)
11916         {
11917         case AARCH64_CMODEL_TINY:
11918           /* When we retrieve symbol + offset address, we have to make sure
11919              the offset does not cause overflow of the final address.  But
11920              we have no way of knowing the address of symbol at compile time
11921              so we can't accurately say if the distance between the PC and
11922              symbol + offset is outside the addressible range of +/-1M in the
11923              TINY code model.  So we rely on images not being greater than
11924              1M and cap the offset at 1M and anything beyond 1M will have to
11925              be loaded using an alternative mechanism.  Furthermore if the
11926              symbol is a weak reference to something that isn't known to
11927              resolve to a symbol in this module, then force to memory.  */
11928           if ((SYMBOL_REF_WEAK (x)
11929                && !aarch64_symbol_binds_local_p (x))
11930               || !IN_RANGE (offset, -1048575, 1048575))
11931             return SYMBOL_FORCE_TO_MEM;
11932           return SYMBOL_TINY_ABSOLUTE;
11933
11934         case AARCH64_CMODEL_SMALL:
11935           /* Same reasoning as the tiny code model, but the offset cap here is
11936              4G.  */
11937           if ((SYMBOL_REF_WEAK (x)
11938                && !aarch64_symbol_binds_local_p (x))
11939               || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11940                             HOST_WIDE_INT_C (4294967264)))
11941             return SYMBOL_FORCE_TO_MEM;
11942           return SYMBOL_SMALL_ABSOLUTE;
11943
11944         case AARCH64_CMODEL_TINY_PIC:
11945           if (!aarch64_symbol_binds_local_p (x))
11946             return SYMBOL_TINY_GOT;
11947           return SYMBOL_TINY_ABSOLUTE;
11948
11949         case AARCH64_CMODEL_SMALL_SPIC:
11950         case AARCH64_CMODEL_SMALL_PIC:
11951           if (!aarch64_symbol_binds_local_p (x))
11952             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11953                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11954           return SYMBOL_SMALL_ABSOLUTE;
11955
11956         case AARCH64_CMODEL_LARGE:
11957           /* This is alright even in PIC code as the constant
11958              pool reference is always PC relative and within
11959              the same translation unit.  */
11960           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11961             return SYMBOL_SMALL_ABSOLUTE;
11962           else
11963             return SYMBOL_FORCE_TO_MEM;
11964
11965         default:
11966           gcc_unreachable ();
11967         }
11968     }
11969
11970   /* By default push everything into the constant pool.  */
11971   return SYMBOL_FORCE_TO_MEM;
11972 }
11973
11974 bool
11975 aarch64_constant_address_p (rtx x)
11976 {
11977   return (CONSTANT_P (x) && memory_address_p (DImode, x));
11978 }
11979
11980 bool
11981 aarch64_legitimate_pic_operand_p (rtx x)
11982 {
11983   if (GET_CODE (x) == SYMBOL_REF
11984       || (GET_CODE (x) == CONST
11985           && GET_CODE (XEXP (x, 0)) == PLUS
11986           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11987      return false;
11988
11989   return true;
11990 }
11991
11992 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
11993    that should be rematerialized rather than spilled.  */
11994
11995 static bool
11996 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11997 {
11998   /* Support CSE and rematerialization of common constants.  */
11999   if (CONST_INT_P (x)
12000       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12001       || GET_CODE (x) == CONST_VECTOR)
12002     return true;
12003
12004   /* Do not allow vector struct mode constants for Advanced SIMD.
12005      We could support 0 and -1 easily, but they need support in
12006      aarch64-simd.md.  */
12007   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12008   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12009     return false;
12010
12011   /* Only accept variable-length vector constants if they can be
12012      handled directly.
12013
12014      ??? It would be possible to handle rematerialization of other
12015      constants via secondary reloads.  */
12016   if (vec_flags & VEC_ANY_SVE)
12017     return aarch64_simd_valid_immediate (x, NULL);
12018
12019   if (GET_CODE (x) == HIGH)
12020     x = XEXP (x, 0);
12021
12022   /* Accept polynomial constants that can be calculated by using the
12023      destination of a move as the sole temporary.  Constants that
12024      require a second temporary cannot be rematerialized (they can't be
12025      forced to memory and also aren't legitimate constants).  */
12026   poly_int64 offset;
12027   if (poly_int_rtx_p (x, &offset))
12028     return aarch64_offset_temporaries (false, offset) <= 1;
12029
12030   /* If an offset is being added to something else, we need to allow the
12031      base to be moved into the destination register, meaning that there
12032      are no free temporaries for the offset.  */
12033   x = strip_offset (x, &offset);
12034   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12035     return false;
12036
12037   /* Do not allow const (plus (anchor_symbol, const_int)).  */
12038   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12039     return false;
12040
12041   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
12042      so spilling them is better than rematerialization.  */
12043   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12044     return true;
12045
12046   /* Label references are always constant.  */
12047   if (GET_CODE (x) == LABEL_REF)
12048     return true;
12049
12050   return false;
12051 }
12052
12053 rtx
12054 aarch64_load_tp (rtx target)
12055 {
12056   if (!target
12057       || GET_MODE (target) != Pmode
12058       || !register_operand (target, Pmode))
12059     target = gen_reg_rtx (Pmode);
12060
12061   /* Can return in any reg.  */
12062   emit_insn (gen_aarch64_load_tp_hard (target));
12063   return target;
12064 }
12065
12066 /* On AAPCS systems, this is the "struct __va_list".  */
12067 static GTY(()) tree va_list_type;
12068
12069 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12070    Return the type to use as __builtin_va_list.
12071
12072    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12073
12074    struct __va_list
12075    {
12076      void *__stack;
12077      void *__gr_top;
12078      void *__vr_top;
12079      int   __gr_offs;
12080      int   __vr_offs;
12081    };  */
12082
12083 static tree
12084 aarch64_build_builtin_va_list (void)
12085 {
12086   tree va_list_name;
12087   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12088
12089   /* Create the type.  */
12090   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12091   /* Give it the required name.  */
12092   va_list_name = build_decl (BUILTINS_LOCATION,
12093                              TYPE_DECL,
12094                              get_identifier ("__va_list"),
12095                              va_list_type);
12096   DECL_ARTIFICIAL (va_list_name) = 1;
12097   TYPE_NAME (va_list_type) = va_list_name;
12098   TYPE_STUB_DECL (va_list_type) = va_list_name;
12099
12100   /* Create the fields.  */
12101   f_stack = build_decl (BUILTINS_LOCATION,
12102                         FIELD_DECL, get_identifier ("__stack"),
12103                         ptr_type_node);
12104   f_grtop = build_decl (BUILTINS_LOCATION,
12105                         FIELD_DECL, get_identifier ("__gr_top"),
12106                         ptr_type_node);
12107   f_vrtop = build_decl (BUILTINS_LOCATION,
12108                         FIELD_DECL, get_identifier ("__vr_top"),
12109                         ptr_type_node);
12110   f_groff = build_decl (BUILTINS_LOCATION,
12111                         FIELD_DECL, get_identifier ("__gr_offs"),
12112                         integer_type_node);
12113   f_vroff = build_decl (BUILTINS_LOCATION,
12114                         FIELD_DECL, get_identifier ("__vr_offs"),
12115                         integer_type_node);
12116
12117   /* Tell tree-stdarg pass about our internal offset fields.
12118      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
12119      purpose to identify whether the code is updating va_list internal
12120      offset fields through irregular way.  */
12121   va_list_gpr_counter_field = f_groff;
12122   va_list_fpr_counter_field = f_vroff;
12123
12124   DECL_ARTIFICIAL (f_stack) = 1;
12125   DECL_ARTIFICIAL (f_grtop) = 1;
12126   DECL_ARTIFICIAL (f_vrtop) = 1;
12127   DECL_ARTIFICIAL (f_groff) = 1;
12128   DECL_ARTIFICIAL (f_vroff) = 1;
12129
12130   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12131   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12132   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12133   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12134   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12135
12136   TYPE_FIELDS (va_list_type) = f_stack;
12137   DECL_CHAIN (f_stack) = f_grtop;
12138   DECL_CHAIN (f_grtop) = f_vrtop;
12139   DECL_CHAIN (f_vrtop) = f_groff;
12140   DECL_CHAIN (f_groff) = f_vroff;
12141
12142   /* Compute its layout.  */
12143   layout_type (va_list_type);
12144
12145   return va_list_type;
12146 }
12147
12148 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
12149 static void
12150 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12151 {
12152   const CUMULATIVE_ARGS *cum;
12153   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12154   tree stack, grtop, vrtop, groff, vroff;
12155   tree t;
12156   int gr_save_area_size = cfun->va_list_gpr_size;
12157   int vr_save_area_size = cfun->va_list_fpr_size;
12158   int vr_offset;
12159
12160   cum = &crtl->args.info;
12161   if (cfun->va_list_gpr_size)
12162     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12163                              cfun->va_list_gpr_size);
12164   if (cfun->va_list_fpr_size)
12165     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12166                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
12167
12168   if (!TARGET_FLOAT)
12169     {
12170       gcc_assert (cum->aapcs_nvrn == 0);
12171       vr_save_area_size = 0;
12172     }
12173
12174   f_stack = TYPE_FIELDS (va_list_type_node);
12175   f_grtop = DECL_CHAIN (f_stack);
12176   f_vrtop = DECL_CHAIN (f_grtop);
12177   f_groff = DECL_CHAIN (f_vrtop);
12178   f_vroff = DECL_CHAIN (f_groff);
12179
12180   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12181                   NULL_TREE);
12182   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12183                   NULL_TREE);
12184   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12185                   NULL_TREE);
12186   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12187                   NULL_TREE);
12188   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12189                   NULL_TREE);
12190
12191   /* Emit code to initialize STACK, which points to the next varargs stack
12192      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
12193      by named arguments.  STACK is 8-byte aligned.  */
12194   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12195   if (cum->aapcs_stack_size > 0)
12196     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12197   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12198   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12199
12200   /* Emit code to initialize GRTOP, the top of the GR save area.
12201      virtual_incoming_args_rtx should have been 16 byte aligned.  */
12202   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12203   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12204   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12205
12206   /* Emit code to initialize VRTOP, the top of the VR save area.
12207      This address is gr_save_area_bytes below GRTOP, rounded
12208      down to the next 16-byte boundary.  */
12209   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12210   vr_offset = ROUND_UP (gr_save_area_size,
12211                         STACK_BOUNDARY / BITS_PER_UNIT);
12212
12213   if (vr_offset)
12214     t = fold_build_pointer_plus_hwi (t, -vr_offset);
12215   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12216   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12217
12218   /* Emit code to initialize GROFF, the offset from GRTOP of the
12219      next GPR argument.  */
12220   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12221               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12222   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12223
12224   /* Likewise emit code to initialize VROFF, the offset from FTOP
12225      of the next VR argument.  */
12226   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12227               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12228   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12229 }
12230
12231 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
12232
12233 static tree
12234 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12235                               gimple_seq *post_p ATTRIBUTE_UNUSED)
12236 {
12237   tree addr;
12238   bool indirect_p;
12239   bool is_ha;           /* is HFA or HVA.  */
12240   bool dw_align;        /* double-word align.  */
12241   machine_mode ag_mode = VOIDmode;
12242   int nregs;
12243   machine_mode mode;
12244
12245   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12246   tree stack, f_top, f_off, off, arg, roundup, on_stack;
12247   HOST_WIDE_INT size, rsize, adjust, align;
12248   tree t, u, cond1, cond2;
12249
12250   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12251   if (indirect_p)
12252     type = build_pointer_type (type);
12253
12254   mode = TYPE_MODE (type);
12255
12256   f_stack = TYPE_FIELDS (va_list_type_node);
12257   f_grtop = DECL_CHAIN (f_stack);
12258   f_vrtop = DECL_CHAIN (f_grtop);
12259   f_groff = DECL_CHAIN (f_vrtop);
12260   f_vroff = DECL_CHAIN (f_groff);
12261
12262   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12263                   f_stack, NULL_TREE);
12264   size = int_size_in_bytes (type);
12265   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12266
12267   dw_align = false;
12268   adjust = 0;
12269   if (aarch64_vfp_is_call_or_return_candidate (mode,
12270                                                type,
12271                                                &ag_mode,
12272                                                &nregs,
12273                                                &is_ha))
12274     {
12275       /* No frontends can create types with variable-sized modes, so we
12276          shouldn't be asked to pass or return them.  */
12277       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12278
12279       /* TYPE passed in fp/simd registers.  */
12280       if (!TARGET_FLOAT)
12281         aarch64_err_no_fpadvsimd (mode);
12282
12283       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12284                       unshare_expr (valist), f_vrtop, NULL_TREE);
12285       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12286                       unshare_expr (valist), f_vroff, NULL_TREE);
12287
12288       rsize = nregs * UNITS_PER_VREG;
12289
12290       if (is_ha)
12291         {
12292           if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12293             adjust = UNITS_PER_VREG - ag_size;
12294         }
12295       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12296                && size < UNITS_PER_VREG)
12297         {
12298           adjust = UNITS_PER_VREG - size;
12299         }
12300     }
12301   else
12302     {
12303       /* TYPE passed in general registers.  */
12304       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12305                       unshare_expr (valist), f_grtop, NULL_TREE);
12306       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12307                       unshare_expr (valist), f_groff, NULL_TREE);
12308       rsize = ROUND_UP (size, UNITS_PER_WORD);
12309       nregs = rsize / UNITS_PER_WORD;
12310
12311       if (align > 8)
12312         dw_align = true;
12313
12314       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12315           && size < UNITS_PER_WORD)
12316         {
12317           adjust = UNITS_PER_WORD  - size;
12318         }
12319     }
12320
12321   /* Get a local temporary for the field value.  */
12322   off = get_initialized_tmp_var (f_off, pre_p, NULL);
12323
12324   /* Emit code to branch if off >= 0.  */
12325   t = build2 (GE_EXPR, boolean_type_node, off,
12326               build_int_cst (TREE_TYPE (off), 0));
12327   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12328
12329   if (dw_align)
12330     {
12331       /* Emit: offs = (offs + 15) & -16.  */
12332       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12333                   build_int_cst (TREE_TYPE (off), 15));
12334       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12335                   build_int_cst (TREE_TYPE (off), -16));
12336       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12337     }
12338   else
12339     roundup = NULL;
12340
12341   /* Update ap.__[g|v]r_offs  */
12342   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12343               build_int_cst (TREE_TYPE (off), rsize));
12344   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12345
12346   /* String up.  */
12347   if (roundup)
12348     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12349
12350   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
12351   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12352               build_int_cst (TREE_TYPE (f_off), 0));
12353   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12354
12355   /* String up: make sure the assignment happens before the use.  */
12356   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12357   COND_EXPR_ELSE (cond1) = t;
12358
12359   /* Prepare the trees handling the argument that is passed on the stack;
12360      the top level node will store in ON_STACK.  */
12361   arg = get_initialized_tmp_var (stack, pre_p, NULL);
12362   if (align > 8)
12363     {
12364       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
12365       t = fold_build_pointer_plus_hwi (arg, 15);
12366       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12367                   build_int_cst (TREE_TYPE (t), -16));
12368       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12369     }
12370   else
12371     roundup = NULL;
12372   /* Advance ap.__stack  */
12373   t = fold_build_pointer_plus_hwi (arg, size + 7);
12374   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12375               build_int_cst (TREE_TYPE (t), -8));
12376   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12377   /* String up roundup and advance.  */
12378   if (roundup)
12379     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12380   /* String up with arg */
12381   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12382   /* Big-endianness related address adjustment.  */
12383   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12384       && size < UNITS_PER_WORD)
12385   {
12386     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12387                 size_int (UNITS_PER_WORD - size));
12388     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12389   }
12390
12391   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12392   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12393
12394   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
12395   t = off;
12396   if (adjust)
12397     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12398                 build_int_cst (TREE_TYPE (off), adjust));
12399
12400   t = fold_convert (sizetype, t);
12401   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12402
12403   if (is_ha)
12404     {
12405       /* type ha; // treat as "struct {ftype field[n];}"
12406          ... [computing offs]
12407          for (i = 0; i <nregs; ++i, offs += 16)
12408            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12409          return ha;  */
12410       int i;
12411       tree tmp_ha, field_t, field_ptr_t;
12412
12413       /* Declare a local variable.  */
12414       tmp_ha = create_tmp_var_raw (type, "ha");
12415       gimple_add_tmp_var (tmp_ha);
12416
12417       /* Establish the base type.  */
12418       switch (ag_mode)
12419         {
12420         case E_SFmode:
12421           field_t = float_type_node;
12422           field_ptr_t = float_ptr_type_node;
12423           break;
12424         case E_DFmode:
12425           field_t = double_type_node;
12426           field_ptr_t = double_ptr_type_node;
12427           break;
12428         case E_TFmode:
12429           field_t = long_double_type_node;
12430           field_ptr_t = long_double_ptr_type_node;
12431           break;
12432         case E_HFmode:
12433           field_t = aarch64_fp16_type_node;
12434           field_ptr_t = aarch64_fp16_ptr_type_node;
12435           break;
12436         case E_V2SImode:
12437         case E_V4SImode:
12438             {
12439               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12440               field_t = build_vector_type_for_mode (innertype, ag_mode);
12441               field_ptr_t = build_pointer_type (field_t);
12442             }
12443           break;
12444         default:
12445           gcc_assert (0);
12446         }
12447
12448       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
12449       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12450       addr = t;
12451       t = fold_convert (field_ptr_t, addr);
12452       t = build2 (MODIFY_EXPR, field_t,
12453                   build1 (INDIRECT_REF, field_t, tmp_ha),
12454                   build1 (INDIRECT_REF, field_t, t));
12455
12456       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
12457       for (i = 1; i < nregs; ++i)
12458         {
12459           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12460           u = fold_convert (field_ptr_t, addr);
12461           u = build2 (MODIFY_EXPR, field_t,
12462                       build2 (MEM_REF, field_t, tmp_ha,
12463                               build_int_cst (field_ptr_t,
12464                                              (i *
12465                                               int_size_in_bytes (field_t)))),
12466                       build1 (INDIRECT_REF, field_t, u));
12467           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12468         }
12469
12470       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12471       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12472     }
12473
12474   COND_EXPR_ELSE (cond2) = t;
12475   addr = fold_convert (build_pointer_type (type), cond1);
12476   addr = build_va_arg_indirect_ref (addr);
12477
12478   if (indirect_p)
12479     addr = build_va_arg_indirect_ref (addr);
12480
12481   return addr;
12482 }
12483
12484 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
12485
12486 static void
12487 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12488                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12489                                 int no_rtl)
12490 {
12491   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12492   CUMULATIVE_ARGS local_cum;
12493   int gr_saved = cfun->va_list_gpr_size;
12494   int vr_saved = cfun->va_list_fpr_size;
12495
12496   /* The caller has advanced CUM up to, but not beyond, the last named
12497      argument.  Advance a local copy of CUM past the last "real" named
12498      argument, to find out how many registers are left over.  */
12499   local_cum = *cum;
12500   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12501
12502   /* Found out how many registers we need to save.
12503      Honor tree-stdvar analysis results.  */
12504   if (cfun->va_list_gpr_size)
12505     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12506                     cfun->va_list_gpr_size / UNITS_PER_WORD);
12507   if (cfun->va_list_fpr_size)
12508     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12509                     cfun->va_list_fpr_size / UNITS_PER_VREG);
12510
12511   if (!TARGET_FLOAT)
12512     {
12513       gcc_assert (local_cum.aapcs_nvrn == 0);
12514       vr_saved = 0;
12515     }
12516
12517   if (!no_rtl)
12518     {
12519       if (gr_saved > 0)
12520         {
12521           rtx ptr, mem;
12522
12523           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
12524           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12525                                - gr_saved * UNITS_PER_WORD);
12526           mem = gen_frame_mem (BLKmode, ptr);
12527           set_mem_alias_set (mem, get_varargs_alias_set ());
12528
12529           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12530                                mem, gr_saved);
12531         }
12532       if (vr_saved > 0)
12533         {
12534           /* We can't use move_block_from_reg, because it will use
12535              the wrong mode, storing D regs only.  */
12536           machine_mode mode = TImode;
12537           int off, i, vr_start;
12538
12539           /* Set OFF to the offset from virtual_incoming_args_rtx of
12540              the first vector register.  The VR save area lies below
12541              the GR one, and is aligned to 16 bytes.  */
12542           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12543                            STACK_BOUNDARY / BITS_PER_UNIT);
12544           off -= vr_saved * UNITS_PER_VREG;
12545
12546           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12547           for (i = 0; i < vr_saved; ++i)
12548             {
12549               rtx ptr, mem;
12550
12551               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12552               mem = gen_frame_mem (mode, ptr);
12553               set_mem_alias_set (mem, get_varargs_alias_set ());
12554               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12555               off += UNITS_PER_VREG;
12556             }
12557         }
12558     }
12559
12560   /* We don't save the size into *PRETEND_SIZE because we want to avoid
12561      any complication of having crtl->args.pretend_args_size changed.  */
12562   cfun->machine->frame.saved_varargs_size
12563     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12564                  STACK_BOUNDARY / BITS_PER_UNIT)
12565        + vr_saved * UNITS_PER_VREG);
12566 }
12567
12568 static void
12569 aarch64_conditional_register_usage (void)
12570 {
12571   int i;
12572   if (!TARGET_FLOAT)
12573     {
12574       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12575         {
12576           fixed_regs[i] = 1;
12577           call_used_regs[i] = 1;
12578         }
12579     }
12580   if (!TARGET_SVE)
12581     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12582       {
12583         fixed_regs[i] = 1;
12584         call_used_regs[i] = 1;
12585       }
12586 }
12587
12588 /* Walk down the type tree of TYPE counting consecutive base elements.
12589    If *MODEP is VOIDmode, then set it to the first valid floating point
12590    type.  If a non-floating point type is found, or if a floating point
12591    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12592    otherwise return the count in the sub-tree.  */
12593 static int
12594 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12595 {
12596   machine_mode mode;
12597   HOST_WIDE_INT size;
12598
12599   switch (TREE_CODE (type))
12600     {
12601     case REAL_TYPE:
12602       mode = TYPE_MODE (type);
12603       if (mode != DFmode && mode != SFmode
12604           && mode != TFmode && mode != HFmode)
12605         return -1;
12606
12607       if (*modep == VOIDmode)
12608         *modep = mode;
12609
12610       if (*modep == mode)
12611         return 1;
12612
12613       break;
12614
12615     case COMPLEX_TYPE:
12616       mode = TYPE_MODE (TREE_TYPE (type));
12617       if (mode != DFmode && mode != SFmode
12618           && mode != TFmode && mode != HFmode)
12619         return -1;
12620
12621       if (*modep == VOIDmode)
12622         *modep = mode;
12623
12624       if (*modep == mode)
12625         return 2;
12626
12627       break;
12628
12629     case VECTOR_TYPE:
12630       /* Use V2SImode and V4SImode as representatives of all 64-bit
12631          and 128-bit vector types.  */
12632       size = int_size_in_bytes (type);
12633       switch (size)
12634         {
12635         case 8:
12636           mode = V2SImode;
12637           break;
12638         case 16:
12639           mode = V4SImode;
12640           break;
12641         default:
12642           return -1;
12643         }
12644
12645       if (*modep == VOIDmode)
12646         *modep = mode;
12647
12648       /* Vector modes are considered to be opaque: two vectors are
12649          equivalent for the purposes of being homogeneous aggregates
12650          if they are the same size.  */
12651       if (*modep == mode)
12652         return 1;
12653
12654       break;
12655
12656     case ARRAY_TYPE:
12657       {
12658         int count;
12659         tree index = TYPE_DOMAIN (type);
12660
12661         /* Can't handle incomplete types nor sizes that are not
12662            fixed.  */
12663         if (!COMPLETE_TYPE_P (type)
12664             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12665           return -1;
12666
12667         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12668         if (count == -1
12669             || !index
12670             || !TYPE_MAX_VALUE (index)
12671             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12672             || !TYPE_MIN_VALUE (index)
12673             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12674             || count < 0)
12675           return -1;
12676
12677         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12678                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12679
12680         /* There must be no padding.  */
12681         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12682                       count * GET_MODE_BITSIZE (*modep)))
12683           return -1;
12684
12685         return count;
12686       }
12687
12688     case RECORD_TYPE:
12689       {
12690         int count = 0;
12691         int sub_count;
12692         tree field;
12693
12694         /* Can't handle incomplete types nor sizes that are not
12695            fixed.  */
12696         if (!COMPLETE_TYPE_P (type)
12697             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12698           return -1;
12699
12700         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12701           {
12702             if (TREE_CODE (field) != FIELD_DECL)
12703               continue;
12704
12705             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12706             if (sub_count < 0)
12707               return -1;
12708             count += sub_count;
12709           }
12710
12711         /* There must be no padding.  */
12712         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12713                       count * GET_MODE_BITSIZE (*modep)))
12714           return -1;
12715
12716         return count;
12717       }
12718
12719     case UNION_TYPE:
12720     case QUAL_UNION_TYPE:
12721       {
12722         /* These aren't very interesting except in a degenerate case.  */
12723         int count = 0;
12724         int sub_count;
12725         tree field;
12726
12727         /* Can't handle incomplete types nor sizes that are not
12728            fixed.  */
12729         if (!COMPLETE_TYPE_P (type)
12730             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12731           return -1;
12732
12733         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12734           {
12735             if (TREE_CODE (field) != FIELD_DECL)
12736               continue;
12737
12738             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12739             if (sub_count < 0)
12740               return -1;
12741             count = count > sub_count ? count : sub_count;
12742           }
12743
12744         /* There must be no padding.  */
12745         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12746                       count * GET_MODE_BITSIZE (*modep)))
12747           return -1;
12748
12749         return count;
12750       }
12751
12752     default:
12753       break;
12754     }
12755
12756   return -1;
12757 }
12758
12759 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12760    type as described in AAPCS64 \S 4.1.2.
12761
12762    See the comment above aarch64_composite_type_p for the notes on MODE.  */
12763
12764 static bool
12765 aarch64_short_vector_p (const_tree type,
12766                         machine_mode mode)
12767 {
12768   poly_int64 size = -1;
12769
12770   if (type && TREE_CODE (type) == VECTOR_TYPE)
12771     size = int_size_in_bytes (type);
12772   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12773             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12774     size = GET_MODE_SIZE (mode);
12775
12776   return known_eq (size, 8) || known_eq (size, 16);
12777 }
12778
12779 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12780    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
12781    array types.  The C99 floating-point complex types are also considered
12782    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
12783    types, which are GCC extensions and out of the scope of AAPCS64, are
12784    treated as composite types here as well.
12785
12786    Note that MODE itself is not sufficient in determining whether a type
12787    is such a composite type or not.  This is because
12788    stor-layout.c:compute_record_mode may have already changed the MODE
12789    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
12790    structure with only one field may have its MODE set to the mode of the
12791    field.  Also an integer mode whose size matches the size of the
12792    RECORD_TYPE type may be used to substitute the original mode
12793    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
12794    solely relied on.  */
12795
12796 static bool
12797 aarch64_composite_type_p (const_tree type,
12798                           machine_mode mode)
12799 {
12800   if (aarch64_short_vector_p (type, mode))
12801     return false;
12802
12803   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12804     return true;
12805
12806   if (mode == BLKmode
12807       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12808       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12809     return true;
12810
12811   return false;
12812 }
12813
12814 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12815    shall be passed or returned in simd/fp register(s) (providing these
12816    parameter passing registers are available).
12817
12818    Upon successful return, *COUNT returns the number of needed registers,
12819    *BASE_MODE returns the mode of the individual register and when IS_HAF
12820    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12821    floating-point aggregate or a homogeneous short-vector aggregate.  */
12822
12823 static bool
12824 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12825                                          const_tree type,
12826                                          machine_mode *base_mode,
12827                                          int *count,
12828                                          bool *is_ha)
12829 {
12830   machine_mode new_mode = VOIDmode;
12831   bool composite_p = aarch64_composite_type_p (type, mode);
12832
12833   if (is_ha != NULL) *is_ha = false;
12834
12835   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12836       || aarch64_short_vector_p (type, mode))
12837     {
12838       *count = 1;
12839       new_mode = mode;
12840     }
12841   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12842     {
12843       if (is_ha != NULL) *is_ha = true;
12844       *count = 2;
12845       new_mode = GET_MODE_INNER (mode);
12846     }
12847   else if (type && composite_p)
12848     {
12849       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12850
12851       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12852         {
12853           if (is_ha != NULL) *is_ha = true;
12854           *count = ag_count;
12855         }
12856       else
12857         return false;
12858     }
12859   else
12860     return false;
12861
12862   *base_mode = new_mode;
12863   return true;
12864 }
12865
12866 /* Implement TARGET_STRUCT_VALUE_RTX.  */
12867
12868 static rtx
12869 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12870                           int incoming ATTRIBUTE_UNUSED)
12871 {
12872   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12873 }
12874
12875 /* Implements target hook vector_mode_supported_p.  */
12876 static bool
12877 aarch64_vector_mode_supported_p (machine_mode mode)
12878 {
12879   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12880   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12881 }
12882
12883 /* Return appropriate SIMD container
12884    for MODE within a vector of WIDTH bits.  */
12885 static machine_mode
12886 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12887 {
12888   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12889     switch (mode)
12890       {
12891       case E_DFmode:
12892         return VNx2DFmode;
12893       case E_SFmode:
12894         return VNx4SFmode;
12895       case E_HFmode:
12896         return VNx8HFmode;
12897       case E_DImode:
12898         return VNx2DImode;
12899       case E_SImode:
12900         return VNx4SImode;
12901       case E_HImode:
12902         return VNx8HImode;
12903       case E_QImode:
12904         return VNx16QImode;
12905       default:
12906         return word_mode;
12907       }
12908
12909   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12910   if (TARGET_SIMD)
12911     {
12912       if (known_eq (width, 128))
12913         switch (mode)
12914           {
12915           case E_DFmode:
12916             return V2DFmode;
12917           case E_SFmode:
12918             return V4SFmode;
12919           case E_HFmode:
12920             return V8HFmode;
12921           case E_SImode:
12922             return V4SImode;
12923           case E_HImode:
12924             return V8HImode;
12925           case E_QImode:
12926             return V16QImode;
12927           case E_DImode:
12928             return V2DImode;
12929           default:
12930             break;
12931           }
12932       else
12933         switch (mode)
12934           {
12935           case E_SFmode:
12936             return V2SFmode;
12937           case E_HFmode:
12938             return V4HFmode;
12939           case E_SImode:
12940             return V2SImode;
12941           case E_HImode:
12942             return V4HImode;
12943           case E_QImode:
12944             return V8QImode;
12945           default:
12946             break;
12947           }
12948     }
12949   return word_mode;
12950 }
12951
12952 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
12953 static machine_mode
12954 aarch64_preferred_simd_mode (scalar_mode mode)
12955 {
12956   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12957   return aarch64_simd_container_mode (mode, bits);
12958 }
12959
12960 /* Return a list of possible vector sizes for the vectorizer
12961    to iterate over.  */
12962 static void
12963 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12964 {
12965   if (TARGET_SVE)
12966     sizes->safe_push (BYTES_PER_SVE_VECTOR);
12967   sizes->safe_push (16);
12968   sizes->safe_push (8);
12969 }
12970
12971 /* Implement TARGET_MANGLE_TYPE.  */
12972
12973 static const char *
12974 aarch64_mangle_type (const_tree type)
12975 {
12976   /* The AArch64 ABI documents say that "__va_list" has to be
12977      managled as if it is in the "std" namespace.  */
12978   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12979     return "St9__va_list";
12980
12981   /* Half-precision float.  */
12982   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12983     return "Dh";
12984
12985   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
12986      builtin types.  */
12987   if (TYPE_NAME (type) != NULL)
12988     return aarch64_mangle_builtin_type (type);
12989
12990   /* Use the default mangling.  */
12991   return NULL;
12992 }
12993
12994 /* Find the first rtx_insn before insn that will generate an assembly
12995    instruction.  */
12996
12997 static rtx_insn *
12998 aarch64_prev_real_insn (rtx_insn *insn)
12999 {
13000   if (!insn)
13001     return NULL;
13002
13003   do
13004     {
13005       insn = prev_real_insn (insn);
13006     }
13007   while (insn && recog_memoized (insn) < 0);
13008
13009   return insn;
13010 }
13011
13012 static bool
13013 is_madd_op (enum attr_type t1)
13014 {
13015   unsigned int i;
13016   /* A number of these may be AArch32 only.  */
13017   enum attr_type mlatypes[] = {
13018     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13019     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13020     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13021   };
13022
13023   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13024     {
13025       if (t1 == mlatypes[i])
13026         return true;
13027     }
13028
13029   return false;
13030 }
13031
13032 /* Check if there is a register dependency between a load and the insn
13033    for which we hold recog_data.  */
13034
13035 static bool
13036 dep_between_memop_and_curr (rtx memop)
13037 {
13038   rtx load_reg;
13039   int opno;
13040
13041   gcc_assert (GET_CODE (memop) == SET);
13042
13043   if (!REG_P (SET_DEST (memop)))
13044     return false;
13045
13046   load_reg = SET_DEST (memop);
13047   for (opno = 1; opno < recog_data.n_operands; opno++)
13048     {
13049       rtx operand = recog_data.operand[opno];
13050       if (REG_P (operand)
13051           && reg_overlap_mentioned_p (load_reg, operand))
13052         return true;
13053
13054     }
13055   return false;
13056 }
13057
13058
13059 /* When working around the Cortex-A53 erratum 835769,
13060    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13061    instruction and has a preceding memory instruction such that a NOP
13062    should be inserted between them.  */
13063
13064 bool
13065 aarch64_madd_needs_nop (rtx_insn* insn)
13066 {
13067   enum attr_type attr_type;
13068   rtx_insn *prev;
13069   rtx body;
13070
13071   if (!TARGET_FIX_ERR_A53_835769)
13072     return false;
13073
13074   if (!INSN_P (insn) || recog_memoized (insn) < 0)
13075     return false;
13076
13077   attr_type = get_attr_type (insn);
13078   if (!is_madd_op (attr_type))
13079     return false;
13080
13081   prev = aarch64_prev_real_insn (insn);
13082   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13083      Restore recog state to INSN to avoid state corruption.  */
13084   extract_constrain_insn_cached (insn);
13085
13086   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13087     return false;
13088
13089   body = single_set (prev);
13090
13091   /* If the previous insn is a memory op and there is no dependency between
13092      it and the DImode madd, emit a NOP between them.  If body is NULL then we
13093      have a complex memory operation, probably a load/store pair.
13094      Be conservative for now and emit a NOP.  */
13095   if (GET_MODE (recog_data.operand[0]) == DImode
13096       && (!body || !dep_between_memop_and_curr (body)))
13097     return true;
13098
13099   return false;
13100
13101 }
13102
13103
13104 /* Implement FINAL_PRESCAN_INSN.  */
13105
13106 void
13107 aarch64_final_prescan_insn (rtx_insn *insn)
13108 {
13109   if (aarch64_madd_needs_nop (insn))
13110     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13111 }
13112
13113
13114 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13115    instruction.  */
13116
13117 bool
13118 aarch64_sve_index_immediate_p (rtx base_or_step)
13119 {
13120   return (CONST_INT_P (base_or_step)
13121           && IN_RANGE (INTVAL (base_or_step), -16, 15));
13122 }
13123
13124 /* Return true if X is a valid immediate for the SVE ADD and SUB
13125    instructions.  Negate X first if NEGATE_P is true.  */
13126
13127 bool
13128 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13129 {
13130   rtx elt;
13131
13132   if (!const_vec_duplicate_p (x, &elt)
13133       || !CONST_INT_P (elt))
13134     return false;
13135
13136   HOST_WIDE_INT val = INTVAL (elt);
13137   if (negate_p)
13138     val = -val;
13139   val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13140
13141   if (val & 0xff)
13142     return IN_RANGE (val, 0, 0xff);
13143   return IN_RANGE (val, 0, 0xff00);
13144 }
13145
13146 /* Return true if X is a valid immediate operand for an SVE logical
13147    instruction such as AND.  */
13148
13149 bool
13150 aarch64_sve_bitmask_immediate_p (rtx x)
13151 {
13152   rtx elt;
13153
13154   return (const_vec_duplicate_p (x, &elt)
13155           && CONST_INT_P (elt)
13156           && aarch64_bitmask_imm (INTVAL (elt),
13157                                   GET_MODE_INNER (GET_MODE (x))));
13158 }
13159
13160 /* Return true if X is a valid immediate for the SVE DUP and CPY
13161    instructions.  */
13162
13163 bool
13164 aarch64_sve_dup_immediate_p (rtx x)
13165 {
13166   rtx elt;
13167
13168   if (!const_vec_duplicate_p (x, &elt)
13169       || !CONST_INT_P (elt))
13170     return false;
13171
13172   HOST_WIDE_INT val = INTVAL (elt);
13173   if (val & 0xff)
13174     return IN_RANGE (val, -0x80, 0x7f);
13175   return IN_RANGE (val, -0x8000, 0x7f00);
13176 }
13177
13178 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13179    SIGNED_P says whether the operand is signed rather than unsigned.  */
13180
13181 bool
13182 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13183 {
13184   rtx elt;
13185
13186   return (const_vec_duplicate_p (x, &elt)
13187           && CONST_INT_P (elt)
13188           && (signed_p
13189               ? IN_RANGE (INTVAL (elt), -16, 15)
13190               : IN_RANGE (INTVAL (elt), 0, 127)));
13191 }
13192
13193 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13194    instruction.  Negate X first if NEGATE_P is true.  */
13195
13196 bool
13197 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13198 {
13199   rtx elt;
13200   REAL_VALUE_TYPE r;
13201
13202   if (!const_vec_duplicate_p (x, &elt)
13203       || GET_CODE (elt) != CONST_DOUBLE)
13204     return false;
13205
13206   r = *CONST_DOUBLE_REAL_VALUE (elt);
13207
13208   if (negate_p)
13209     r = real_value_negate (&r);
13210
13211   if (real_equal (&r, &dconst1))
13212     return true;
13213   if (real_equal (&r, &dconsthalf))
13214     return true;
13215   return false;
13216 }
13217
13218 /* Return true if X is a valid immediate operand for an SVE FMUL
13219    instruction.  */
13220
13221 bool
13222 aarch64_sve_float_mul_immediate_p (rtx x)
13223 {
13224   rtx elt;
13225
13226   /* GCC will never generate a multiply with an immediate of 2, so there is no
13227      point testing for it (even though it is a valid constant).  */
13228   return (const_vec_duplicate_p (x, &elt)
13229           && GET_CODE (elt) == CONST_DOUBLE
13230           && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13231 }
13232
13233 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13234    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
13235    is nonnull, use it to describe valid immediates.  */
13236 static bool
13237 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13238                                     simd_immediate_info *info,
13239                                     enum simd_immediate_check which,
13240                                     simd_immediate_info::insn_type insn)
13241 {
13242   /* Try a 4-byte immediate with LSL.  */
13243   for (unsigned int shift = 0; shift < 32; shift += 8)
13244     if ((val32 & (0xff << shift)) == val32)
13245       {
13246         if (info)
13247           *info = simd_immediate_info (SImode, val32 >> shift, insn,
13248                                        simd_immediate_info::LSL, shift);
13249         return true;
13250       }
13251
13252   /* Try a 2-byte immediate with LSL.  */
13253   unsigned int imm16 = val32 & 0xffff;
13254   if (imm16 == (val32 >> 16))
13255     for (unsigned int shift = 0; shift < 16; shift += 8)
13256       if ((imm16 & (0xff << shift)) == imm16)
13257         {
13258           if (info)
13259             *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13260                                          simd_immediate_info::LSL, shift);
13261           return true;
13262         }
13263
13264   /* Try a 4-byte immediate with MSL, except for cases that MVN
13265      can handle.  */
13266   if (which == AARCH64_CHECK_MOV)
13267     for (unsigned int shift = 8; shift < 24; shift += 8)
13268       {
13269         unsigned int low = (1 << shift) - 1;
13270         if (((val32 & (0xff << shift)) | low) == val32)
13271           {
13272             if (info)
13273               *info = simd_immediate_info (SImode, val32 >> shift, insn,
13274                                            simd_immediate_info::MSL, shift);
13275             return true;
13276           }
13277       }
13278
13279   return false;
13280 }
13281
13282 /* Return true if replicating VAL64 is a valid immediate for the
13283    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
13284    use it to describe valid immediates.  */
13285 static bool
13286 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13287                                  simd_immediate_info *info,
13288                                  enum simd_immediate_check which)
13289 {
13290   unsigned int val32 = val64 & 0xffffffff;
13291   unsigned int val16 = val64 & 0xffff;
13292   unsigned int val8 = val64 & 0xff;
13293
13294   if (val32 == (val64 >> 32))
13295     {
13296       if ((which & AARCH64_CHECK_ORR) != 0
13297           && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13298                                                  simd_immediate_info::MOV))
13299         return true;
13300
13301       if ((which & AARCH64_CHECK_BIC) != 0
13302           && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13303                                                  simd_immediate_info::MVN))
13304         return true;
13305
13306       /* Try using a replicated byte.  */
13307       if (which == AARCH64_CHECK_MOV
13308           && val16 == (val32 >> 16)
13309           && val8 == (val16 >> 8))
13310         {
13311           if (info)
13312             *info = simd_immediate_info (QImode, val8);
13313           return true;
13314         }
13315     }
13316
13317   /* Try using a bit-to-bytemask.  */
13318   if (which == AARCH64_CHECK_MOV)
13319     {
13320       unsigned int i;
13321       for (i = 0; i < 64; i += 8)
13322         {
13323           unsigned char byte = (val64 >> i) & 0xff;
13324           if (byte != 0 && byte != 0xff)
13325             break;
13326         }
13327       if (i == 64)
13328         {
13329           if (info)
13330             *info = simd_immediate_info (DImode, val64);
13331           return true;
13332         }
13333     }
13334   return false;
13335 }
13336
13337 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13338    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
13339
13340 static bool
13341 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13342                              simd_immediate_info *info)
13343 {
13344   scalar_int_mode mode = DImode;
13345   unsigned int val32 = val64 & 0xffffffff;
13346   if (val32 == (val64 >> 32))
13347     {
13348       mode = SImode;
13349       unsigned int val16 = val32 & 0xffff;
13350       if (val16 == (val32 >> 16))
13351         {
13352           mode = HImode;
13353           unsigned int val8 = val16 & 0xff;
13354           if (val8 == (val16 >> 8))
13355             mode = QImode;
13356         }
13357     }
13358   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13359   if (IN_RANGE (val, -0x80, 0x7f))
13360     {
13361       /* DUP with no shift.  */
13362       if (info)
13363         *info = simd_immediate_info (mode, val);
13364       return true;
13365     }
13366   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13367     {
13368       /* DUP with LSL #8.  */
13369       if (info)
13370         *info = simd_immediate_info (mode, val);
13371       return true;
13372     }
13373   if (aarch64_bitmask_imm (val64, mode))
13374     {
13375       /* DUPM.  */
13376       if (info)
13377         *info = simd_immediate_info (mode, val);
13378       return true;
13379     }
13380   return false;
13381 }
13382
13383 /* Return true if OP is a valid SIMD immediate for the operation
13384    described by WHICH.  If INFO is nonnull, use it to describe valid
13385    immediates.  */
13386 bool
13387 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13388                               enum simd_immediate_check which)
13389 {
13390   machine_mode mode = GET_MODE (op);
13391   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13392   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13393     return false;
13394
13395   scalar_mode elt_mode = GET_MODE_INNER (mode);
13396   rtx base, step;
13397   unsigned int n_elts;
13398   if (GET_CODE (op) == CONST_VECTOR
13399       && CONST_VECTOR_DUPLICATE_P (op))
13400     n_elts = CONST_VECTOR_NPATTERNS (op);
13401   else if ((vec_flags & VEC_SVE_DATA)
13402            && const_vec_series_p (op, &base, &step))
13403     {
13404       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13405       if (!aarch64_sve_index_immediate_p (base)
13406           || !aarch64_sve_index_immediate_p (step))
13407         return false;
13408
13409       if (info)
13410         *info = simd_immediate_info (elt_mode, base, step);
13411       return true;
13412     }
13413   else if (GET_CODE (op) == CONST_VECTOR
13414            && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13415     /* N_ELTS set above.  */;
13416   else
13417     return false;
13418
13419   /* Handle PFALSE and PTRUE.  */
13420   if (vec_flags & VEC_SVE_PRED)
13421     return (op == CONST0_RTX (mode)
13422             || op == CONSTM1_RTX (mode));
13423
13424   scalar_float_mode elt_float_mode;
13425   if (n_elts == 1
13426       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13427     {
13428       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13429       if (aarch64_float_const_zero_rtx_p (elt)
13430           || aarch64_float_const_representable_p (elt))
13431         {
13432           if (info)
13433             *info = simd_immediate_info (elt_float_mode, elt);
13434           return true;
13435         }
13436     }
13437
13438   unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13439   if (elt_size > 8)
13440     return false;
13441
13442   scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13443
13444   /* Expand the vector constant out into a byte vector, with the least
13445      significant byte of the register first.  */
13446   auto_vec<unsigned char, 16> bytes;
13447   bytes.reserve (n_elts * elt_size);
13448   for (unsigned int i = 0; i < n_elts; i++)
13449     {
13450       /* The vector is provided in gcc endian-neutral fashion.
13451          For aarch64_be Advanced SIMD, it must be laid out in the vector
13452          register in reverse order.  */
13453       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13454       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13455
13456       if (elt_mode != elt_int_mode)
13457         elt = gen_lowpart (elt_int_mode, elt);
13458
13459       if (!CONST_INT_P (elt))
13460         return false;
13461
13462       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13463       for (unsigned int byte = 0; byte < elt_size; byte++)
13464         {
13465           bytes.quick_push (elt_val & 0xff);
13466           elt_val >>= BITS_PER_UNIT;
13467         }
13468     }
13469
13470   /* The immediate must repeat every eight bytes.  */
13471   unsigned int nbytes = bytes.length ();
13472   for (unsigned i = 8; i < nbytes; ++i)
13473     if (bytes[i] != bytes[i - 8])
13474       return false;
13475
13476   /* Get the repeating 8-byte value as an integer.  No endian correction
13477      is needed here because bytes is already in lsb-first order.  */
13478   unsigned HOST_WIDE_INT val64 = 0;
13479   for (unsigned int i = 0; i < 8; i++)
13480     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13481               << (i * BITS_PER_UNIT));
13482
13483   if (vec_flags & VEC_SVE_DATA)
13484     return aarch64_sve_valid_immediate (val64, info);
13485   else
13486     return aarch64_advsimd_valid_immediate (val64, info, which);
13487 }
13488
13489 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13490    has a step in the range of INDEX.  Return the index expression if so,
13491    otherwise return null.  */
13492 rtx
13493 aarch64_check_zero_based_sve_index_immediate (rtx x)
13494 {
13495   rtx base, step;
13496   if (const_vec_series_p (x, &base, &step)
13497       && base == const0_rtx
13498       && aarch64_sve_index_immediate_p (step))
13499     return step;
13500   return NULL_RTX;
13501 }
13502
13503 /* Check of immediate shift constants are within range.  */
13504 bool
13505 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13506 {
13507   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13508   if (left)
13509     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13510   else
13511     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13512 }
13513
13514 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13515    operation of width WIDTH at bit position POS.  */
13516
13517 rtx
13518 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13519 {
13520   gcc_assert (CONST_INT_P (width));
13521   gcc_assert (CONST_INT_P (pos));
13522
13523   unsigned HOST_WIDE_INT mask
13524     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13525   return GEN_INT (mask << UINTVAL (pos));
13526 }
13527
13528 bool
13529 aarch64_mov_operand_p (rtx x, machine_mode mode)
13530 {
13531   if (GET_CODE (x) == HIGH
13532       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13533     return true;
13534
13535   if (CONST_INT_P (x))
13536     return true;
13537
13538   if (VECTOR_MODE_P (GET_MODE (x)))
13539     return aarch64_simd_valid_immediate (x, NULL);
13540
13541   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13542     return true;
13543
13544   if (aarch64_sve_cnt_immediate_p (x))
13545     return true;
13546
13547   return aarch64_classify_symbolic_expression (x)
13548     == SYMBOL_TINY_ABSOLUTE;
13549 }
13550
13551 /* Return a const_int vector of VAL.  */
13552 rtx
13553 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13554 {
13555   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13556   return gen_const_vec_duplicate (mode, c);
13557 }
13558
13559 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
13560
13561 bool
13562 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13563 {
13564   machine_mode vmode;
13565
13566   vmode = aarch64_simd_container_mode (mode, 64);
13567   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13568   return aarch64_simd_valid_immediate (op_v, NULL);
13569 }
13570
13571 /* Construct and return a PARALLEL RTX vector with elements numbering the
13572    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13573    the vector - from the perspective of the architecture.  This does not
13574    line up with GCC's perspective on lane numbers, so we end up with
13575    different masks depending on our target endian-ness.  The diagram
13576    below may help.  We must draw the distinction when building masks
13577    which select one half of the vector.  An instruction selecting
13578    architectural low-lanes for a big-endian target, must be described using
13579    a mask selecting GCC high-lanes.
13580
13581                  Big-Endian             Little-Endian
13582
13583 GCC             0   1   2   3           3   2   1   0
13584               | x | x | x | x |       | x | x | x | x |
13585 Architecture    3   2   1   0           3   2   1   0
13586
13587 Low Mask:         { 2, 3 }                { 0, 1 }
13588 High Mask:        { 0, 1 }                { 2, 3 }
13589
13590    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
13591
13592 rtx
13593 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13594 {
13595   rtvec v = rtvec_alloc (nunits / 2);
13596   int high_base = nunits / 2;
13597   int low_base = 0;
13598   int base;
13599   rtx t1;
13600   int i;
13601
13602   if (BYTES_BIG_ENDIAN)
13603     base = high ? low_base : high_base;
13604   else
13605     base = high ? high_base : low_base;
13606
13607   for (i = 0; i < nunits / 2; i++)
13608     RTVEC_ELT (v, i) = GEN_INT (base + i);
13609
13610   t1 = gen_rtx_PARALLEL (mode, v);
13611   return t1;
13612 }
13613
13614 /* Check OP for validity as a PARALLEL RTX vector with elements
13615    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13616    from the perspective of the architecture.  See the diagram above
13617    aarch64_simd_vect_par_cnst_half for more details.  */
13618
13619 bool
13620 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13621                                        bool high)
13622 {
13623   int nelts;
13624   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13625     return false;
13626
13627   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13628   HOST_WIDE_INT count_op = XVECLEN (op, 0);
13629   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13630   int i = 0;
13631
13632   if (count_op != count_ideal)
13633     return false;
13634
13635   for (i = 0; i < count_ideal; i++)
13636     {
13637       rtx elt_op = XVECEXP (op, 0, i);
13638       rtx elt_ideal = XVECEXP (ideal, 0, i);
13639
13640       if (!CONST_INT_P (elt_op)
13641           || INTVAL (elt_ideal) != INTVAL (elt_op))
13642         return false;
13643     }
13644   return true;
13645 }
13646
13647 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
13648    HIGH (exclusive).  */
13649 void
13650 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13651                           const_tree exp)
13652 {
13653   HOST_WIDE_INT lane;
13654   gcc_assert (CONST_INT_P (operand));
13655   lane = INTVAL (operand);
13656
13657   if (lane < low || lane >= high)
13658   {
13659     if (exp)
13660       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13661     else
13662       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13663   }
13664 }
13665
13666 /* Peform endian correction on lane number N, which indexes a vector
13667    of mode MODE, and return the result as an SImode rtx.  */
13668
13669 rtx
13670 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13671 {
13672   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13673 }
13674
13675 /* Return TRUE if OP is a valid vector addressing mode.  */
13676
13677 bool
13678 aarch64_simd_mem_operand_p (rtx op)
13679 {
13680   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13681                         || REG_P (XEXP (op, 0)));
13682 }
13683
13684 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
13685
13686 bool
13687 aarch64_sve_ld1r_operand_p (rtx op)
13688 {
13689   struct aarch64_address_info addr;
13690   scalar_mode mode;
13691
13692   return (MEM_P (op)
13693           && is_a <scalar_mode> (GET_MODE (op), &mode)
13694           && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13695           && addr.type == ADDRESS_REG_IMM
13696           && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13697 }
13698
13699 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13700    The conditions for STR are the same.  */
13701 bool
13702 aarch64_sve_ldr_operand_p (rtx op)
13703 {
13704   struct aarch64_address_info addr;
13705
13706   return (MEM_P (op)
13707           && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13708                                        false, ADDR_QUERY_ANY)
13709           && addr.type == ADDRESS_REG_IMM);
13710 }
13711
13712 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13713    We need to be able to access the individual pieces, so the range
13714    is different from LD[234] and ST[234].  */
13715 bool
13716 aarch64_sve_struct_memory_operand_p (rtx op)
13717 {
13718   if (!MEM_P (op))
13719     return false;
13720
13721   machine_mode mode = GET_MODE (op);
13722   struct aarch64_address_info addr;
13723   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13724                                  ADDR_QUERY_ANY)
13725       || addr.type != ADDRESS_REG_IMM)
13726     return false;
13727
13728   poly_int64 first = addr.const_offset;
13729   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13730   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13731           && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13732 }
13733
13734 /* Emit a register copy from operand to operand, taking care not to
13735    early-clobber source registers in the process.
13736
13737    COUNT is the number of components into which the copy needs to be
13738    decomposed.  */
13739 void
13740 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13741                                 unsigned int count)
13742 {
13743   unsigned int i;
13744   int rdest = REGNO (operands[0]);
13745   int rsrc = REGNO (operands[1]);
13746
13747   if (!reg_overlap_mentioned_p (operands[0], operands[1])
13748       || rdest < rsrc)
13749     for (i = 0; i < count; i++)
13750       emit_move_insn (gen_rtx_REG (mode, rdest + i),
13751                       gen_rtx_REG (mode, rsrc + i));
13752   else
13753     for (i = 0; i < count; i++)
13754       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13755                       gen_rtx_REG (mode, rsrc + count - i - 1));
13756 }
13757
13758 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13759    one of VSTRUCT modes: OI, CI, or XI.  */
13760 int
13761 aarch64_simd_attr_length_rglist (machine_mode mode)
13762 {
13763   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
13764   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13765 }
13766
13767 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
13768    alignment of a vector to 128 bits.  SVE predicates have an alignment of
13769    16 bits.  */
13770 static HOST_WIDE_INT
13771 aarch64_simd_vector_alignment (const_tree type)
13772 {
13773   if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13774     /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13775        be set for non-predicate vectors of booleans.  Modes are the most
13776        direct way we have of identifying real SVE predicate types.  */
13777     return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13778   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13779   return MIN (align, 128);
13780 }
13781
13782 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
13783 static HOST_WIDE_INT
13784 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13785 {
13786   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13787     {
13788       /* If the length of the vector is fixed, try to align to that length,
13789          otherwise don't try to align at all.  */
13790       HOST_WIDE_INT result;
13791       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13792         result = TYPE_ALIGN (TREE_TYPE (type));
13793       return result;
13794     }
13795   return TYPE_ALIGN (type);
13796 }
13797
13798 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
13799 static bool
13800 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13801 {
13802   if (is_packed)
13803     return false;
13804
13805   /* For fixed-length vectors, check that the vectorizer will aim for
13806      full-vector alignment.  This isn't true for generic GCC vectors
13807      that are wider than the ABI maximum of 128 bits.  */
13808   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13809       && (wi::to_widest (TYPE_SIZE (type))
13810           != aarch64_vectorize_preferred_vector_alignment (type)))
13811     return false;
13812
13813   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
13814   return true;
13815 }
13816
13817 /* Return true if the vector misalignment factor is supported by the
13818    target.  */
13819 static bool
13820 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13821                                              const_tree type, int misalignment,
13822                                              bool is_packed)
13823 {
13824   if (TARGET_SIMD && STRICT_ALIGNMENT)
13825     {
13826       /* Return if movmisalign pattern is not supported for this mode.  */
13827       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13828         return false;
13829
13830       /* Misalignment factor is unknown at compile time.  */
13831       if (misalignment == -1)
13832         return false;
13833     }
13834   return default_builtin_support_vector_misalignment (mode, type, misalignment,
13835                                                       is_packed);
13836 }
13837
13838 /* If VALS is a vector constant that can be loaded into a register
13839    using DUP, generate instructions to do so and return an RTX to
13840    assign to the register.  Otherwise return NULL_RTX.  */
13841 static rtx
13842 aarch64_simd_dup_constant (rtx vals)
13843 {
13844   machine_mode mode = GET_MODE (vals);
13845   machine_mode inner_mode = GET_MODE_INNER (mode);
13846   rtx x;
13847
13848   if (!const_vec_duplicate_p (vals, &x))
13849     return NULL_RTX;
13850
13851   /* We can load this constant by using DUP and a constant in a
13852      single ARM register.  This will be cheaper than a vector
13853      load.  */
13854   x = copy_to_mode_reg (inner_mode, x);
13855   return gen_vec_duplicate (mode, x);
13856 }
13857
13858
13859 /* Generate code to load VALS, which is a PARALLEL containing only
13860    constants (for vec_init) or CONST_VECTOR, efficiently into a
13861    register.  Returns an RTX to copy into the register, or NULL_RTX
13862    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
13863 static rtx
13864 aarch64_simd_make_constant (rtx vals)
13865 {
13866   machine_mode mode = GET_MODE (vals);
13867   rtx const_dup;
13868   rtx const_vec = NULL_RTX;
13869   int n_const = 0;
13870   int i;
13871
13872   if (GET_CODE (vals) == CONST_VECTOR)
13873     const_vec = vals;
13874   else if (GET_CODE (vals) == PARALLEL)
13875     {
13876       /* A CONST_VECTOR must contain only CONST_INTs and
13877          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13878          Only store valid constants in a CONST_VECTOR.  */
13879       int n_elts = XVECLEN (vals, 0);
13880       for (i = 0; i < n_elts; ++i)
13881         {
13882           rtx x = XVECEXP (vals, 0, i);
13883           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13884             n_const++;
13885         }
13886       if (n_const == n_elts)
13887         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13888     }
13889   else
13890     gcc_unreachable ();
13891
13892   if (const_vec != NULL_RTX
13893       && aarch64_simd_valid_immediate (const_vec, NULL))
13894     /* Load using MOVI/MVNI.  */
13895     return const_vec;
13896   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13897     /* Loaded using DUP.  */
13898     return const_dup;
13899   else if (const_vec != NULL_RTX)
13900     /* Load from constant pool. We can not take advantage of single-cycle
13901        LD1 because we need a PC-relative addressing mode.  */
13902     return const_vec;
13903   else
13904     /* A PARALLEL containing something not valid inside CONST_VECTOR.
13905        We can not construct an initializer.  */
13906     return NULL_RTX;
13907 }
13908
13909 /* Expand a vector initialisation sequence, such that TARGET is
13910    initialised to contain VALS.  */
13911
13912 void
13913 aarch64_expand_vector_init (rtx target, rtx vals)
13914 {
13915   machine_mode mode = GET_MODE (target);
13916   scalar_mode inner_mode = GET_MODE_INNER (mode);
13917   /* The number of vector elements.  */
13918   int n_elts = XVECLEN (vals, 0);
13919   /* The number of vector elements which are not constant.  */
13920   int n_var = 0;
13921   rtx any_const = NULL_RTX;
13922   /* The first element of vals.  */
13923   rtx v0 = XVECEXP (vals, 0, 0);
13924   bool all_same = true;
13925
13926   /* Count the number of variable elements to initialise.  */
13927   for (int i = 0; i < n_elts; ++i)
13928     {
13929       rtx x = XVECEXP (vals, 0, i);
13930       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13931         ++n_var;
13932       else
13933         any_const = x;
13934
13935       all_same &= rtx_equal_p (x, v0);
13936     }
13937
13938   /* No variable elements, hand off to aarch64_simd_make_constant which knows
13939      how best to handle this.  */
13940   if (n_var == 0)
13941     {
13942       rtx constant = aarch64_simd_make_constant (vals);
13943       if (constant != NULL_RTX)
13944         {
13945           emit_move_insn (target, constant);
13946           return;
13947         }
13948     }
13949
13950   /* Splat a single non-constant element if we can.  */
13951   if (all_same)
13952     {
13953       rtx x = copy_to_mode_reg (inner_mode, v0);
13954       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13955       return;
13956     }
13957
13958   enum insn_code icode = optab_handler (vec_set_optab, mode);
13959   gcc_assert (icode != CODE_FOR_nothing);
13960
13961   /* If there are only variable elements, try to optimize
13962      the insertion using dup for the most common element
13963      followed by insertions.  */
13964
13965   /* The algorithm will fill matches[*][0] with the earliest matching element,
13966      and matches[X][1] with the count of duplicate elements (if X is the
13967      earliest element which has duplicates).  */
13968
13969   if (n_var == n_elts && n_elts <= 16)
13970     {
13971       int matches[16][2] = {0};
13972       for (int i = 0; i < n_elts; i++)
13973         {
13974           for (int j = 0; j <= i; j++)
13975             {
13976               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13977                 {
13978                   matches[i][0] = j;
13979                   matches[j][1]++;
13980                   break;
13981                 }
13982             }
13983         }
13984       int maxelement = 0;
13985       int maxv = 0;
13986       for (int i = 0; i < n_elts; i++)
13987         if (matches[i][1] > maxv)
13988           {
13989             maxelement = i;
13990             maxv = matches[i][1];
13991           }
13992
13993       /* Create a duplicate of the most common element, unless all elements
13994          are equally useless to us, in which case just immediately set the
13995          vector register using the first element.  */
13996
13997       if (maxv == 1)
13998         {
13999           /* For vectors of two 64-bit elements, we can do even better.  */
14000           if (n_elts == 2
14001               && (inner_mode == E_DImode
14002                   || inner_mode == E_DFmode))
14003
14004             {
14005               rtx x0 = XVECEXP (vals, 0, 0);
14006               rtx x1 = XVECEXP (vals, 0, 1);
14007               /* Combine can pick up this case, but handling it directly
14008                  here leaves clearer RTL.
14009
14010                  This is load_pair_lanes<mode>, and also gives us a clean-up
14011                  for store_pair_lanes<mode>.  */
14012               if (memory_operand (x0, inner_mode)
14013                   && memory_operand (x1, inner_mode)
14014                   && !STRICT_ALIGNMENT
14015                   && rtx_equal_p (XEXP (x1, 0),
14016                                   plus_constant (Pmode,
14017                                                  XEXP (x0, 0),
14018                                                  GET_MODE_SIZE (inner_mode))))
14019                 {
14020                   rtx t;
14021                   if (inner_mode == DFmode)
14022                     t = gen_load_pair_lanesdf (target, x0, x1);
14023                   else
14024                     t = gen_load_pair_lanesdi (target, x0, x1);
14025                   emit_insn (t);
14026                   return;
14027                 }
14028             }
14029           /* The subreg-move sequence below will move into lane zero of the
14030              vector register.  For big-endian we want that position to hold
14031              the last element of VALS.  */
14032           maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14033           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14034           aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14035         }
14036       else
14037         {
14038           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14039           aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14040         }
14041
14042       /* Insert the rest.  */
14043       for (int i = 0; i < n_elts; i++)
14044         {
14045           rtx x = XVECEXP (vals, 0, i);
14046           if (matches[i][0] == maxelement)
14047             continue;
14048           x = copy_to_mode_reg (inner_mode, x);
14049           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14050         }
14051       return;
14052     }
14053
14054   /* Initialise a vector which is part-variable.  We want to first try
14055      to build those lanes which are constant in the most efficient way we
14056      can.  */
14057   if (n_var != n_elts)
14058     {
14059       rtx copy = copy_rtx (vals);
14060
14061       /* Load constant part of vector.  We really don't care what goes into the
14062          parts we will overwrite, but we're more likely to be able to load the
14063          constant efficiently if it has fewer, larger, repeating parts
14064          (see aarch64_simd_valid_immediate).  */
14065       for (int i = 0; i < n_elts; i++)
14066         {
14067           rtx x = XVECEXP (vals, 0, i);
14068           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14069             continue;
14070           rtx subst = any_const;
14071           for (int bit = n_elts / 2; bit > 0; bit /= 2)
14072             {
14073               /* Look in the copied vector, as more elements are const.  */
14074               rtx test = XVECEXP (copy, 0, i ^ bit);
14075               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14076                 {
14077                   subst = test;
14078                   break;
14079                 }
14080             }
14081           XVECEXP (copy, 0, i) = subst;
14082         }
14083       aarch64_expand_vector_init (target, copy);
14084     }
14085
14086   /* Insert the variable lanes directly.  */
14087   for (int i = 0; i < n_elts; i++)
14088     {
14089       rtx x = XVECEXP (vals, 0, i);
14090       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14091         continue;
14092       x = copy_to_mode_reg (inner_mode, x);
14093       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14094     }
14095 }
14096
14097 static unsigned HOST_WIDE_INT
14098 aarch64_shift_truncation_mask (machine_mode mode)
14099 {
14100   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14101     return 0;
14102   return GET_MODE_UNIT_BITSIZE (mode) - 1;
14103 }
14104
14105 /* Select a format to encode pointers in exception handling data.  */
14106 int
14107 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14108 {
14109    int type;
14110    switch (aarch64_cmodel)
14111      {
14112      case AARCH64_CMODEL_TINY:
14113      case AARCH64_CMODEL_TINY_PIC:
14114      case AARCH64_CMODEL_SMALL:
14115      case AARCH64_CMODEL_SMALL_PIC:
14116      case AARCH64_CMODEL_SMALL_SPIC:
14117        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
14118           for everything.  */
14119        type = DW_EH_PE_sdata4;
14120        break;
14121      default:
14122        /* No assumptions here.  8-byte relocs required.  */
14123        type = DW_EH_PE_sdata8;
14124        break;
14125      }
14126    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14127 }
14128
14129 /* The last .arch and .tune assembly strings that we printed.  */
14130 static std::string aarch64_last_printed_arch_string;
14131 static std::string aarch64_last_printed_tune_string;
14132
14133 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
14134    by the function fndecl.  */
14135
14136 void
14137 aarch64_declare_function_name (FILE *stream, const char* name,
14138                                 tree fndecl)
14139 {
14140   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14141
14142   struct cl_target_option *targ_options;
14143   if (target_parts)
14144     targ_options = TREE_TARGET_OPTION (target_parts);
14145   else
14146     targ_options = TREE_TARGET_OPTION (target_option_current_node);
14147   gcc_assert (targ_options);
14148
14149   const struct processor *this_arch
14150     = aarch64_get_arch (targ_options->x_explicit_arch);
14151
14152   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14153   std::string extension
14154     = aarch64_get_extension_string_for_isa_flags (isa_flags,
14155                                                   this_arch->flags);
14156   /* Only update the assembler .arch string if it is distinct from the last
14157      such string we printed.  */
14158   std::string to_print = this_arch->name + extension;
14159   if (to_print != aarch64_last_printed_arch_string)
14160     {
14161       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14162       aarch64_last_printed_arch_string = to_print;
14163     }
14164
14165   /* Print the cpu name we're tuning for in the comments, might be
14166      useful to readers of the generated asm.  Do it only when it changes
14167      from function to function and verbose assembly is requested.  */
14168   const struct processor *this_tune
14169     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14170
14171   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14172     {
14173       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14174                    this_tune->name);
14175       aarch64_last_printed_tune_string = this_tune->name;
14176     }
14177
14178   /* Don't forget the type directive for ELF.  */
14179   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14180   ASM_OUTPUT_LABEL (stream, name);
14181 }
14182
14183 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
14184
14185 static void
14186 aarch64_start_file (void)
14187 {
14188   struct cl_target_option *default_options
14189     = TREE_TARGET_OPTION (target_option_default_node);
14190
14191   const struct processor *default_arch
14192     = aarch64_get_arch (default_options->x_explicit_arch);
14193   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14194   std::string extension
14195     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14196                                                   default_arch->flags);
14197
14198    aarch64_last_printed_arch_string = default_arch->name + extension;
14199    aarch64_last_printed_tune_string = "";
14200    asm_fprintf (asm_out_file, "\t.arch %s\n",
14201                 aarch64_last_printed_arch_string.c_str ());
14202
14203    default_file_start ();
14204 }
14205
14206 /* Emit load exclusive.  */
14207
14208 static void
14209 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14210                              rtx mem, rtx model_rtx)
14211 {
14212   rtx (*gen) (rtx, rtx, rtx);
14213
14214   switch (mode)
14215     {
14216     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14217     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14218     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14219     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14220     default:
14221       gcc_unreachable ();
14222     }
14223
14224   emit_insn (gen (rval, mem, model_rtx));
14225 }
14226
14227 /* Emit store exclusive.  */
14228
14229 static void
14230 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14231                               rtx rval, rtx mem, rtx model_rtx)
14232 {
14233   rtx (*gen) (rtx, rtx, rtx, rtx);
14234
14235   switch (mode)
14236     {
14237     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14238     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14239     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14240     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14241     default:
14242       gcc_unreachable ();
14243     }
14244
14245   emit_insn (gen (bval, rval, mem, model_rtx));
14246 }
14247
14248 /* Mark the previous jump instruction as unlikely.  */
14249
14250 static void
14251 aarch64_emit_unlikely_jump (rtx insn)
14252 {
14253   rtx_insn *jump = emit_jump_insn (insn);
14254   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14255 }
14256
14257 /* Expand a compare and swap pattern.  */
14258
14259 void
14260 aarch64_expand_compare_and_swap (rtx operands[])
14261 {
14262   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14263   machine_mode mode, cmp_mode;
14264   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14265   int idx;
14266   gen_cas_fn gen;
14267   const gen_cas_fn split_cas[] =
14268   {
14269     gen_aarch64_compare_and_swapqi,
14270     gen_aarch64_compare_and_swaphi,
14271     gen_aarch64_compare_and_swapsi,
14272     gen_aarch64_compare_and_swapdi
14273   };
14274   const gen_cas_fn atomic_cas[] =
14275   {
14276     gen_aarch64_compare_and_swapqi_lse,
14277     gen_aarch64_compare_and_swaphi_lse,
14278     gen_aarch64_compare_and_swapsi_lse,
14279     gen_aarch64_compare_and_swapdi_lse
14280   };
14281
14282   bval = operands[0];
14283   rval = operands[1];
14284   mem = operands[2];
14285   oldval = operands[3];
14286   newval = operands[4];
14287   is_weak = operands[5];
14288   mod_s = operands[6];
14289   mod_f = operands[7];
14290   mode = GET_MODE (mem);
14291   cmp_mode = mode;
14292
14293   /* Normally the succ memory model must be stronger than fail, but in the
14294      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14295      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
14296
14297   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14298       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14299     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14300
14301   switch (mode)
14302     {
14303     case E_QImode:
14304     case E_HImode:
14305       /* For short modes, we're going to perform the comparison in SImode,
14306          so do the zero-extension now.  */
14307       cmp_mode = SImode;
14308       rval = gen_reg_rtx (SImode);
14309       oldval = convert_modes (SImode, mode, oldval, true);
14310       /* Fall through.  */
14311
14312     case E_SImode:
14313     case E_DImode:
14314       /* Force the value into a register if needed.  */
14315       if (!aarch64_plus_operand (oldval, mode))
14316         oldval = force_reg (cmp_mode, oldval);
14317       break;
14318
14319     default:
14320       gcc_unreachable ();
14321     }
14322
14323   switch (mode)
14324     {
14325     case E_QImode: idx = 0; break;
14326     case E_HImode: idx = 1; break;
14327     case E_SImode: idx = 2; break;
14328     case E_DImode: idx = 3; break;
14329     default:
14330       gcc_unreachable ();
14331     }
14332   if (TARGET_LSE)
14333     gen = atomic_cas[idx];
14334   else
14335     gen = split_cas[idx];
14336
14337   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14338
14339   if (mode == QImode || mode == HImode)
14340     emit_move_insn (operands[1], gen_lowpart (mode, rval));
14341
14342   x = gen_rtx_REG (CCmode, CC_REGNUM);
14343   x = gen_rtx_EQ (SImode, x, const0_rtx);
14344   emit_insn (gen_rtx_SET (bval, x));
14345 }
14346
14347 /* Test whether the target supports using a atomic load-operate instruction.
14348    CODE is the operation and AFTER is TRUE if the data in memory after the
14349    operation should be returned and FALSE if the data before the operation
14350    should be returned.  Returns FALSE if the operation isn't supported by the
14351    architecture.  */
14352
14353 bool
14354 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14355 {
14356   if (!TARGET_LSE)
14357     return false;
14358
14359   switch (code)
14360     {
14361     case SET:
14362     case AND:
14363     case IOR:
14364     case XOR:
14365     case MINUS:
14366     case PLUS:
14367       return true;
14368     default:
14369       return false;
14370     }
14371 }
14372
14373 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14374    sequence implementing an atomic operation.  */
14375
14376 static void
14377 aarch64_emit_post_barrier (enum memmodel model)
14378 {
14379   const enum memmodel base_model = memmodel_base (model);
14380
14381   if (is_mm_sync (model)
14382       && (base_model == MEMMODEL_ACQUIRE
14383           || base_model == MEMMODEL_ACQ_REL
14384           || base_model == MEMMODEL_SEQ_CST))
14385     {
14386       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14387     }
14388 }
14389
14390 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
14391    for the data in memory.  EXPECTED is the value expected to be in memory.
14392    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
14393    is the memory ordering to use.  */
14394
14395 void
14396 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14397                         rtx expected, rtx desired,
14398                         rtx model)
14399 {
14400   rtx (*gen) (rtx, rtx, rtx, rtx);
14401   machine_mode mode;
14402
14403   mode = GET_MODE (mem);
14404
14405   switch (mode)
14406     {
14407     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14408     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14409     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14410     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14411     default:
14412       gcc_unreachable ();
14413     }
14414
14415   /* Move the expected value into the CAS destination register.  */
14416   emit_insn (gen_rtx_SET (rval, expected));
14417
14418   /* Emit the CAS.  */
14419   emit_insn (gen (rval, mem, desired, model));
14420
14421   /* Compare the expected value with the value loaded by the CAS, to establish
14422      whether the swap was made.  */
14423   aarch64_gen_compare_reg (EQ, rval, expected);
14424 }
14425
14426 /* Split a compare and swap pattern.  */
14427
14428 void
14429 aarch64_split_compare_and_swap (rtx operands[])
14430 {
14431   rtx rval, mem, oldval, newval, scratch;
14432   machine_mode mode;
14433   bool is_weak;
14434   rtx_code_label *label1, *label2;
14435   rtx x, cond;
14436   enum memmodel model;
14437   rtx model_rtx;
14438
14439   rval = operands[0];
14440   mem = operands[1];
14441   oldval = operands[2];
14442   newval = operands[3];
14443   is_weak = (operands[4] != const0_rtx);
14444   model_rtx = operands[5];
14445   scratch = operands[7];
14446   mode = GET_MODE (mem);
14447   model = memmodel_from_int (INTVAL (model_rtx));
14448
14449   /* When OLDVAL is zero and we want the strong version we can emit a tighter
14450     loop:
14451     .label1:
14452         LD[A]XR rval, [mem]
14453         CBNZ    rval, .label2
14454         ST[L]XR scratch, newval, [mem]
14455         CBNZ    scratch, .label1
14456     .label2:
14457         CMP     rval, 0.  */
14458   bool strong_zero_p = !is_weak && oldval == const0_rtx;
14459
14460   label1 = NULL;
14461   if (!is_weak)
14462     {
14463       label1 = gen_label_rtx ();
14464       emit_label (label1);
14465     }
14466   label2 = gen_label_rtx ();
14467
14468   /* The initial load can be relaxed for a __sync operation since a final
14469      barrier will be emitted to stop code hoisting.  */
14470   if (is_mm_sync (model))
14471     aarch64_emit_load_exclusive (mode, rval, mem,
14472                                  GEN_INT (MEMMODEL_RELAXED));
14473   else
14474     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14475
14476   if (strong_zero_p)
14477     {
14478       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14479       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14480                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14481       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14482     }
14483   else
14484     {
14485       cond = aarch64_gen_compare_reg (NE, rval, oldval);
14486       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14487       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14488                                  gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14489       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14490     }
14491
14492   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14493
14494   if (!is_weak)
14495     {
14496       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14497       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14498                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14499       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14500     }
14501   else
14502     {
14503       cond = gen_rtx_REG (CCmode, CC_REGNUM);
14504       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14505       emit_insn (gen_rtx_SET (cond, x));
14506     }
14507
14508   emit_label (label2);
14509   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14510      to set the condition flags.  If this is not used it will be removed by
14511      later passes.  */
14512   if (strong_zero_p)
14513     {
14514       cond = gen_rtx_REG (CCmode, CC_REGNUM);
14515       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14516       emit_insn (gen_rtx_SET (cond, x));
14517     }
14518   /* Emit any final barrier needed for a __sync operation.  */
14519   if (is_mm_sync (model))
14520     aarch64_emit_post_barrier (model);
14521 }
14522
14523 /* Emit a BIC instruction.  */
14524
14525 static void
14526 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14527 {
14528   rtx shift_rtx = GEN_INT (shift);
14529   rtx (*gen) (rtx, rtx, rtx, rtx);
14530
14531   switch (mode)
14532     {
14533     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14534     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14535     default:
14536       gcc_unreachable ();
14537     }
14538
14539   emit_insn (gen (dst, s2, shift_rtx, s1));
14540 }
14541
14542 /* Emit an atomic swap.  */
14543
14544 static void
14545 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14546                           rtx mem, rtx model)
14547 {
14548   rtx (*gen) (rtx, rtx, rtx, rtx);
14549
14550   switch (mode)
14551     {
14552     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14553     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14554     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14555     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14556     default:
14557       gcc_unreachable ();
14558     }
14559
14560   emit_insn (gen (dst, mem, value, model));
14561 }
14562
14563 /* Operations supported by aarch64_emit_atomic_load_op.  */
14564
14565 enum aarch64_atomic_load_op_code
14566 {
14567   AARCH64_LDOP_PLUS,    /* A + B  */
14568   AARCH64_LDOP_XOR,     /* A ^ B  */
14569   AARCH64_LDOP_OR,      /* A | B  */
14570   AARCH64_LDOP_BIC      /* A & ~B  */
14571 };
14572
14573 /* Emit an atomic load-operate.  */
14574
14575 static void
14576 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14577                              machine_mode mode, rtx dst, rtx src,
14578                              rtx mem, rtx model)
14579 {
14580   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14581   const aarch64_atomic_load_op_fn plus[] =
14582   {
14583     gen_aarch64_atomic_loadaddqi,
14584     gen_aarch64_atomic_loadaddhi,
14585     gen_aarch64_atomic_loadaddsi,
14586     gen_aarch64_atomic_loadadddi
14587   };
14588   const aarch64_atomic_load_op_fn eor[] =
14589   {
14590     gen_aarch64_atomic_loadeorqi,
14591     gen_aarch64_atomic_loadeorhi,
14592     gen_aarch64_atomic_loadeorsi,
14593     gen_aarch64_atomic_loadeordi
14594   };
14595   const aarch64_atomic_load_op_fn ior[] =
14596   {
14597     gen_aarch64_atomic_loadsetqi,
14598     gen_aarch64_atomic_loadsethi,
14599     gen_aarch64_atomic_loadsetsi,
14600     gen_aarch64_atomic_loadsetdi
14601   };
14602   const aarch64_atomic_load_op_fn bic[] =
14603   {
14604     gen_aarch64_atomic_loadclrqi,
14605     gen_aarch64_atomic_loadclrhi,
14606     gen_aarch64_atomic_loadclrsi,
14607     gen_aarch64_atomic_loadclrdi
14608   };
14609   aarch64_atomic_load_op_fn gen;
14610   int idx = 0;
14611
14612   switch (mode)
14613     {
14614     case E_QImode: idx = 0; break;
14615     case E_HImode: idx = 1; break;
14616     case E_SImode: idx = 2; break;
14617     case E_DImode: idx = 3; break;
14618     default:
14619       gcc_unreachable ();
14620     }
14621
14622   switch (code)
14623     {
14624     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14625     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14626     case AARCH64_LDOP_OR: gen = ior[idx]; break;
14627     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14628     default:
14629       gcc_unreachable ();
14630     }
14631
14632   emit_insn (gen (dst, mem, src, model));
14633 }
14634
14635 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
14636    location to store the data read from memory.  OUT_RESULT is the location to
14637    store the result of the operation.  MEM is the memory location to read and
14638    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
14639    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
14640    be NULL.  */
14641
14642 void
14643 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14644                          rtx mem, rtx value, rtx model_rtx)
14645 {
14646   machine_mode mode = GET_MODE (mem);
14647   machine_mode wmode = (mode == DImode ? DImode : SImode);
14648   const bool short_mode = (mode < SImode);
14649   aarch64_atomic_load_op_code ldop_code;
14650   rtx src;
14651   rtx x;
14652
14653   if (out_data)
14654     out_data = gen_lowpart (mode, out_data);
14655
14656   if (out_result)
14657     out_result = gen_lowpart (mode, out_result);
14658
14659   /* Make sure the value is in a register, putting it into a destination
14660      register if it needs to be manipulated.  */
14661   if (!register_operand (value, mode)
14662       || code == AND || code == MINUS)
14663     {
14664       src = out_result ? out_result : out_data;
14665       emit_move_insn (src, gen_lowpart (mode, value));
14666     }
14667   else
14668     src = value;
14669   gcc_assert (register_operand (src, mode));
14670
14671   /* Preprocess the data for the operation as necessary.  If the operation is
14672      a SET then emit a swap instruction and finish.  */
14673   switch (code)
14674     {
14675     case SET:
14676       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14677       return;
14678
14679     case MINUS:
14680       /* Negate the value and treat it as a PLUS.  */
14681       {
14682         rtx neg_src;
14683
14684         /* Resize the value if necessary.  */
14685         if (short_mode)
14686           src = gen_lowpart (wmode, src);
14687
14688         neg_src = gen_rtx_NEG (wmode, src);
14689         emit_insn (gen_rtx_SET (src, neg_src));
14690
14691         if (short_mode)
14692           src = gen_lowpart (mode, src);
14693       }
14694       /* Fall-through.  */
14695     case PLUS:
14696       ldop_code = AARCH64_LDOP_PLUS;
14697       break;
14698
14699     case IOR:
14700       ldop_code = AARCH64_LDOP_OR;
14701       break;
14702
14703     case XOR:
14704       ldop_code = AARCH64_LDOP_XOR;
14705       break;
14706
14707     case AND:
14708       {
14709         rtx not_src;
14710
14711         /* Resize the value if necessary.  */
14712         if (short_mode)
14713           src = gen_lowpart (wmode, src);
14714
14715         not_src = gen_rtx_NOT (wmode, src);
14716         emit_insn (gen_rtx_SET (src, not_src));
14717
14718         if (short_mode)
14719           src = gen_lowpart (mode, src);
14720       }
14721       ldop_code = AARCH64_LDOP_BIC;
14722       break;
14723
14724     default:
14725       /* The operation can't be done with atomic instructions.  */
14726       gcc_unreachable ();
14727     }
14728
14729   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14730
14731   /* If necessary, calculate the data in memory after the update by redoing the
14732      operation from values in registers.  */
14733   if (!out_result)
14734     return;
14735
14736   if (short_mode)
14737     {
14738       src = gen_lowpart (wmode, src);
14739       out_data = gen_lowpart (wmode, out_data);
14740       out_result = gen_lowpart (wmode, out_result);
14741     }
14742
14743   x = NULL_RTX;
14744
14745   switch (code)
14746     {
14747     case MINUS:
14748     case PLUS:
14749       x = gen_rtx_PLUS (wmode, out_data, src);
14750       break;
14751     case IOR:
14752       x = gen_rtx_IOR (wmode, out_data, src);
14753       break;
14754     case XOR:
14755       x = gen_rtx_XOR (wmode, out_data, src);
14756       break;
14757     case AND:
14758       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14759       return;
14760     default:
14761       gcc_unreachable ();
14762     }
14763
14764   emit_set_insn (out_result, x);
14765
14766   return;
14767 }
14768
14769 /* Split an atomic operation.  */
14770
14771 void
14772 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14773                          rtx value, rtx model_rtx, rtx cond)
14774 {
14775   machine_mode mode = GET_MODE (mem);
14776   machine_mode wmode = (mode == DImode ? DImode : SImode);
14777   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14778   const bool is_sync = is_mm_sync (model);
14779   rtx_code_label *label;
14780   rtx x;
14781
14782   /* Split the atomic operation into a sequence.  */
14783   label = gen_label_rtx ();
14784   emit_label (label);
14785
14786   if (new_out)
14787     new_out = gen_lowpart (wmode, new_out);
14788   if (old_out)
14789     old_out = gen_lowpart (wmode, old_out);
14790   else
14791     old_out = new_out;
14792   value = simplify_gen_subreg (wmode, value, mode, 0);
14793
14794   /* The initial load can be relaxed for a __sync operation since a final
14795      barrier will be emitted to stop code hoisting.  */
14796  if (is_sync)
14797     aarch64_emit_load_exclusive (mode, old_out, mem,
14798                                  GEN_INT (MEMMODEL_RELAXED));
14799   else
14800     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14801
14802   switch (code)
14803     {
14804     case SET:
14805       new_out = value;
14806       break;
14807
14808     case NOT:
14809       x = gen_rtx_AND (wmode, old_out, value);
14810       emit_insn (gen_rtx_SET (new_out, x));
14811       x = gen_rtx_NOT (wmode, new_out);
14812       emit_insn (gen_rtx_SET (new_out, x));
14813       break;
14814
14815     case MINUS:
14816       if (CONST_INT_P (value))
14817         {
14818           value = GEN_INT (-INTVAL (value));
14819           code = PLUS;
14820         }
14821       /* Fall through.  */
14822
14823     default:
14824       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14825       emit_insn (gen_rtx_SET (new_out, x));
14826       break;
14827     }
14828
14829   aarch64_emit_store_exclusive (mode, cond, mem,
14830                                 gen_lowpart (mode, new_out), model_rtx);
14831
14832   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14833   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14834                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14835   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14836
14837   /* Emit any final barrier needed for a __sync operation.  */
14838   if (is_sync)
14839     aarch64_emit_post_barrier (model);
14840 }
14841
14842 static void
14843 aarch64_init_libfuncs (void)
14844 {
14845    /* Half-precision float operations.  The compiler handles all operations
14846      with NULL libfuncs by converting to SFmode.  */
14847
14848   /* Conversions.  */
14849   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14850   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14851
14852   /* Arithmetic.  */
14853   set_optab_libfunc (add_optab, HFmode, NULL);
14854   set_optab_libfunc (sdiv_optab, HFmode, NULL);
14855   set_optab_libfunc (smul_optab, HFmode, NULL);
14856   set_optab_libfunc (neg_optab, HFmode, NULL);
14857   set_optab_libfunc (sub_optab, HFmode, NULL);
14858
14859   /* Comparisons.  */
14860   set_optab_libfunc (eq_optab, HFmode, NULL);
14861   set_optab_libfunc (ne_optab, HFmode, NULL);
14862   set_optab_libfunc (lt_optab, HFmode, NULL);
14863   set_optab_libfunc (le_optab, HFmode, NULL);
14864   set_optab_libfunc (ge_optab, HFmode, NULL);
14865   set_optab_libfunc (gt_optab, HFmode, NULL);
14866   set_optab_libfunc (unord_optab, HFmode, NULL);
14867 }
14868
14869 /* Target hook for c_mode_for_suffix.  */
14870 static machine_mode
14871 aarch64_c_mode_for_suffix (char suffix)
14872 {
14873   if (suffix == 'q')
14874     return TFmode;
14875
14876   return VOIDmode;
14877 }
14878
14879 /* We can only represent floating point constants which will fit in
14880    "quarter-precision" values.  These values are characterised by
14881    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
14882    by:
14883
14884    (-1)^s * (n/16) * 2^r
14885
14886    Where:
14887      's' is the sign bit.
14888      'n' is an integer in the range 16 <= n <= 31.
14889      'r' is an integer in the range -3 <= r <= 4.  */
14890
14891 /* Return true iff X can be represented by a quarter-precision
14892    floating point immediate operand X.  Note, we cannot represent 0.0.  */
14893 bool
14894 aarch64_float_const_representable_p (rtx x)
14895 {
14896   /* This represents our current view of how many bits
14897      make up the mantissa.  */
14898   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14899   int exponent;
14900   unsigned HOST_WIDE_INT mantissa, mask;
14901   REAL_VALUE_TYPE r, m;
14902   bool fail;
14903
14904   if (!CONST_DOUBLE_P (x))
14905     return false;
14906
14907   /* We don't support HFmode constants yet.  */
14908   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14909     return false;
14910
14911   r = *CONST_DOUBLE_REAL_VALUE (x);
14912
14913   /* We cannot represent infinities, NaNs or +/-zero.  We won't
14914      know if we have +zero until we analyse the mantissa, but we
14915      can reject the other invalid values.  */
14916   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14917       || REAL_VALUE_MINUS_ZERO (r))
14918     return false;
14919
14920   /* Extract exponent.  */
14921   r = real_value_abs (&r);
14922   exponent = REAL_EXP (&r);
14923
14924   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14925      highest (sign) bit, with a fixed binary point at bit point_pos.
14926      m1 holds the low part of the mantissa, m2 the high part.
14927      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14928      bits for the mantissa, this can fail (low bits will be lost).  */
14929   real_ldexp (&m, &r, point_pos - exponent);
14930   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14931
14932   /* If the low part of the mantissa has bits set we cannot represent
14933      the value.  */
14934   if (w.ulow () != 0)
14935     return false;
14936   /* We have rejected the lower HOST_WIDE_INT, so update our
14937      understanding of how many bits lie in the mantissa and
14938      look only at the high HOST_WIDE_INT.  */
14939   mantissa = w.elt (1);
14940   point_pos -= HOST_BITS_PER_WIDE_INT;
14941
14942   /* We can only represent values with a mantissa of the form 1.xxxx.  */
14943   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14944   if ((mantissa & mask) != 0)
14945     return false;
14946
14947   /* Having filtered unrepresentable values, we may now remove all
14948      but the highest 5 bits.  */
14949   mantissa >>= point_pos - 5;
14950
14951   /* We cannot represent the value 0.0, so reject it.  This is handled
14952      elsewhere.  */
14953   if (mantissa == 0)
14954     return false;
14955
14956   /* Then, as bit 4 is always set, we can mask it off, leaving
14957      the mantissa in the range [0, 15].  */
14958   mantissa &= ~(1 << 4);
14959   gcc_assert (mantissa <= 15);
14960
14961   /* GCC internally does not use IEEE754-like encoding (where normalized
14962      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
14963      Our mantissa values are shifted 4 places to the left relative to
14964      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14965      by 5 places to correct for GCC's representation.  */
14966   exponent = 5 - exponent;
14967
14968   return (exponent >= 0 && exponent <= 7);
14969 }
14970
14971 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14972    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
14973    output MOVI/MVNI, ORR or BIC immediate.  */
14974 char*
14975 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14976                                    enum simd_immediate_check which)
14977 {
14978   bool is_valid;
14979   static char templ[40];
14980   const char *mnemonic;
14981   const char *shift_op;
14982   unsigned int lane_count = 0;
14983   char element_char;
14984
14985   struct simd_immediate_info info;
14986
14987   /* This will return true to show const_vector is legal for use as either
14988      a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14989      It will also update INFO to show how the immediate should be generated.
14990      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
14991   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14992   gcc_assert (is_valid);
14993
14994   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14995   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14996
14997   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14998     {
14999       gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15000       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15001          move immediate path.  */
15002       if (aarch64_float_const_zero_rtx_p (info.value))
15003         info.value = GEN_INT (0);
15004       else
15005         {
15006           const unsigned int buf_size = 20;
15007           char float_buf[buf_size] = {'\0'};
15008           real_to_decimal_for_mode (float_buf,
15009                                     CONST_DOUBLE_REAL_VALUE (info.value),
15010                                     buf_size, buf_size, 1, info.elt_mode);
15011
15012           if (lane_count == 1)
15013             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15014           else
15015             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15016                       lane_count, element_char, float_buf);
15017           return templ;
15018         }
15019     }
15020
15021   gcc_assert (CONST_INT_P (info.value));
15022
15023   if (which == AARCH64_CHECK_MOV)
15024     {
15025       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15026       shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15027       if (lane_count == 1)
15028         snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15029                   mnemonic, UINTVAL (info.value));
15030       else if (info.shift)
15031         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15032                   HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15033                   element_char, UINTVAL (info.value), shift_op, info.shift);
15034       else
15035         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15036                   HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15037                   element_char, UINTVAL (info.value));
15038     }
15039   else
15040     {
15041       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
15042       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15043       if (info.shift)
15044         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15045                   HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15046                   element_char, UINTVAL (info.value), "lsl", info.shift);
15047       else
15048         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15049                   HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15050                   element_char, UINTVAL (info.value));
15051     }
15052   return templ;
15053 }
15054
15055 char*
15056 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15057 {
15058
15059   /* If a floating point number was passed and we desire to use it in an
15060      integer mode do the conversion to integer.  */
15061   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15062     {
15063       unsigned HOST_WIDE_INT ival;
15064       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15065           gcc_unreachable ();
15066       immediate = gen_int_mode (ival, mode);
15067     }
15068
15069   machine_mode vmode;
15070   /* use a 64 bit mode for everything except for DI/DF mode, where we use
15071      a 128 bit vector mode.  */
15072   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15073
15074   vmode = aarch64_simd_container_mode (mode, width);
15075   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15076   return aarch64_output_simd_mov_immediate (v_op, width);
15077 }
15078
15079 /* Return the output string to use for moving immediate CONST_VECTOR
15080    into an SVE register.  */
15081
15082 char *
15083 aarch64_output_sve_mov_immediate (rtx const_vector)
15084 {
15085   static char templ[40];
15086   struct simd_immediate_info info;
15087   char element_char;
15088
15089   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15090   gcc_assert (is_valid);
15091
15092   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15093
15094   if (info.step)
15095     {
15096       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15097                 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15098                 element_char, INTVAL (info.value), INTVAL (info.step));
15099       return templ;
15100     }
15101
15102   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15103     {
15104       if (aarch64_float_const_zero_rtx_p (info.value))
15105         info.value = GEN_INT (0);
15106       else
15107         {
15108           const int buf_size = 20;
15109           char float_buf[buf_size] = {};
15110           real_to_decimal_for_mode (float_buf,
15111                                     CONST_DOUBLE_REAL_VALUE (info.value),
15112                                     buf_size, buf_size, 1, info.elt_mode);
15113
15114           snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15115                     element_char, float_buf);
15116           return templ;
15117         }
15118     }
15119
15120   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15121             element_char, INTVAL (info.value));
15122   return templ;
15123 }
15124
15125 /* Return the asm format for a PTRUE instruction whose destination has
15126    mode MODE.  SUFFIX is the element size suffix.  */
15127
15128 char *
15129 aarch64_output_ptrue (machine_mode mode, char suffix)
15130 {
15131   unsigned int nunits;
15132   static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15133   if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15134     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15135   else
15136     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15137   return buf;
15138 }
15139
15140 /* Split operands into moves from op[1] + op[2] into op[0].  */
15141
15142 void
15143 aarch64_split_combinev16qi (rtx operands[3])
15144 {
15145   unsigned int dest = REGNO (operands[0]);
15146   unsigned int src1 = REGNO (operands[1]);
15147   unsigned int src2 = REGNO (operands[2]);
15148   machine_mode halfmode = GET_MODE (operands[1]);
15149   unsigned int halfregs = REG_NREGS (operands[1]);
15150   rtx destlo, desthi;
15151
15152   gcc_assert (halfmode == V16QImode);
15153
15154   if (src1 == dest && src2 == dest + halfregs)
15155     {
15156       /* No-op move.  Can't split to nothing; emit something.  */
15157       emit_note (NOTE_INSN_DELETED);
15158       return;
15159     }
15160
15161   /* Preserve register attributes for variable tracking.  */
15162   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15163   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15164                                GET_MODE_SIZE (halfmode));
15165
15166   /* Special case of reversed high/low parts.  */
15167   if (reg_overlap_mentioned_p (operands[2], destlo)
15168       && reg_overlap_mentioned_p (operands[1], desthi))
15169     {
15170       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15171       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15172       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15173     }
15174   else if (!reg_overlap_mentioned_p (operands[2], destlo))
15175     {
15176       /* Try to avoid unnecessary moves if part of the result
15177          is in the right place already.  */
15178       if (src1 != dest)
15179         emit_move_insn (destlo, operands[1]);
15180       if (src2 != dest + halfregs)
15181         emit_move_insn (desthi, operands[2]);
15182     }
15183   else
15184     {
15185       if (src2 != dest + halfregs)
15186         emit_move_insn (desthi, operands[2]);
15187       if (src1 != dest)
15188         emit_move_insn (destlo, operands[1]);
15189     }
15190 }
15191
15192 /* vec_perm support.  */
15193
15194 struct expand_vec_perm_d
15195 {
15196   rtx target, op0, op1;
15197   vec_perm_indices perm;
15198   machine_mode vmode;
15199   unsigned int vec_flags;
15200   bool one_vector_p;
15201   bool testing_p;
15202 };
15203
15204 /* Generate a variable permutation.  */
15205
15206 static void
15207 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15208 {
15209   machine_mode vmode = GET_MODE (target);
15210   bool one_vector_p = rtx_equal_p (op0, op1);
15211
15212   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15213   gcc_checking_assert (GET_MODE (op0) == vmode);
15214   gcc_checking_assert (GET_MODE (op1) == vmode);
15215   gcc_checking_assert (GET_MODE (sel) == vmode);
15216   gcc_checking_assert (TARGET_SIMD);
15217
15218   if (one_vector_p)
15219     {
15220       if (vmode == V8QImode)
15221         {
15222           /* Expand the argument to a V16QI mode by duplicating it.  */
15223           rtx pair = gen_reg_rtx (V16QImode);
15224           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15225           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15226         }
15227       else
15228         {
15229           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15230         }
15231     }
15232   else
15233     {
15234       rtx pair;
15235
15236       if (vmode == V8QImode)
15237         {
15238           pair = gen_reg_rtx (V16QImode);
15239           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15240           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15241         }
15242       else
15243         {
15244           pair = gen_reg_rtx (OImode);
15245           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15246           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15247         }
15248     }
15249 }
15250
15251 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15252    NELT is the number of elements in the vector.  */
15253
15254 void
15255 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15256                          unsigned int nelt)
15257 {
15258   machine_mode vmode = GET_MODE (target);
15259   bool one_vector_p = rtx_equal_p (op0, op1);
15260   rtx mask;
15261
15262   /* The TBL instruction does not use a modulo index, so we must take care
15263      of that ourselves.  */
15264   mask = aarch64_simd_gen_const_vector_dup (vmode,
15265       one_vector_p ? nelt - 1 : 2 * nelt - 1);
15266   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15267
15268   /* For big-endian, we also need to reverse the index within the vector
15269      (but not which vector).  */
15270   if (BYTES_BIG_ENDIAN)
15271     {
15272       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
15273       if (!one_vector_p)
15274         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15275       sel = expand_simple_binop (vmode, XOR, sel, mask,
15276                                  NULL, 0, OPTAB_LIB_WIDEN);
15277     }
15278   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15279 }
15280
15281 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
15282
15283 static void
15284 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15285 {
15286   emit_insn (gen_rtx_SET (target,
15287                           gen_rtx_UNSPEC (GET_MODE (target),
15288                                           gen_rtvec (2, op0, op1), code)));
15289 }
15290
15291 /* Expand an SVE vec_perm with the given operands.  */
15292
15293 void
15294 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15295 {
15296   machine_mode data_mode = GET_MODE (target);
15297   machine_mode sel_mode = GET_MODE (sel);
15298   /* Enforced by the pattern condition.  */
15299   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15300
15301   /* Note: vec_perm indices are supposed to wrap when they go beyond the
15302      size of the two value vectors, i.e. the upper bits of the indices
15303      are effectively ignored.  SVE TBL instead produces 0 for any
15304      out-of-range indices, so we need to modulo all the vec_perm indices
15305      to ensure they are all in range.  */
15306   rtx sel_reg = force_reg (sel_mode, sel);
15307
15308   /* Check if the sel only references the first values vector.  */
15309   if (GET_CODE (sel) == CONST_VECTOR
15310       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15311     {
15312       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15313       return;
15314     }
15315
15316   /* Check if the two values vectors are the same.  */
15317   if (rtx_equal_p (op0, op1))
15318     {
15319       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15320       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15321                                          NULL, 0, OPTAB_DIRECT);
15322       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15323       return;
15324     }
15325
15326   /* Run TBL on for each value vector and combine the results.  */
15327
15328   rtx res0 = gen_reg_rtx (data_mode);
15329   rtx res1 = gen_reg_rtx (data_mode);
15330   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15331   if (GET_CODE (sel) != CONST_VECTOR
15332       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15333     {
15334       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15335                                                        2 * nunits - 1);
15336       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15337                                      NULL, 0, OPTAB_DIRECT);
15338     }
15339   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15340   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15341                                      NULL, 0, OPTAB_DIRECT);
15342   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15343   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15344     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15345   else
15346     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15347 }
15348
15349 /* Recognize patterns suitable for the TRN instructions.  */
15350 static bool
15351 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15352 {
15353   HOST_WIDE_INT odd;
15354   poly_uint64 nelt = d->perm.length ();
15355   rtx out, in0, in1, x;
15356   machine_mode vmode = d->vmode;
15357
15358   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15359     return false;
15360
15361   /* Note that these are little-endian tests.
15362      We correct for big-endian later.  */
15363   if (!d->perm[0].is_constant (&odd)
15364       || (odd != 0 && odd != 1)
15365       || !d->perm.series_p (0, 2, odd, 2)
15366       || !d->perm.series_p (1, 2, nelt + odd, 2))
15367     return false;
15368
15369   /* Success!  */
15370   if (d->testing_p)
15371     return true;
15372
15373   in0 = d->op0;
15374   in1 = d->op1;
15375   /* We don't need a big-endian lane correction for SVE; see the comment
15376      at the head of aarch64-sve.md for details.  */
15377   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15378     {
15379       x = in0, in0 = in1, in1 = x;
15380       odd = !odd;
15381     }
15382   out = d->target;
15383
15384   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15385                                       odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15386   return true;
15387 }
15388
15389 /* Recognize patterns suitable for the UZP instructions.  */
15390 static bool
15391 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15392 {
15393   HOST_WIDE_INT odd;
15394   rtx out, in0, in1, x;
15395   machine_mode vmode = d->vmode;
15396
15397   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15398     return false;
15399
15400   /* Note that these are little-endian tests.
15401      We correct for big-endian later.  */
15402   if (!d->perm[0].is_constant (&odd)
15403       || (odd != 0 && odd != 1)
15404       || !d->perm.series_p (0, 1, odd, 2))
15405     return false;
15406
15407   /* Success!  */
15408   if (d->testing_p)
15409     return true;
15410
15411   in0 = d->op0;
15412   in1 = d->op1;
15413   /* We don't need a big-endian lane correction for SVE; see the comment
15414      at the head of aarch64-sve.md for details.  */
15415   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15416     {
15417       x = in0, in0 = in1, in1 = x;
15418       odd = !odd;
15419     }
15420   out = d->target;
15421
15422   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15423                                       odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15424   return true;
15425 }
15426
15427 /* Recognize patterns suitable for the ZIP instructions.  */
15428 static bool
15429 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15430 {
15431   unsigned int high;
15432   poly_uint64 nelt = d->perm.length ();
15433   rtx out, in0, in1, x;
15434   machine_mode vmode = d->vmode;
15435
15436   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15437     return false;
15438
15439   /* Note that these are little-endian tests.
15440      We correct for big-endian later.  */
15441   poly_uint64 first = d->perm[0];
15442   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15443       || !d->perm.series_p (0, 2, first, 1)
15444       || !d->perm.series_p (1, 2, first + nelt, 1))
15445     return false;
15446   high = maybe_ne (first, 0U);
15447
15448   /* Success!  */
15449   if (d->testing_p)
15450     return true;
15451
15452   in0 = d->op0;
15453   in1 = d->op1;
15454   /* We don't need a big-endian lane correction for SVE; see the comment
15455      at the head of aarch64-sve.md for details.  */
15456   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15457     {
15458       x = in0, in0 = in1, in1 = x;
15459       high = !high;
15460     }
15461   out = d->target;
15462
15463   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15464                                       high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15465   return true;
15466 }
15467
15468 /* Recognize patterns for the EXT insn.  */
15469
15470 static bool
15471 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15472 {
15473   HOST_WIDE_INT location;
15474   rtx offset;
15475
15476   /* The first element always refers to the first vector.
15477      Check if the extracted indices are increasing by one.  */
15478   if (d->vec_flags == VEC_SVE_PRED
15479       || !d->perm[0].is_constant (&location)
15480       || !d->perm.series_p (0, 1, location, 1))
15481     return false;
15482
15483   /* Success! */
15484   if (d->testing_p)
15485     return true;
15486
15487   /* The case where (location == 0) is a no-op for both big- and little-endian,
15488      and is removed by the mid-end at optimization levels -O1 and higher.
15489
15490      We don't need a big-endian lane correction for SVE; see the comment
15491      at the head of aarch64-sve.md for details.  */
15492   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15493     {
15494       /* After setup, we want the high elements of the first vector (stored
15495          at the LSB end of the register), and the low elements of the second
15496          vector (stored at the MSB end of the register). So swap.  */
15497       std::swap (d->op0, d->op1);
15498       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15499          to_constant () is safe since this is restricted to Advanced SIMD
15500          vectors.  */
15501       location = d->perm.length ().to_constant () - location;
15502     }
15503
15504   offset = GEN_INT (location);
15505   emit_set_insn (d->target,
15506                  gen_rtx_UNSPEC (d->vmode,
15507                                  gen_rtvec (3, d->op0, d->op1, offset),
15508                                  UNSPEC_EXT));
15509   return true;
15510 }
15511
15512 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15513    within each 64-bit, 32-bit or 16-bit granule.  */
15514
15515 static bool
15516 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15517 {
15518   HOST_WIDE_INT diff;
15519   unsigned int i, size, unspec;
15520   machine_mode pred_mode;
15521
15522   if (d->vec_flags == VEC_SVE_PRED
15523       || !d->one_vector_p
15524       || !d->perm[0].is_constant (&diff))
15525     return false;
15526
15527   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15528   if (size == 8)
15529     {
15530       unspec = UNSPEC_REV64;
15531       pred_mode = VNx2BImode;
15532     }
15533   else if (size == 4)
15534     {
15535       unspec = UNSPEC_REV32;
15536       pred_mode = VNx4BImode;
15537     }
15538   else if (size == 2)
15539     {
15540       unspec = UNSPEC_REV16;
15541       pred_mode = VNx8BImode;
15542     }
15543   else
15544     return false;
15545
15546   unsigned int step = diff + 1;
15547   for (i = 0; i < step; ++i)
15548     if (!d->perm.series_p (i, step, diff - i, step))
15549       return false;
15550
15551   /* Success! */
15552   if (d->testing_p)
15553     return true;
15554
15555   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15556   if (d->vec_flags == VEC_SVE_DATA)
15557     {
15558       rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15559       src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15560                             UNSPEC_MERGE_PTRUE);
15561     }
15562   emit_set_insn (d->target, src);
15563   return true;
15564 }
15565
15566 /* Recognize patterns for the REV insn, which reverses elements within
15567    a full vector.  */
15568
15569 static bool
15570 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15571 {
15572   poly_uint64 nelt = d->perm.length ();
15573
15574   if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15575     return false;
15576
15577   if (!d->perm.series_p (0, 1, nelt - 1, -1))
15578     return false;
15579
15580   /* Success! */
15581   if (d->testing_p)
15582     return true;
15583
15584   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15585   emit_set_insn (d->target, src);
15586   return true;
15587 }
15588
15589 static bool
15590 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15591 {
15592   rtx out = d->target;
15593   rtx in0;
15594   HOST_WIDE_INT elt;
15595   machine_mode vmode = d->vmode;
15596   rtx lane;
15597
15598   if (d->vec_flags == VEC_SVE_PRED
15599       || d->perm.encoding ().encoded_nelts () != 1
15600       || !d->perm[0].is_constant (&elt))
15601     return false;
15602
15603   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15604     return false;
15605
15606   /* Success! */
15607   if (d->testing_p)
15608     return true;
15609
15610   /* The generic preparation in aarch64_expand_vec_perm_const_1
15611      swaps the operand order and the permute indices if it finds
15612      d->perm[0] to be in the second operand.  Thus, we can always
15613      use d->op0 and need not do any extra arithmetic to get the
15614      correct lane number.  */
15615   in0 = d->op0;
15616   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
15617
15618   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15619   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15620   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15621   return true;
15622 }
15623
15624 static bool
15625 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15626 {
15627   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15628   machine_mode vmode = d->vmode;
15629
15630   /* Make sure that the indices are constant.  */
15631   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15632   for (unsigned int i = 0; i < encoded_nelts; ++i)
15633     if (!d->perm[i].is_constant ())
15634       return false;
15635
15636   if (d->testing_p)
15637     return true;
15638
15639   /* Generic code will try constant permutation twice.  Once with the
15640      original mode and again with the elements lowered to QImode.
15641      So wait and don't do the selector expansion ourselves.  */
15642   if (vmode != V8QImode && vmode != V16QImode)
15643     return false;
15644
15645   /* to_constant is safe since this routine is specific to Advanced SIMD
15646      vectors.  */
15647   unsigned int nelt = d->perm.length ().to_constant ();
15648   for (unsigned int i = 0; i < nelt; ++i)
15649     /* If big-endian and two vectors we end up with a weird mixed-endian
15650        mode on NEON.  Reverse the index within each word but not the word
15651        itself.  to_constant is safe because we checked is_constant above.  */
15652     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15653                         ? d->perm[i].to_constant () ^ (nelt - 1)
15654                         : d->perm[i].to_constant ());
15655
15656   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15657   sel = force_reg (vmode, sel);
15658
15659   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15660   return true;
15661 }
15662
15663 /* Try to implement D using an SVE TBL instruction.  */
15664
15665 static bool
15666 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15667 {
15668   unsigned HOST_WIDE_INT nelt;
15669
15670   /* Permuting two variable-length vectors could overflow the
15671      index range.  */
15672   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15673     return false;
15674
15675   if (d->testing_p)
15676     return true;
15677
15678   machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15679   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15680   aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15681   return true;
15682 }
15683
15684 static bool
15685 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15686 {
15687   /* The pattern matching functions above are written to look for a small
15688      number to begin the sequence (0, 1, N/2).  If we begin with an index
15689      from the second operand, we can swap the operands.  */
15690   poly_int64 nelt = d->perm.length ();
15691   if (known_ge (d->perm[0], nelt))
15692     {
15693       d->perm.rotate_inputs (1);
15694       std::swap (d->op0, d->op1);
15695     }
15696
15697   if ((d->vec_flags == VEC_ADVSIMD
15698        || d->vec_flags == VEC_SVE_DATA
15699        || d->vec_flags == VEC_SVE_PRED)
15700       && known_gt (nelt, 1))
15701     {
15702       if (aarch64_evpc_rev_local (d))
15703         return true;
15704       else if (aarch64_evpc_rev_global (d))
15705         return true;
15706       else if (aarch64_evpc_ext (d))
15707         return true;
15708       else if (aarch64_evpc_dup (d))
15709         return true;
15710       else if (aarch64_evpc_zip (d))
15711         return true;
15712       else if (aarch64_evpc_uzp (d))
15713         return true;
15714       else if (aarch64_evpc_trn (d))
15715         return true;
15716       if (d->vec_flags == VEC_SVE_DATA)
15717         return aarch64_evpc_sve_tbl (d);
15718       else if (d->vec_flags == VEC_SVE_DATA)
15719         return aarch64_evpc_tbl (d);
15720     }
15721   return false;
15722 }
15723
15724 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
15725
15726 static bool
15727 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15728                                   rtx op1, const vec_perm_indices &sel)
15729 {
15730   struct expand_vec_perm_d d;
15731
15732   /* Check whether the mask can be applied to a single vector.  */
15733   if (op0 && rtx_equal_p (op0, op1))
15734     d.one_vector_p = true;
15735   else if (sel.all_from_input_p (0))
15736     {
15737       d.one_vector_p = true;
15738       op1 = op0;
15739     }
15740   else if (sel.all_from_input_p (1))
15741     {
15742       d.one_vector_p = true;
15743       op0 = op1;
15744     }
15745   else
15746     d.one_vector_p = false;
15747
15748   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15749                      sel.nelts_per_input ());
15750   d.vmode = vmode;
15751   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15752   d.target = target;
15753   d.op0 = op0;
15754   d.op1 = op1;
15755   d.testing_p = !target;
15756
15757   if (!d.testing_p)
15758     return aarch64_expand_vec_perm_const_1 (&d);
15759
15760   rtx_insn *last = get_last_insn ();
15761   bool ret = aarch64_expand_vec_perm_const_1 (&d);
15762   gcc_assert (last == get_last_insn ());
15763
15764   return ret;
15765 }
15766
15767 /* Generate a byte permute mask for a register of mode MODE,
15768    which has NUNITS units.  */
15769
15770 rtx
15771 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15772 {
15773   /* We have to reverse each vector because we dont have
15774      a permuted load that can reverse-load according to ABI rules.  */
15775   rtx mask;
15776   rtvec v = rtvec_alloc (16);
15777   unsigned int i, j;
15778   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15779
15780   gcc_assert (BYTES_BIG_ENDIAN);
15781   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15782
15783   for (i = 0; i < nunits; i++)
15784     for (j = 0; j < usize; j++)
15785       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15786   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15787   return force_reg (V16QImode, mask);
15788 }
15789
15790 /* Return true if X is a valid second operand for the SVE instruction
15791    that implements integer comparison OP_CODE.  */
15792
15793 static bool
15794 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15795 {
15796   if (register_operand (x, VOIDmode))
15797     return true;
15798
15799   switch (op_code)
15800     {
15801     case LTU:
15802     case LEU:
15803     case GEU:
15804     case GTU:
15805       return aarch64_sve_cmp_immediate_p (x, false);
15806     case LT:
15807     case LE:
15808     case GE:
15809     case GT:
15810     case NE:
15811     case EQ:
15812       return aarch64_sve_cmp_immediate_p (x, true);
15813     default:
15814       gcc_unreachable ();
15815     }
15816 }
15817
15818 /* Use predicated SVE instructions to implement the equivalent of:
15819
15820      (set TARGET OP)
15821
15822    given that PTRUE is an all-true predicate of the appropriate mode.  */
15823
15824 static void
15825 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15826 {
15827   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15828                                gen_rtvec (2, ptrue, op),
15829                                UNSPEC_MERGE_PTRUE);
15830   rtx_insn *insn = emit_set_insn (target, unspec);
15831   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15832 }
15833
15834 /* Likewise, but also clobber the condition codes.  */
15835
15836 static void
15837 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15838 {
15839   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15840                                gen_rtvec (2, ptrue, op),
15841                                UNSPEC_MERGE_PTRUE);
15842   rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15843   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15844 }
15845
15846 /* Return the UNSPEC_COND_* code for comparison CODE.  */
15847
15848 static unsigned int
15849 aarch64_unspec_cond_code (rtx_code code)
15850 {
15851   switch (code)
15852     {
15853     case NE:
15854       return UNSPEC_COND_NE;
15855     case EQ:
15856       return UNSPEC_COND_EQ;
15857     case LT:
15858       return UNSPEC_COND_LT;
15859     case GT:
15860       return UNSPEC_COND_GT;
15861     case LE:
15862       return UNSPEC_COND_LE;
15863     case GE:
15864       return UNSPEC_COND_GE;
15865     default:
15866       gcc_unreachable ();
15867     }
15868 }
15869
15870 /* Emit:
15871
15872       (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15873
15874    where <X> is the operation associated with comparison CODE.  This form
15875    of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15876    semantics, such as when PRED might not be all-true and when comparing
15877    inactive lanes could have side effects.  */
15878
15879 static void
15880 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15881                                   rtx pred, rtx op0, rtx op1)
15882 {
15883   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15884                                gen_rtvec (3, pred, op0, op1),
15885                                aarch64_unspec_cond_code (code));
15886   emit_set_insn (target, unspec);
15887 }
15888
15889 /* Expand an SVE integer comparison using the SVE equivalent of:
15890
15891      (set TARGET (CODE OP0 OP1)).  */
15892
15893 void
15894 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15895 {
15896   machine_mode pred_mode = GET_MODE (target);
15897   machine_mode data_mode = GET_MODE (op0);
15898
15899   if (!aarch64_sve_cmp_operand_p (code, op1))
15900     op1 = force_reg (data_mode, op1);
15901
15902   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15903   rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15904   aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15905 }
15906
15907 /* Emit the SVE equivalent of:
15908
15909       (set TMP1 (CODE1 OP0 OP1))
15910       (set TMP2 (CODE2 OP0 OP1))
15911       (set TARGET (ior:PRED_MODE TMP1 TMP2))
15912
15913    PTRUE is an all-true predicate with the same mode as TARGET.  */
15914
15915 static void
15916 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15917                            rtx ptrue, rtx op0, rtx op1)
15918 {
15919   machine_mode pred_mode = GET_MODE (ptrue);
15920   rtx tmp1 = gen_reg_rtx (pred_mode);
15921   aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15922                              gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15923   rtx tmp2 = gen_reg_rtx (pred_mode);
15924   aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15925                              gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15926   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15927 }
15928
15929 /* Emit the SVE equivalent of:
15930
15931       (set TMP (CODE OP0 OP1))
15932       (set TARGET (not TMP))
15933
15934    PTRUE is an all-true predicate with the same mode as TARGET.  */
15935
15936 static void
15937 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15938                                 rtx op0, rtx op1)
15939 {
15940   machine_mode pred_mode = GET_MODE (ptrue);
15941   rtx tmp = gen_reg_rtx (pred_mode);
15942   aarch64_emit_sve_ptrue_op (tmp, ptrue,
15943                              gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15944   aarch64_emit_unop (target, one_cmpl_optab, tmp);
15945 }
15946
15947 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15948
15949      (set TARGET (CODE OP0 OP1))
15950
15951    If CAN_INVERT_P is true, the caller can also handle inverted results;
15952    return true if the result is in fact inverted.  */
15953
15954 bool
15955 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15956                                   rtx op0, rtx op1, bool can_invert_p)
15957 {
15958   machine_mode pred_mode = GET_MODE (target);
15959   machine_mode data_mode = GET_MODE (op0);
15960
15961   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15962   switch (code)
15963     {
15964     case UNORDERED:
15965       /* UNORDERED has no immediate form.  */
15966       op1 = force_reg (data_mode, op1);
15967       /* fall through */
15968     case LT:
15969     case LE:
15970     case GT:
15971     case GE:
15972     case EQ:
15973     case NE:
15974       {
15975         /* There is native support for the comparison.  */
15976         rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15977         aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15978         return false;
15979       }
15980
15981     case LTGT:
15982       /* This is a trapping operation (LT or GT).  */
15983       aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15984       return false;
15985
15986     case UNEQ:
15987       if (!flag_trapping_math)
15988         {
15989           /* This would trap for signaling NaNs.  */
15990           op1 = force_reg (data_mode, op1);
15991           aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15992           return false;
15993         }
15994       /* fall through */
15995     case UNLT:
15996     case UNLE:
15997     case UNGT:
15998     case UNGE:
15999       if (flag_trapping_math)
16000         {
16001           /* Work out which elements are ordered.  */
16002           rtx ordered = gen_reg_rtx (pred_mode);
16003           op1 = force_reg (data_mode, op1);
16004           aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16005
16006           /* Test the opposite condition for the ordered elements,
16007              then invert the result.  */
16008           if (code == UNEQ)
16009             code = NE;
16010           else
16011             code = reverse_condition_maybe_unordered (code);
16012           if (can_invert_p)
16013             {
16014               aarch64_emit_sve_predicated_cond (target, code,
16015                                                 ordered, op0, op1);
16016               return true;
16017             }
16018           rtx tmp = gen_reg_rtx (pred_mode);
16019           aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16020           aarch64_emit_unop (target, one_cmpl_optab, tmp);
16021           return false;
16022         }
16023       break;
16024
16025     case ORDERED:
16026       /* ORDERED has no immediate form.  */
16027       op1 = force_reg (data_mode, op1);
16028       break;
16029
16030     default:
16031       gcc_unreachable ();
16032     }
16033
16034   /* There is native support for the inverse comparison.  */
16035   code = reverse_condition_maybe_unordered (code);
16036   if (can_invert_p)
16037     {
16038       rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16039       aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16040       return true;
16041     }
16042   aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16043   return false;
16044 }
16045
16046 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
16047    of the data being selected and CMP_MODE is the mode of the values being
16048    compared.  */
16049
16050 void
16051 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16052                           rtx *ops)
16053 {
16054   machine_mode pred_mode
16055     = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16056                              GET_MODE_SIZE (cmp_mode)).require ();
16057   rtx pred = gen_reg_rtx (pred_mode);
16058   if (FLOAT_MODE_P (cmp_mode))
16059     {
16060       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16061                                             ops[4], ops[5], true))
16062         std::swap (ops[1], ops[2]);
16063     }
16064   else
16065     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16066
16067   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16068   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16069 }
16070
16071 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
16072    true.  However due to issues with register allocation it is preferable
16073    to avoid tieing integer scalar and FP scalar modes.  Executing integer
16074    operations in general registers is better than treating them as scalar
16075    vector operations.  This reduces latency and avoids redundant int<->FP
16076    moves.  So tie modes if they are either the same class, or vector modes
16077    with other vector modes, vector structs or any scalar mode.  */
16078
16079 static bool
16080 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16081 {
16082   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16083     return true;
16084
16085   /* We specifically want to allow elements of "structure" modes to
16086      be tieable to the structure.  This more general condition allows
16087      other rarer situations too.  The reason we don't extend this to
16088      predicate modes is that there are no predicate structure modes
16089      nor any specific instructions for extracting part of a predicate
16090      register.  */
16091   if (aarch64_vector_data_mode_p (mode1)
16092       && aarch64_vector_data_mode_p (mode2))
16093     return true;
16094
16095   /* Also allow any scalar modes with vectors.  */
16096   if (aarch64_vector_mode_supported_p (mode1)
16097       || aarch64_vector_mode_supported_p (mode2))
16098     return true;
16099
16100   return false;
16101 }
16102
16103 /* Return a new RTX holding the result of moving POINTER forward by
16104    AMOUNT bytes.  */
16105
16106 static rtx
16107 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16108 {
16109   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16110
16111   return adjust_automodify_address (pointer, GET_MODE (pointer),
16112                                     next, amount);
16113 }
16114
16115 /* Return a new RTX holding the result of moving POINTER forward by the
16116    size of the mode it points to.  */
16117
16118 static rtx
16119 aarch64_progress_pointer (rtx pointer)
16120 {
16121   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16122 }
16123
16124 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16125    MODE bytes.  */
16126
16127 static void
16128 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16129                                               machine_mode mode)
16130 {
16131   rtx reg = gen_reg_rtx (mode);
16132
16133   /* "Cast" the pointers to the correct mode.  */
16134   *src = adjust_address (*src, mode, 0);
16135   *dst = adjust_address (*dst, mode, 0);
16136   /* Emit the memcpy.  */
16137   emit_move_insn (reg, *src);
16138   emit_move_insn (*dst, reg);
16139   /* Move the pointers forward.  */
16140   *src = aarch64_progress_pointer (*src);
16141   *dst = aarch64_progress_pointer (*dst);
16142 }
16143
16144 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
16145    we succeed, otherwise return false.  */
16146
16147 bool
16148 aarch64_expand_movmem (rtx *operands)
16149 {
16150   int n, mode_bits;
16151   rtx dst = operands[0];
16152   rtx src = operands[1];
16153   rtx base;
16154   machine_mode cur_mode = BLKmode, next_mode;
16155   bool speed_p = !optimize_function_for_size_p (cfun);
16156
16157   /* When optimizing for size, give a better estimate of the length of a
16158      memcpy call, but use the default otherwise.  Moves larger than 8 bytes
16159      will always require an even number of instructions to do now.  And each
16160      operation requires both a load+store, so devide the max number by 2.  */
16161   int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16162
16163   /* We can't do anything smart if the amount to copy is not constant.  */
16164   if (!CONST_INT_P (operands[2]))
16165     return false;
16166
16167   n = INTVAL (operands[2]);
16168
16169   /* Try to keep the number of instructions low.  For all cases we will do at
16170      most two moves for the residual amount, since we'll always overlap the
16171      remainder.  */
16172   if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16173     return false;
16174
16175   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16176   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16177
16178   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16179   src = adjust_automodify_address (src, VOIDmode, base, 0);
16180
16181   /* Convert n to bits to make the rest of the code simpler.  */
16182   n = n * BITS_PER_UNIT;
16183
16184   while (n > 0)
16185     {
16186       /* Find the largest mode in which to do the copy in without over reading
16187          or writing.  */
16188       opt_scalar_int_mode mode_iter;
16189       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16190         if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
16191           cur_mode = mode_iter.require ();
16192
16193       gcc_assert (cur_mode != BLKmode);
16194
16195       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16196       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16197
16198       n -= mode_bits;
16199
16200       /* Do certain trailing copies as overlapping if it's going to be
16201          cheaper.  i.e. less instructions to do so.  For instance doing a 15
16202          byte copy it's more efficient to do two overlapping 8 byte copies than
16203          8 + 6 + 1.  */
16204       next_mode = smallest_mode_for_size (n, MODE_INT);
16205       int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16206       if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
16207         {
16208           src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16209           dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16210           n = n_bits;
16211         }
16212     }
16213
16214   return true;
16215 }
16216
16217 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16218    SImode stores.  Handle the case when the constant has identical
16219    bottom and top halves.  This is beneficial when the two stores can be
16220    merged into an STP and we avoid synthesising potentially expensive
16221    immediates twice.  Return true if such a split is possible.  */
16222
16223 bool
16224 aarch64_split_dimode_const_store (rtx dst, rtx src)
16225 {
16226   rtx lo = gen_lowpart (SImode, src);
16227   rtx hi = gen_highpart_mode (SImode, DImode, src);
16228
16229   bool size_p = optimize_function_for_size_p (cfun);
16230
16231   if (!rtx_equal_p (lo, hi))
16232     return false;
16233
16234   unsigned int orig_cost
16235     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16236   unsigned int lo_cost
16237     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16238
16239   /* We want to transform:
16240      MOV        x1, 49370
16241      MOVK       x1, 0x140, lsl 16
16242      MOVK       x1, 0xc0da, lsl 32
16243      MOVK       x1, 0x140, lsl 48
16244      STR        x1, [x0]
16245    into:
16246      MOV        w1, 49370
16247      MOVK       w1, 0x140, lsl 16
16248      STP        w1, w1, [x0]
16249    So we want to perform this only when we save two instructions
16250    or more.  When optimizing for size, however, accept any code size
16251    savings we can.  */
16252   if (size_p && orig_cost <= lo_cost)
16253     return false;
16254
16255   if (!size_p
16256       && (orig_cost <= lo_cost + 1))
16257     return false;
16258
16259   rtx mem_lo = adjust_address (dst, SImode, 0);
16260   if (!aarch64_mem_pair_operand (mem_lo, SImode))
16261     return false;
16262
16263   rtx tmp_reg = gen_reg_rtx (SImode);
16264   aarch64_expand_mov_immediate (tmp_reg, lo);
16265   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16266   /* Don't emit an explicit store pair as this may not be always profitable.
16267      Let the sched-fusion logic decide whether to merge them.  */
16268   emit_move_insn (mem_lo, tmp_reg);
16269   emit_move_insn (mem_hi, tmp_reg);
16270
16271   return true;
16272 }
16273
16274 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
16275
16276 static unsigned HOST_WIDE_INT
16277 aarch64_asan_shadow_offset (void)
16278 {
16279   return (HOST_WIDE_INT_1 << 36);
16280 }
16281
16282 static rtx
16283 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16284                         int code, tree treeop0, tree treeop1)
16285 {
16286   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16287   rtx op0, op1;
16288   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16289   insn_code icode;
16290   struct expand_operand ops[4];
16291
16292   start_sequence ();
16293   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16294
16295   op_mode = GET_MODE (op0);
16296   if (op_mode == VOIDmode)
16297     op_mode = GET_MODE (op1);
16298
16299   switch (op_mode)
16300     {
16301     case E_QImode:
16302     case E_HImode:
16303     case E_SImode:
16304       cmp_mode = SImode;
16305       icode = CODE_FOR_cmpsi;
16306       break;
16307
16308     case E_DImode:
16309       cmp_mode = DImode;
16310       icode = CODE_FOR_cmpdi;
16311       break;
16312
16313     case E_SFmode:
16314       cmp_mode = SFmode;
16315       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16316       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16317       break;
16318
16319     case E_DFmode:
16320       cmp_mode = DFmode;
16321       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16322       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16323       break;
16324
16325     default:
16326       end_sequence ();
16327       return NULL_RTX;
16328     }
16329
16330   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16331   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16332   if (!op0 || !op1)
16333     {
16334       end_sequence ();
16335       return NULL_RTX;
16336     }
16337   *prep_seq = get_insns ();
16338   end_sequence ();
16339
16340   create_fixed_operand (&ops[0], op0);
16341   create_fixed_operand (&ops[1], op1);
16342
16343   start_sequence ();
16344   if (!maybe_expand_insn (icode, 2, ops))
16345     {
16346       end_sequence ();
16347       return NULL_RTX;
16348     }
16349   *gen_seq = get_insns ();
16350   end_sequence ();
16351
16352   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16353                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16354 }
16355
16356 static rtx
16357 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16358                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
16359 {
16360   rtx op0, op1, target;
16361   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16362   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16363   insn_code icode;
16364   struct expand_operand ops[6];
16365   int aarch64_cond;
16366
16367   push_to_sequence (*prep_seq);
16368   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16369
16370   op_mode = GET_MODE (op0);
16371   if (op_mode == VOIDmode)
16372     op_mode = GET_MODE (op1);
16373
16374   switch (op_mode)
16375     {
16376     case E_QImode:
16377     case E_HImode:
16378     case E_SImode:
16379       cmp_mode = SImode;
16380       icode = CODE_FOR_ccmpsi;
16381       break;
16382
16383     case E_DImode:
16384       cmp_mode = DImode;
16385       icode = CODE_FOR_ccmpdi;
16386       break;
16387
16388     case E_SFmode:
16389       cmp_mode = SFmode;
16390       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16391       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16392       break;
16393
16394     case E_DFmode:
16395       cmp_mode = DFmode;
16396       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16397       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16398       break;
16399
16400     default:
16401       end_sequence ();
16402       return NULL_RTX;
16403     }
16404
16405   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16406   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16407   if (!op0 || !op1)
16408     {
16409       end_sequence ();
16410       return NULL_RTX;
16411     }
16412   *prep_seq = get_insns ();
16413   end_sequence ();
16414
16415   target = gen_rtx_REG (cc_mode, CC_REGNUM);
16416   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16417
16418   if (bit_code != AND)
16419     {
16420       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16421                                                 GET_MODE (XEXP (prev, 0))),
16422                              VOIDmode, XEXP (prev, 0), const0_rtx);
16423       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16424     }
16425
16426   create_fixed_operand (&ops[0], XEXP (prev, 0));
16427   create_fixed_operand (&ops[1], target);
16428   create_fixed_operand (&ops[2], op0);
16429   create_fixed_operand (&ops[3], op1);
16430   create_fixed_operand (&ops[4], prev);
16431   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16432
16433   push_to_sequence (*gen_seq);
16434   if (!maybe_expand_insn (icode, 6, ops))
16435     {
16436       end_sequence ();
16437       return NULL_RTX;
16438     }
16439
16440   *gen_seq = get_insns ();
16441   end_sequence ();
16442
16443   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16444 }
16445
16446 #undef TARGET_GEN_CCMP_FIRST
16447 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16448
16449 #undef TARGET_GEN_CCMP_NEXT
16450 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16451
16452 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
16453    instruction fusion of some sort.  */
16454
16455 static bool
16456 aarch64_macro_fusion_p (void)
16457 {
16458   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16459 }
16460
16461
16462 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
16463    should be kept together during scheduling.  */
16464
16465 static bool
16466 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16467 {
16468   rtx set_dest;
16469   rtx prev_set = single_set (prev);
16470   rtx curr_set = single_set (curr);
16471   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
16472   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16473
16474   if (!aarch64_macro_fusion_p ())
16475     return false;
16476
16477   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16478     {
16479       /* We are trying to match:
16480          prev (mov)  == (set (reg r0) (const_int imm16))
16481          curr (movk) == (set (zero_extract (reg r0)
16482                                            (const_int 16)
16483                                            (const_int 16))
16484                              (const_int imm16_1))  */
16485
16486       set_dest = SET_DEST (curr_set);
16487
16488       if (GET_CODE (set_dest) == ZERO_EXTRACT
16489           && CONST_INT_P (SET_SRC (curr_set))
16490           && CONST_INT_P (SET_SRC (prev_set))
16491           && CONST_INT_P (XEXP (set_dest, 2))
16492           && INTVAL (XEXP (set_dest, 2)) == 16
16493           && REG_P (XEXP (set_dest, 0))
16494           && REG_P (SET_DEST (prev_set))
16495           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16496         {
16497           return true;
16498         }
16499     }
16500
16501   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16502     {
16503
16504       /*  We're trying to match:
16505           prev (adrp) == (set (reg r1)
16506                               (high (symbol_ref ("SYM"))))
16507           curr (add) == (set (reg r0)
16508                              (lo_sum (reg r1)
16509                                      (symbol_ref ("SYM"))))
16510           Note that r0 need not necessarily be the same as r1, especially
16511           during pre-regalloc scheduling.  */
16512
16513       if (satisfies_constraint_Ush (SET_SRC (prev_set))
16514           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16515         {
16516           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16517               && REG_P (XEXP (SET_SRC (curr_set), 0))
16518               && REGNO (XEXP (SET_SRC (curr_set), 0))
16519                  == REGNO (SET_DEST (prev_set))
16520               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16521                               XEXP (SET_SRC (curr_set), 1)))
16522             return true;
16523         }
16524     }
16525
16526   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16527     {
16528
16529       /* We're trying to match:
16530          prev (movk) == (set (zero_extract (reg r0)
16531                                            (const_int 16)
16532                                            (const_int 32))
16533                              (const_int imm16_1))
16534          curr (movk) == (set (zero_extract (reg r0)
16535                                            (const_int 16)
16536                                            (const_int 48))
16537                              (const_int imm16_2))  */
16538
16539       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16540           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16541           && REG_P (XEXP (SET_DEST (prev_set), 0))
16542           && REG_P (XEXP (SET_DEST (curr_set), 0))
16543           && REGNO (XEXP (SET_DEST (prev_set), 0))
16544              == REGNO (XEXP (SET_DEST (curr_set), 0))
16545           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16546           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16547           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16548           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16549           && CONST_INT_P (SET_SRC (prev_set))
16550           && CONST_INT_P (SET_SRC (curr_set)))
16551         return true;
16552
16553     }
16554   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16555     {
16556       /* We're trying to match:
16557           prev (adrp) == (set (reg r0)
16558                               (high (symbol_ref ("SYM"))))
16559           curr (ldr) == (set (reg r1)
16560                              (mem (lo_sum (reg r0)
16561                                              (symbol_ref ("SYM")))))
16562                  or
16563           curr (ldr) == (set (reg r1)
16564                              (zero_extend (mem
16565                                            (lo_sum (reg r0)
16566                                                    (symbol_ref ("SYM"))))))  */
16567       if (satisfies_constraint_Ush (SET_SRC (prev_set))
16568           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16569         {
16570           rtx curr_src = SET_SRC (curr_set);
16571
16572           if (GET_CODE (curr_src) == ZERO_EXTEND)
16573             curr_src = XEXP (curr_src, 0);
16574
16575           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16576               && REG_P (XEXP (XEXP (curr_src, 0), 0))
16577               && REGNO (XEXP (XEXP (curr_src, 0), 0))
16578                  == REGNO (SET_DEST (prev_set))
16579               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16580                               XEXP (SET_SRC (prev_set), 0)))
16581               return true;
16582         }
16583     }
16584
16585   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16586        && aarch_crypto_can_dual_issue (prev, curr))
16587     return true;
16588
16589   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16590       && any_condjump_p (curr))
16591     {
16592       enum attr_type prev_type = get_attr_type (prev);
16593
16594       unsigned int condreg1, condreg2;
16595       rtx cc_reg_1;
16596       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16597       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16598
16599       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16600           && prev
16601           && modified_in_p (cc_reg_1, prev))
16602         {
16603           /* FIXME: this misses some which is considered simple arthematic
16604              instructions for ThunderX.  Simple shifts are missed here.  */
16605           if (prev_type == TYPE_ALUS_SREG
16606               || prev_type == TYPE_ALUS_IMM
16607               || prev_type == TYPE_LOGICS_REG
16608               || prev_type == TYPE_LOGICS_IMM)
16609             return true;
16610         }
16611     }
16612
16613   if (prev_set
16614       && curr_set
16615       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16616       && any_condjump_p (curr))
16617     {
16618       /* We're trying to match:
16619           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16620           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
16621                                                          (const_int 0))
16622                                                  (label_ref ("SYM"))
16623                                                  (pc))  */
16624       if (SET_DEST (curr_set) == (pc_rtx)
16625           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16626           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16627           && REG_P (SET_DEST (prev_set))
16628           && REGNO (SET_DEST (prev_set))
16629              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16630         {
16631           /* Fuse ALU operations followed by conditional branch instruction.  */
16632           switch (get_attr_type (prev))
16633             {
16634             case TYPE_ALU_IMM:
16635             case TYPE_ALU_SREG:
16636             case TYPE_ADC_REG:
16637             case TYPE_ADC_IMM:
16638             case TYPE_ADCS_REG:
16639             case TYPE_ADCS_IMM:
16640             case TYPE_LOGIC_REG:
16641             case TYPE_LOGIC_IMM:
16642             case TYPE_CSEL:
16643             case TYPE_ADR:
16644             case TYPE_MOV_IMM:
16645             case TYPE_SHIFT_REG:
16646             case TYPE_SHIFT_IMM:
16647             case TYPE_BFM:
16648             case TYPE_RBIT:
16649             case TYPE_REV:
16650             case TYPE_EXTEND:
16651               return true;
16652
16653             default:;
16654             }
16655         }
16656     }
16657
16658   return false;
16659 }
16660
16661 /* Return true iff the instruction fusion described by OP is enabled.  */
16662
16663 bool
16664 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16665 {
16666   return (aarch64_tune_params.fusible_ops & op) != 0;
16667 }
16668
16669 /* If MEM is in the form of [base+offset], extract the two parts
16670    of address and set to BASE and OFFSET, otherwise return false
16671    after clearing BASE and OFFSET.  */
16672
16673 bool
16674 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16675 {
16676   rtx addr;
16677
16678   gcc_assert (MEM_P (mem));
16679
16680   addr = XEXP (mem, 0);
16681
16682   if (REG_P (addr))
16683     {
16684       *base = addr;
16685       *offset = const0_rtx;
16686       return true;
16687     }
16688
16689   if (GET_CODE (addr) == PLUS
16690       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16691     {
16692       *base = XEXP (addr, 0);
16693       *offset = XEXP (addr, 1);
16694       return true;
16695     }
16696
16697   *base = NULL_RTX;
16698   *offset = NULL_RTX;
16699
16700   return false;
16701 }
16702
16703 /* Types for scheduling fusion.  */
16704 enum sched_fusion_type
16705 {
16706   SCHED_FUSION_NONE = 0,
16707   SCHED_FUSION_LD_SIGN_EXTEND,
16708   SCHED_FUSION_LD_ZERO_EXTEND,
16709   SCHED_FUSION_LD,
16710   SCHED_FUSION_ST,
16711   SCHED_FUSION_NUM
16712 };
16713
16714 /* If INSN is a load or store of address in the form of [base+offset],
16715    extract the two parts and set to BASE and OFFSET.  Return scheduling
16716    fusion type this INSN is.  */
16717
16718 static enum sched_fusion_type
16719 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16720 {
16721   rtx x, dest, src;
16722   enum sched_fusion_type fusion = SCHED_FUSION_LD;
16723
16724   gcc_assert (INSN_P (insn));
16725   x = PATTERN (insn);
16726   if (GET_CODE (x) != SET)
16727     return SCHED_FUSION_NONE;
16728
16729   src = SET_SRC (x);
16730   dest = SET_DEST (x);
16731
16732   machine_mode dest_mode = GET_MODE (dest);
16733
16734   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16735     return SCHED_FUSION_NONE;
16736
16737   if (GET_CODE (src) == SIGN_EXTEND)
16738     {
16739       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16740       src = XEXP (src, 0);
16741       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16742         return SCHED_FUSION_NONE;
16743     }
16744   else if (GET_CODE (src) == ZERO_EXTEND)
16745     {
16746       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16747       src = XEXP (src, 0);
16748       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16749         return SCHED_FUSION_NONE;
16750     }
16751
16752   if (GET_CODE (src) == MEM && REG_P (dest))
16753     extract_base_offset_in_addr (src, base, offset);
16754   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16755     {
16756       fusion = SCHED_FUSION_ST;
16757       extract_base_offset_in_addr (dest, base, offset);
16758     }
16759   else
16760     return SCHED_FUSION_NONE;
16761
16762   if (*base == NULL_RTX || *offset == NULL_RTX)
16763     fusion = SCHED_FUSION_NONE;
16764
16765   return fusion;
16766 }
16767
16768 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16769
16770    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
16771    and PRI are only calculated for these instructions.  For other instruction,
16772    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16773    type instruction fusion can be added by returning different priorities.
16774
16775    It's important that irrelevant instructions get the largest FUSION_PRI.  */
16776
16777 static void
16778 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16779                                int *fusion_pri, int *pri)
16780 {
16781   int tmp, off_val;
16782   rtx base, offset;
16783   enum sched_fusion_type fusion;
16784
16785   gcc_assert (INSN_P (insn));
16786
16787   tmp = max_pri - 1;
16788   fusion = fusion_load_store (insn, &base, &offset);
16789   if (fusion == SCHED_FUSION_NONE)
16790     {
16791       *pri = tmp;
16792       *fusion_pri = tmp;
16793       return;
16794     }
16795
16796   /* Set FUSION_PRI according to fusion type and base register.  */
16797   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16798
16799   /* Calculate PRI.  */
16800   tmp /= 2;
16801
16802   /* INSN with smaller offset goes first.  */
16803   off_val = (int)(INTVAL (offset));
16804   if (off_val >= 0)
16805     tmp -= (off_val & 0xfffff);
16806   else
16807     tmp += ((- off_val) & 0xfffff);
16808
16809   *pri = tmp;
16810   return;
16811 }
16812
16813 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16814    Adjust priority of sha1h instructions so they are scheduled before
16815    other SHA1 instructions.  */
16816
16817 static int
16818 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16819 {
16820   rtx x = PATTERN (insn);
16821
16822   if (GET_CODE (x) == SET)
16823     {
16824       x = SET_SRC (x);
16825
16826       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16827         return priority + 10;
16828     }
16829
16830   return priority;
16831 }
16832
16833 /* Given OPERANDS of consecutive load/store, check if we can merge
16834    them into ldp/stp.  LOAD is true if they are load instructions.
16835    MODE is the mode of memory operands.  */
16836
16837 bool
16838 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16839                                 machine_mode mode)
16840 {
16841   HOST_WIDE_INT offval_1, offval_2, msize;
16842   enum reg_class rclass_1, rclass_2;
16843   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16844
16845   if (load)
16846     {
16847       mem_1 = operands[1];
16848       mem_2 = operands[3];
16849       reg_1 = operands[0];
16850       reg_2 = operands[2];
16851       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16852       if (REGNO (reg_1) == REGNO (reg_2))
16853         return false;
16854     }
16855   else
16856     {
16857       mem_1 = operands[0];
16858       mem_2 = operands[2];
16859       reg_1 = operands[1];
16860       reg_2 = operands[3];
16861     }
16862
16863   /* The mems cannot be volatile.  */
16864   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16865     return false;
16866
16867   /* If we have SImode and slow unaligned ldp,
16868      check the alignment to be at least 8 byte. */
16869   if (mode == SImode
16870       && (aarch64_tune_params.extra_tuning_flags
16871           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16872       && !optimize_size
16873       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16874     return false;
16875
16876   /* Check if the addresses are in the form of [base+offset].  */
16877   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16878   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16879     return false;
16880   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16881   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16882     return false;
16883
16884   /* Check if the bases are same.  */
16885   if (!rtx_equal_p (base_1, base_2))
16886     return false;
16887
16888   /* The operands must be of the same size.  */
16889   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16890                          GET_MODE_SIZE (GET_MODE (mem_2))));
16891
16892   offval_1 = INTVAL (offset_1);
16893   offval_2 = INTVAL (offset_2);
16894   /* We should only be trying this for fixed-sized modes.  There is no
16895      SVE LDP/STP instruction.  */
16896   msize = GET_MODE_SIZE (mode).to_constant ();
16897   /* Check if the offsets are consecutive.  */
16898   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16899     return false;
16900
16901   /* Check if the addresses are clobbered by load.  */
16902   if (load)
16903     {
16904       if (reg_mentioned_p (reg_1, mem_1))
16905         return false;
16906
16907       /* In increasing order, the last load can clobber the address.  */
16908       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16909         return false;
16910     }
16911
16912   /* One of the memory accesses must be a mempair operand.
16913      If it is not the first one, they need to be swapped by the
16914      peephole.  */
16915   if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16916        && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16917     return false;
16918
16919   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16920     rclass_1 = FP_REGS;
16921   else
16922     rclass_1 = GENERAL_REGS;
16923
16924   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16925     rclass_2 = FP_REGS;
16926   else
16927     rclass_2 = GENERAL_REGS;
16928
16929   /* Check if the registers are of same class.  */
16930   if (rclass_1 != rclass_2)
16931     return false;
16932
16933   return true;
16934 }
16935
16936 /* Given OPERANDS of consecutive load/store that can be merged,
16937    swap them if they are not in ascending order.  */
16938 void
16939 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16940 {
16941   rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16942   HOST_WIDE_INT offval_1, offval_2;
16943
16944   if (load)
16945     {
16946       mem_1 = operands[1];
16947       mem_2 = operands[3];
16948     }
16949   else
16950     {
16951       mem_1 = operands[0];
16952       mem_2 = operands[2];
16953     }
16954
16955   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16956   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16957
16958   offval_1 = INTVAL (offset_1);
16959   offval_2 = INTVAL (offset_2);
16960
16961   if (offval_1 > offval_2)
16962     {
16963       /* Irrespective of whether this is a load or a store,
16964          we do the same swap.  */
16965       std::swap (operands[0], operands[2]);
16966       std::swap (operands[1], operands[3]);
16967     }
16968 }
16969
16970 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16971    comparison between the two.  */
16972 int
16973 aarch64_host_wide_int_compare (const void *x, const void *y)
16974 {
16975   return wi::cmps (* ((const HOST_WIDE_INT *) x),
16976                    * ((const HOST_WIDE_INT *) y));
16977 }
16978
16979 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16980    other pointing to a REG rtx containing an offset, compare the offsets
16981    of the two pairs.
16982
16983    Return:
16984
16985         1 iff offset (X) > offset (Y)
16986         0 iff offset (X) == offset (Y)
16987         -1 iff offset (X) < offset (Y)  */
16988 int
16989 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16990 {
16991   const rtx * operands_1 = (const rtx *) x;
16992   const rtx * operands_2 = (const rtx *) y;
16993   rtx mem_1, mem_2, base, offset_1, offset_2;
16994
16995   if (MEM_P (operands_1[0]))
16996     mem_1 = operands_1[0];
16997   else
16998     mem_1 = operands_1[1];
16999
17000   if (MEM_P (operands_2[0]))
17001     mem_2 = operands_2[0];
17002   else
17003     mem_2 = operands_2[1];
17004
17005   /* Extract the offsets.  */
17006   extract_base_offset_in_addr (mem_1, &base, &offset_1);
17007   extract_base_offset_in_addr (mem_2, &base, &offset_2);
17008
17009   gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17010
17011   return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17012 }
17013
17014 /* Given OPERANDS of consecutive load/store, check if we can merge
17015    them into ldp/stp by adjusting the offset.  LOAD is true if they
17016    are load instructions.  MODE is the mode of memory operands.
17017
17018    Given below consecutive stores:
17019
17020      str  w1, [xb, 0x100]
17021      str  w1, [xb, 0x104]
17022      str  w1, [xb, 0x108]
17023      str  w1, [xb, 0x10c]
17024
17025    Though the offsets are out of the range supported by stp, we can
17026    still pair them after adjusting the offset, like:
17027
17028      add  scratch, xb, 0x100
17029      stp  w1, w1, [scratch]
17030      stp  w1, w1, [scratch, 0x8]
17031
17032    The peephole patterns detecting this opportunity should guarantee
17033    the scratch register is avaliable.  */
17034
17035 bool
17036 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17037                                        scalar_mode mode)
17038 {
17039   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
17040   HOST_WIDE_INT offvals[4], msize;
17041   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
17042   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
17043
17044   if (load)
17045     {
17046       reg_1 = operands[0];
17047       mem_1 = operands[1];
17048       reg_2 = operands[2];
17049       mem_2 = operands[3];
17050       reg_3 = operands[4];
17051       mem_3 = operands[5];
17052       reg_4 = operands[6];
17053       mem_4 = operands[7];
17054       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
17055                   && REG_P (reg_3) && REG_P (reg_4));
17056
17057       /* Do not attempt to merge the loads if the loads clobber each other.  */
17058       for (int i = 0; i < 8; i += 2)
17059         for (int j = i + 2; j < 8; j += 2)
17060           if (reg_overlap_mentioned_p (operands[i], operands[j]))
17061             return false;
17062     }
17063   else
17064     {
17065       mem_1 = operands[0];
17066       reg_1 = operands[1];
17067       mem_2 = operands[2];
17068       reg_2 = operands[3];
17069       mem_3 = operands[4];
17070       reg_3 = operands[5];
17071       mem_4 = operands[6];
17072       reg_4 = operands[7];
17073     }
17074   /* Skip if memory operand is by itslef valid for ldp/stp.  */
17075   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
17076     return false;
17077
17078   /* The mems cannot be volatile.  */
17079   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
17080       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
17081     return false;
17082
17083   /* Check if the addresses are in the form of [base+offset].  */
17084   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17085   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17086     return false;
17087   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17088   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17089     return false;
17090   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17091   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17092     return false;
17093   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17094   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17095     return false;
17096
17097   /* Check if the bases are same.  */
17098   if (!rtx_equal_p (base_1, base_2)
17099       || !rtx_equal_p (base_2, base_3)
17100       || !rtx_equal_p (base_3, base_4))
17101     return false;
17102
17103   offvals[0] = INTVAL (offset_1);
17104   offvals[1] = INTVAL (offset_2);
17105   offvals[2] = INTVAL (offset_3);
17106   offvals[3] = INTVAL (offset_4);
17107   msize = GET_MODE_SIZE (mode);
17108
17109   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
17110   qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
17111
17112   if (!(offvals[1] == offvals[0] + msize
17113         && offvals[3] == offvals[2] + msize))
17114     return false;
17115
17116   /* Check that offsets are within range of each other.  The ldp/stp
17117      instructions have 7 bit immediate offsets, so use 0x80.  */
17118   if (offvals[2] - offvals[0] >= msize * 0x80)
17119     return false;
17120
17121   /* The offsets must be aligned with respect to each other.  */
17122   if (offvals[0] % msize != offvals[2] % msize)
17123     return false;
17124
17125   /* Check if the addresses are clobbered by load.  */
17126   if (load && (reg_mentioned_p (reg_1, mem_1)
17127                || reg_mentioned_p (reg_2, mem_2)
17128                || reg_mentioned_p (reg_3, mem_3)
17129                || reg_mentioned_p (reg_4, mem_4)))
17130     return false;
17131
17132   /* If we have SImode and slow unaligned ldp,
17133      check the alignment to be at least 8 byte. */
17134   if (mode == SImode
17135       && (aarch64_tune_params.extra_tuning_flags
17136           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17137       && !optimize_size
17138       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17139     return false;
17140
17141   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17142     rclass_1 = FP_REGS;
17143   else
17144     rclass_1 = GENERAL_REGS;
17145
17146   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17147     rclass_2 = FP_REGS;
17148   else
17149     rclass_2 = GENERAL_REGS;
17150
17151   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17152     rclass_3 = FP_REGS;
17153   else
17154     rclass_3 = GENERAL_REGS;
17155
17156   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17157     rclass_4 = FP_REGS;
17158   else
17159     rclass_4 = GENERAL_REGS;
17160
17161   /* Check if the registers are of same class.  */
17162   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17163     return false;
17164
17165   return true;
17166 }
17167
17168 /* Given OPERANDS of consecutive load/store, this function pairs them
17169    into LDP/STP after adjusting the offset.  It depends on the fact
17170    that the operands can be sorted so the offsets are correct for STP.
17171    MODE is the mode of memory operands.  CODE is the rtl operator
17172    which should be applied to all memory operands, it's SIGN_EXTEND,
17173    ZERO_EXTEND or UNKNOWN.  */
17174
17175 bool
17176 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17177                              scalar_mode mode, RTX_CODE code)
17178 {
17179   rtx base, offset_1, offset_3, t1, t2;
17180   rtx mem_1, mem_2, mem_3, mem_4;
17181   rtx temp_operands[8];
17182   HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17183                 stp_off_upper_limit, stp_off_lower_limit, msize;
17184
17185   /* We make changes on a copy as we may still bail out.  */
17186   for (int i = 0; i < 8; i ++)
17187     temp_operands[i] = operands[i];
17188
17189   /* Sort the operands.  */
17190   qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17191
17192   if (load)
17193     {
17194       mem_1 = temp_operands[1];
17195       mem_2 = temp_operands[3];
17196       mem_3 = temp_operands[5];
17197       mem_4 = temp_operands[7];
17198     }
17199   else
17200     {
17201       mem_1 = temp_operands[0];
17202       mem_2 = temp_operands[2];
17203       mem_3 = temp_operands[4];
17204       mem_4 = temp_operands[6];
17205       gcc_assert (code == UNKNOWN);
17206     }
17207
17208   extract_base_offset_in_addr (mem_1, &base, &offset_1);
17209   extract_base_offset_in_addr (mem_3, &base, &offset_3);
17210   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17211               && offset_3 != NULL_RTX);
17212
17213   /* Adjust offset so it can fit in LDP/STP instruction.  */
17214   msize = GET_MODE_SIZE (mode);
17215   stp_off_upper_limit = msize * (0x40 - 1);
17216   stp_off_lower_limit = - msize * 0x40;
17217
17218   off_val_1 = INTVAL (offset_1);
17219   off_val_3 = INTVAL (offset_3);
17220
17221   /* The base offset is optimally half way between the two STP/LDP offsets.  */
17222   if (msize <= 4)
17223     base_off = (off_val_1 + off_val_3) / 2;
17224   else
17225     /* However, due to issues with negative LDP/STP offset generation for
17226        larger modes, for DF, DI and vector modes. we must not use negative
17227        addresses smaller than 9 signed unadjusted bits can store.  This
17228        provides the most range in this case.  */
17229     base_off = off_val_1;
17230
17231   /* Adjust the base so that it is aligned with the addresses but still
17232      optimal.  */
17233   if (base_off % msize != off_val_1 % msize)
17234     /* Fix the offset, bearing in mind we want to make it bigger not
17235        smaller.  */
17236     base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17237   else if (msize <= 4)
17238     /* The negative range of LDP/STP is one larger than the positive range.  */
17239     base_off += msize;
17240
17241   /* Check if base offset is too big or too small.  We can attempt to resolve
17242      this issue by setting it to the maximum value and seeing if the offsets
17243      still fit.  */
17244   if (base_off >= 0x1000)
17245     {
17246       base_off = 0x1000 - 1;
17247       /* We must still make sure that the base offset is aligned with respect
17248          to the address.  But it may may not be made any bigger.  */
17249       base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17250     }
17251
17252   /* Likewise for the case where the base is too small.  */
17253   if (base_off <= -0x1000)
17254     {
17255       base_off = -0x1000 + 1;
17256       base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17257     }
17258
17259   /* Offset of the first STP/LDP.  */
17260   new_off_1 = off_val_1 - base_off;
17261
17262   /* Offset of the second STP/LDP.  */
17263   new_off_3 = off_val_3 - base_off;
17264
17265   /* The offsets must be within the range of the LDP/STP instructions.  */
17266   if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17267       || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17268     return false;
17269
17270   replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17271                                                   new_off_1), true);
17272   replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17273                                                   new_off_1 + msize), true);
17274   replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17275                                                   new_off_3), true);
17276   replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17277                                                   new_off_3 + msize), true);
17278
17279   if (!aarch64_mem_pair_operand (mem_1, mode)
17280       || !aarch64_mem_pair_operand (mem_3, mode))
17281     return false;
17282
17283   if (code == ZERO_EXTEND)
17284     {
17285       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17286       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17287       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17288       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17289     }
17290   else if (code == SIGN_EXTEND)
17291     {
17292       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17293       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17294       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17295       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17296     }
17297
17298   if (load)
17299     {
17300       operands[0] = temp_operands[0];
17301       operands[1] = mem_1;
17302       operands[2] = temp_operands[2];
17303       operands[3] = mem_2;
17304       operands[4] = temp_operands[4];
17305       operands[5] = mem_3;
17306       operands[6] = temp_operands[6];
17307       operands[7] = mem_4;
17308     }
17309   else
17310     {
17311       operands[0] = mem_1;
17312       operands[1] = temp_operands[1];
17313       operands[2] = mem_2;
17314       operands[3] = temp_operands[3];
17315       operands[4] = mem_3;
17316       operands[5] = temp_operands[5];
17317       operands[6] = mem_4;
17318       operands[7] = temp_operands[7];
17319     }
17320
17321   /* Emit adjusting instruction.  */
17322   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17323   /* Emit ldp/stp instructions.  */
17324   t1 = gen_rtx_SET (operands[0], operands[1]);
17325   t2 = gen_rtx_SET (operands[2], operands[3]);
17326   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17327   t1 = gen_rtx_SET (operands[4], operands[5]);
17328   t2 = gen_rtx_SET (operands[6], operands[7]);
17329   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17330   return true;
17331 }
17332
17333 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
17334    it isn't worth branching around empty masked ops (including masked
17335    stores).  */
17336
17337 static bool
17338 aarch64_empty_mask_is_expensive (unsigned)
17339 {
17340   return false;
17341 }
17342
17343 /* Return 1 if pseudo register should be created and used to hold
17344    GOT address for PIC code.  */
17345
17346 bool
17347 aarch64_use_pseudo_pic_reg (void)
17348 {
17349   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17350 }
17351
17352 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
17353
17354 static int
17355 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17356 {
17357   switch (XINT (x, 1))
17358     {
17359     case UNSPEC_GOTSMALLPIC:
17360     case UNSPEC_GOTSMALLPIC28K:
17361     case UNSPEC_GOTTINYPIC:
17362       return 0;
17363     default:
17364       break;
17365     }
17366
17367   return default_unspec_may_trap_p (x, flags);
17368 }
17369
17370
17371 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17372    return the log2 of that value.  Otherwise return -1.  */
17373
17374 int
17375 aarch64_fpconst_pow_of_2 (rtx x)
17376 {
17377   const REAL_VALUE_TYPE *r;
17378
17379   if (!CONST_DOUBLE_P (x))
17380     return -1;
17381
17382   r = CONST_DOUBLE_REAL_VALUE (x);
17383
17384   if (REAL_VALUE_NEGATIVE (*r)
17385       || REAL_VALUE_ISNAN (*r)
17386       || REAL_VALUE_ISINF (*r)
17387       || !real_isinteger (r, DFmode))
17388     return -1;
17389
17390   return exact_log2 (real_to_integer (r));
17391 }
17392
17393 /* If X is a vector of equal CONST_DOUBLE values and that value is
17394    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
17395
17396 int
17397 aarch64_vec_fpconst_pow_of_2 (rtx x)
17398 {
17399   int nelts;
17400   if (GET_CODE (x) != CONST_VECTOR
17401       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17402     return -1;
17403
17404   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17405     return -1;
17406
17407   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17408   if (firstval <= 0)
17409     return -1;
17410
17411   for (int i = 1; i < nelts; i++)
17412     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17413       return -1;
17414
17415   return firstval;
17416 }
17417
17418 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17419    to float.
17420
17421    __fp16 always promotes through this hook.
17422    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17423    through the generic excess precision logic rather than here.  */
17424
17425 static tree
17426 aarch64_promoted_type (const_tree t)
17427 {
17428   if (SCALAR_FLOAT_TYPE_P (t)
17429       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17430     return float_type_node;
17431
17432   return NULL_TREE;
17433 }
17434
17435 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
17436
17437 static bool
17438 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17439                            optimization_type opt_type)
17440 {
17441   switch (op)
17442     {
17443     case rsqrt_optab:
17444       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17445
17446     default:
17447       return true;
17448     }
17449 }
17450
17451 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
17452
17453 static unsigned int
17454 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17455                                         int *offset)
17456 {
17457   /* Polynomial invariant 1 == (VG / 2) - 1.  */
17458   gcc_assert (i == 1);
17459   *factor = 2;
17460   *offset = 1;
17461   return AARCH64_DWARF_VG;
17462 }
17463
17464 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17465    if MODE is HFmode, and punt to the generic implementation otherwise.  */
17466
17467 static bool
17468 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17469 {
17470   return (mode == HFmode
17471           ? true
17472           : default_libgcc_floating_mode_supported_p (mode));
17473 }
17474
17475 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17476    if MODE is HFmode, and punt to the generic implementation otherwise.  */
17477
17478 static bool
17479 aarch64_scalar_mode_supported_p (scalar_mode mode)
17480 {
17481   return (mode == HFmode
17482           ? true
17483           : default_scalar_mode_supported_p (mode));
17484 }
17485
17486 /* Set the value of FLT_EVAL_METHOD.
17487    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17488
17489     0: evaluate all operations and constants, whose semantic type has at
17490        most the range and precision of type float, to the range and
17491        precision of float; evaluate all other operations and constants to
17492        the range and precision of the semantic type;
17493
17494     N, where _FloatN is a supported interchange floating type
17495        evaluate all operations and constants, whose semantic type has at
17496        most the range and precision of _FloatN type, to the range and
17497        precision of the _FloatN type; evaluate all other operations and
17498        constants to the range and precision of the semantic type;
17499
17500    If we have the ARMv8.2-A extensions then we support _Float16 in native
17501    precision, so we should set this to 16.  Otherwise, we support the type,
17502    but want to evaluate expressions in float precision, so set this to
17503    0.  */
17504
17505 static enum flt_eval_method
17506 aarch64_excess_precision (enum excess_precision_type type)
17507 {
17508   switch (type)
17509     {
17510       case EXCESS_PRECISION_TYPE_FAST:
17511       case EXCESS_PRECISION_TYPE_STANDARD:
17512         /* We can calculate either in 16-bit range and precision or
17513            32-bit range and precision.  Make that decision based on whether
17514            we have native support for the ARMv8.2-A 16-bit floating-point
17515            instructions or not.  */
17516         return (TARGET_FP_F16INST
17517                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17518                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17519       case EXCESS_PRECISION_TYPE_IMPLICIT:
17520         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17521       default:
17522         gcc_unreachable ();
17523     }
17524   return FLT_EVAL_METHOD_UNPREDICTABLE;
17525 }
17526
17527 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
17528    scheduled for speculative execution.  Reject the long-running division
17529    and square-root instructions.  */
17530
17531 static bool
17532 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17533 {
17534   switch (get_attr_type (insn))
17535     {
17536       case TYPE_SDIV:
17537       case TYPE_UDIV:
17538       case TYPE_FDIVS:
17539       case TYPE_FDIVD:
17540       case TYPE_FSQRTS:
17541       case TYPE_FSQRTD:
17542       case TYPE_NEON_FP_SQRT_S:
17543       case TYPE_NEON_FP_SQRT_D:
17544       case TYPE_NEON_FP_SQRT_S_Q:
17545       case TYPE_NEON_FP_SQRT_D_Q:
17546       case TYPE_NEON_FP_DIV_S:
17547       case TYPE_NEON_FP_DIV_D:
17548       case TYPE_NEON_FP_DIV_S_Q:
17549       case TYPE_NEON_FP_DIV_D_Q:
17550         return false;
17551       default:
17552         return true;
17553     }
17554 }
17555
17556 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
17557
17558 static int
17559 aarch64_compute_pressure_classes (reg_class *classes)
17560 {
17561   int i = 0;
17562   classes[i++] = GENERAL_REGS;
17563   classes[i++] = FP_REGS;
17564   /* PR_REGS isn't a useful pressure class because many predicate pseudo
17565      registers need to go in PR_LO_REGS at some point during their
17566      lifetime.  Splitting it into two halves has the effect of making
17567      all predicates count against PR_LO_REGS, so that we try whenever
17568      possible to restrict the number of live predicates to 8.  This
17569      greatly reduces the amount of spilling in certain loops.  */
17570   classes[i++] = PR_LO_REGS;
17571   classes[i++] = PR_HI_REGS;
17572   return i;
17573 }
17574
17575 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
17576
17577 static bool
17578 aarch64_can_change_mode_class (machine_mode from,
17579                                machine_mode to, reg_class_t)
17580 {
17581   if (BYTES_BIG_ENDIAN)
17582     {
17583       bool from_sve_p = aarch64_sve_data_mode_p (from);
17584       bool to_sve_p = aarch64_sve_data_mode_p (to);
17585
17586       /* Don't allow changes between SVE data modes and non-SVE modes.
17587          See the comment at the head of aarch64-sve.md for details.  */
17588       if (from_sve_p != to_sve_p)
17589         return false;
17590
17591       /* Don't allow changes in element size: lane 0 of the new vector
17592          would not then be lane 0 of the old vector.  See the comment
17593          above aarch64_maybe_expand_sve_subreg_move for a more detailed
17594          description.
17595
17596          In the worst case, this forces a register to be spilled in
17597          one mode and reloaded in the other, which handles the
17598          endianness correctly.  */
17599       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17600         return false;
17601     }
17602   return true;
17603 }
17604
17605 /* Implement TARGET_EARLY_REMAT_MODES.  */
17606
17607 static void
17608 aarch64_select_early_remat_modes (sbitmap modes)
17609 {
17610   /* SVE values are not normally live across a call, so it should be
17611      worth doing early rematerialization even in VL-specific mode.  */
17612   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17613     {
17614       machine_mode mode = (machine_mode) i;
17615       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17616       if (vec_flags & VEC_ANY_SVE)
17617         bitmap_set_bit (modes, i);
17618     }
17619 }
17620
17621 /* Target-specific selftests.  */
17622
17623 #if CHECKING_P
17624
17625 namespace selftest {
17626
17627 /* Selftest for the RTL loader.
17628    Verify that the RTL loader copes with a dump from
17629    print_rtx_function.  This is essentially just a test that class
17630    function_reader can handle a real dump, but it also verifies
17631    that lookup_reg_by_dump_name correctly handles hard regs.
17632    The presence of hard reg names in the dump means that the test is
17633    target-specific, hence it is in this file.  */
17634
17635 static void
17636 aarch64_test_loading_full_dump ()
17637 {
17638   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17639
17640   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17641
17642   rtx_insn *insn_1 = get_insn_by_uid (1);
17643   ASSERT_EQ (NOTE, GET_CODE (insn_1));
17644
17645   rtx_insn *insn_15 = get_insn_by_uid (15);
17646   ASSERT_EQ (INSN, GET_CODE (insn_15));
17647   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17648
17649   /* Verify crtl->return_rtx.  */
17650   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17651   ASSERT_EQ (0, REGNO (crtl->return_rtx));
17652   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17653 }
17654
17655 /* Run all target-specific selftests.  */
17656
17657 static void
17658 aarch64_run_selftests (void)
17659 {
17660   aarch64_test_loading_full_dump ();
17661 }
17662
17663 } // namespace selftest
17664
17665 #endif /* #if CHECKING_P */
17666
17667 #undef TARGET_ADDRESS_COST
17668 #define TARGET_ADDRESS_COST aarch64_address_cost
17669
17670 /* This hook will determines whether unnamed bitfields affect the alignment
17671    of the containing structure.  The hook returns true if the structure
17672    should inherit the alignment requirements of an unnamed bitfield's
17673    type.  */
17674 #undef TARGET_ALIGN_ANON_BITFIELD
17675 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17676
17677 #undef TARGET_ASM_ALIGNED_DI_OP
17678 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17679
17680 #undef TARGET_ASM_ALIGNED_HI_OP
17681 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17682
17683 #undef TARGET_ASM_ALIGNED_SI_OP
17684 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17685
17686 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17687 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17688   hook_bool_const_tree_hwi_hwi_const_tree_true
17689
17690 #undef TARGET_ASM_FILE_START
17691 #define TARGET_ASM_FILE_START aarch64_start_file
17692
17693 #undef TARGET_ASM_OUTPUT_MI_THUNK
17694 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17695
17696 #undef TARGET_ASM_SELECT_RTX_SECTION
17697 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17698
17699 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17700 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17701
17702 #undef TARGET_BUILD_BUILTIN_VA_LIST
17703 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17704
17705 #undef TARGET_CALLEE_COPIES
17706 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17707
17708 #undef TARGET_CAN_ELIMINATE
17709 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17710
17711 #undef TARGET_CAN_INLINE_P
17712 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17713
17714 #undef TARGET_CANNOT_FORCE_CONST_MEM
17715 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17716
17717 #undef TARGET_CASE_VALUES_THRESHOLD
17718 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17719
17720 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17721 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17722
17723 /* Only the least significant bit is used for initialization guard
17724    variables.  */
17725 #undef TARGET_CXX_GUARD_MASK_BIT
17726 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17727
17728 #undef TARGET_C_MODE_FOR_SUFFIX
17729 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17730
17731 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17732 #undef  TARGET_DEFAULT_TARGET_FLAGS
17733 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17734 #endif
17735
17736 #undef TARGET_CLASS_MAX_NREGS
17737 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17738
17739 #undef TARGET_BUILTIN_DECL
17740 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17741
17742 #undef TARGET_BUILTIN_RECIPROCAL
17743 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17744
17745 #undef TARGET_C_EXCESS_PRECISION
17746 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17747
17748 #undef  TARGET_EXPAND_BUILTIN
17749 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17750
17751 #undef TARGET_EXPAND_BUILTIN_VA_START
17752 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17753
17754 #undef TARGET_FOLD_BUILTIN
17755 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17756
17757 #undef TARGET_FUNCTION_ARG
17758 #define TARGET_FUNCTION_ARG aarch64_function_arg
17759
17760 #undef TARGET_FUNCTION_ARG_ADVANCE
17761 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17762
17763 #undef TARGET_FUNCTION_ARG_BOUNDARY
17764 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17765
17766 #undef TARGET_FUNCTION_ARG_PADDING
17767 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17768
17769 #undef TARGET_GET_RAW_RESULT_MODE
17770 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17771 #undef TARGET_GET_RAW_ARG_MODE
17772 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17773
17774 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17775 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17776
17777 #undef TARGET_FUNCTION_VALUE
17778 #define TARGET_FUNCTION_VALUE aarch64_function_value
17779
17780 #undef TARGET_FUNCTION_VALUE_REGNO_P
17781 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17782
17783 #undef TARGET_GIMPLE_FOLD_BUILTIN
17784 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17785
17786 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17787 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17788
17789 #undef  TARGET_INIT_BUILTINS
17790 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
17791
17792 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17793 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17794   aarch64_ira_change_pseudo_allocno_class
17795
17796 #undef TARGET_LEGITIMATE_ADDRESS_P
17797 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17798
17799 #undef TARGET_LEGITIMATE_CONSTANT_P
17800 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17801
17802 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17803 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17804   aarch64_legitimize_address_displacement
17805
17806 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17807 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17808
17809 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17810 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17811 aarch64_libgcc_floating_mode_supported_p
17812
17813 #undef TARGET_MANGLE_TYPE
17814 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17815
17816 #undef TARGET_MEMORY_MOVE_COST
17817 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17818
17819 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17820 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17821
17822 #undef TARGET_MUST_PASS_IN_STACK
17823 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17824
17825 /* This target hook should return true if accesses to volatile bitfields
17826    should use the narrowest mode possible.  It should return false if these
17827    accesses should use the bitfield container type.  */
17828 #undef TARGET_NARROW_VOLATILE_BITFIELD
17829 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17830
17831 #undef  TARGET_OPTION_OVERRIDE
17832 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17833
17834 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17835 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17836   aarch64_override_options_after_change
17837
17838 #undef TARGET_OPTION_SAVE
17839 #define TARGET_OPTION_SAVE aarch64_option_save
17840
17841 #undef TARGET_OPTION_RESTORE
17842 #define TARGET_OPTION_RESTORE aarch64_option_restore
17843
17844 #undef TARGET_OPTION_PRINT
17845 #define TARGET_OPTION_PRINT aarch64_option_print
17846
17847 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17848 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17849
17850 #undef TARGET_SET_CURRENT_FUNCTION
17851 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17852
17853 #undef TARGET_PASS_BY_REFERENCE
17854 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17855
17856 #undef TARGET_PREFERRED_RELOAD_CLASS
17857 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17858
17859 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17860 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17861
17862 #undef TARGET_PROMOTED_TYPE
17863 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17864
17865 #undef TARGET_SECONDARY_RELOAD
17866 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17867
17868 #undef TARGET_SHIFT_TRUNCATION_MASK
17869 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17870
17871 #undef TARGET_SETUP_INCOMING_VARARGS
17872 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17873
17874 #undef TARGET_STRUCT_VALUE_RTX
17875 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
17876
17877 #undef TARGET_REGISTER_MOVE_COST
17878 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17879
17880 #undef TARGET_RETURN_IN_MEMORY
17881 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17882
17883 #undef TARGET_RETURN_IN_MSB
17884 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17885
17886 #undef TARGET_RTX_COSTS
17887 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17888
17889 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17890 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17891
17892 #undef TARGET_SCHED_ISSUE_RATE
17893 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17894
17895 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17896 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17897   aarch64_sched_first_cycle_multipass_dfa_lookahead
17898
17899 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17900 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17901   aarch64_first_cycle_multipass_dfa_lookahead_guard
17902
17903 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17904 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17905   aarch64_get_separate_components
17906
17907 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17908 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17909   aarch64_components_for_bb
17910
17911 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17912 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17913   aarch64_disqualify_components
17914
17915 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17916 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17917   aarch64_emit_prologue_components
17918
17919 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17920 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17921   aarch64_emit_epilogue_components
17922
17923 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17924 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17925   aarch64_set_handled_components
17926
17927 #undef TARGET_TRAMPOLINE_INIT
17928 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17929
17930 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17931 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17932
17933 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17934 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17935
17936 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17937 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17938   aarch64_builtin_support_vector_misalignment
17939
17940 #undef TARGET_ARRAY_MODE
17941 #define TARGET_ARRAY_MODE aarch64_array_mode
17942
17943 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17944 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17945
17946 #undef TARGET_VECTORIZE_ADD_STMT_COST
17947 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17948
17949 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17950 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17951   aarch64_builtin_vectorization_cost
17952
17953 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17954 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17955
17956 #undef TARGET_VECTORIZE_BUILTINS
17957 #define TARGET_VECTORIZE_BUILTINS
17958
17959 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17960 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17961   aarch64_builtin_vectorized_function
17962
17963 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17964 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17965   aarch64_autovectorize_vector_sizes
17966
17967 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17968 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17969   aarch64_atomic_assign_expand_fenv
17970
17971 /* Section anchor support.  */
17972
17973 #undef TARGET_MIN_ANCHOR_OFFSET
17974 #define TARGET_MIN_ANCHOR_OFFSET -256
17975
17976 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17977    byte offset; we can do much more for larger data types, but have no way
17978    to determine the size of the access.  We assume accesses are aligned.  */
17979 #undef TARGET_MAX_ANCHOR_OFFSET
17980 #define TARGET_MAX_ANCHOR_OFFSET 4095
17981
17982 #undef TARGET_VECTOR_ALIGNMENT
17983 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17984
17985 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17986 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17987   aarch64_vectorize_preferred_vector_alignment
17988 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17989 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17990   aarch64_simd_vector_alignment_reachable
17991
17992 /* vec_perm support.  */
17993
17994 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17995 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17996   aarch64_vectorize_vec_perm_const
17997
17998 #undef TARGET_VECTORIZE_GET_MASK_MODE
17999 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18000 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18001 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18002   aarch64_empty_mask_is_expensive
18003 #undef TARGET_PREFERRED_ELSE_VALUE
18004 #define TARGET_PREFERRED_ELSE_VALUE \
18005   aarch64_preferred_else_value
18006
18007 #undef TARGET_INIT_LIBFUNCS
18008 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18009
18010 #undef TARGET_FIXED_CONDITION_CODE_REGS
18011 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18012
18013 #undef TARGET_FLAGS_REGNUM
18014 #define TARGET_FLAGS_REGNUM CC_REGNUM
18015
18016 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18017 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18018
18019 #undef TARGET_ASAN_SHADOW_OFFSET
18020 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18021
18022 #undef TARGET_LEGITIMIZE_ADDRESS
18023 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18024
18025 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18026 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18027
18028 #undef TARGET_CAN_USE_DOLOOP_P
18029 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18030
18031 #undef TARGET_SCHED_ADJUST_PRIORITY
18032 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18033
18034 #undef TARGET_SCHED_MACRO_FUSION_P
18035 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18036
18037 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18038 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18039
18040 #undef TARGET_SCHED_FUSION_PRIORITY
18041 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18042
18043 #undef TARGET_UNSPEC_MAY_TRAP_P
18044 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18045
18046 #undef TARGET_USE_PSEUDO_PIC_REG
18047 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18048
18049 #undef TARGET_PRINT_OPERAND
18050 #define TARGET_PRINT_OPERAND aarch64_print_operand
18051
18052 #undef TARGET_PRINT_OPERAND_ADDRESS
18053 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18054
18055 #undef TARGET_OPTAB_SUPPORTED_P
18056 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18057
18058 #undef TARGET_OMIT_STRUCT_RETURN_REG
18059 #define TARGET_OMIT_STRUCT_RETURN_REG true
18060
18061 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18062 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18063   aarch64_dwarf_poly_indeterminate_value
18064
18065 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
18066 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18067 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18068
18069 #undef TARGET_HARD_REGNO_NREGS
18070 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18071 #undef TARGET_HARD_REGNO_MODE_OK
18072 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18073
18074 #undef TARGET_MODES_TIEABLE_P
18075 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18076
18077 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18078 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18079   aarch64_hard_regno_call_part_clobbered
18080
18081 #undef TARGET_CONSTANT_ALIGNMENT
18082 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18083
18084 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18085 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18086
18087 #undef TARGET_CAN_CHANGE_MODE_CLASS
18088 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18089
18090 #undef TARGET_SELECT_EARLY_REMAT_MODES
18091 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18092
18093 #if CHECKING_P
18094 #undef TARGET_RUN_TARGET_SELFTESTS
18095 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18096 #endif /* #if CHECKING_P */
18097
18098 struct gcc_target targetm = TARGET_INITIALIZER;
18099
18100 #include "gt-aarch64.h"