Fix SVE fallout from r260951
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 98ef45714fd62678a39a10f705f89d322c083c5b
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
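/* As an informal illustration (not an exhaustive list), typical AArch64
   address forms map onto these classifications as follows, using x0 as the
   base and w1/x1 as the index:

     [x0, #16]                 ADDRESS_REG_IMM
     [x0, #16]!   [x0], #16    ADDRESS_REG_WB
     [x0, x1, lsl #3]          ADDRESS_REG_REG
     [x0, w1, uxtw #2]         ADDRESS_REG_UXTW
     [x0, w1, sxtw #2]         ADDRESS_REG_SXTW
     [x0, #:lo12:sym]          ADDRESS_LO_SUM
     literal-pool reference    ADDRESS_SYMBOLIC  */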
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
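/* Informal usage sketch (these calls are hypothetical, not taken from the
   code below): an Advanced SIMD immediate such as "mvni v0.4s, #0x55, lsl #8"
   could be described as

     simd_immediate_info (SImode, 0x55, simd_immediate_info::MVN,
			  simd_immediate_info::LSL, 8);

   while an SVE series constant { 1, 3, 5, ... } of 32-bit elements could be
   described with the series constructor as

     simd_immediate_info (SImode, const1_rtx, GEN_INT (2));  */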
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Global flag for whether frame pointer is enabled. */
224 bool aarch64_use_frame_pointer;
226 /* Support for command line parsing of boolean flags in the tuning
227 structures. */
228 struct aarch64_flag_desc
230 const char* name;
231 unsigned int flag;
234 #define AARCH64_FUSION_PAIR(name, internal_name) \
235 { name, AARCH64_FUSE_##internal_name },
236 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 { "none", AARCH64_FUSE_NOTHING },
239 #include "aarch64-fusion-pairs.def"
240 { "all", AARCH64_FUSE_ALL },
241 { NULL, AARCH64_FUSE_NOTHING }
244 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
245 { name, AARCH64_EXTRA_TUNE_##internal_name },
246 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 { "none", AARCH64_EXTRA_TUNE_NONE },
249 #include "aarch64-tuning-flags.def"
250 { "all", AARCH64_EXTRA_TUNE_ALL },
251 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 /* Tuning parameters. */
256 static const struct cpu_addrcost_table generic_addrcost_table =
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
264 0, /* pre_modify */
265 0, /* post_modify */
266 0, /* register_offset */
267 0, /* register_sextend */
268 0, /* register_zextend */
269 0 /* imm_offset */
272 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 0, /* hi */
276 0, /* si */
277 0, /* di */
278 2, /* ti */
280 0, /* pre_modify */
281 0, /* post_modify */
282 1, /* register_offset */
283 1, /* register_sextend */
284 2, /* register_zextend */
285 0, /* imm_offset */
288 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
296 1, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
304 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
312 0, /* pre_modify */
313 0, /* post_modify */
314 2, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 0, /* imm_offset */
320 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
323 1, /* hi */
324 1, /* si */
325 1, /* di */
326 2, /* ti */
328 1, /* pre_modify */
329 1, /* post_modify */
330 3, /* register_offset */
331 4, /* register_sextend */
332 3, /* register_zextend */
333 2, /* imm_offset */
336 static const struct cpu_regmove_cost generic_regmove_cost =
338 1, /* GP2GP */
339 /* Avoid the use of slow int<->fp moves for spilling by setting
340 their cost higher than memmov_cost. */
341 5, /* GP2FP */
342 5, /* FP2GP */
343 2 /* FP2FP */
346 static const struct cpu_regmove_cost cortexa57_regmove_cost =
348 1, /* GP2GP */
349 /* Avoid the use of slow int<->fp moves for spilling by setting
350 their cost higher than memmov_cost. */
351 5, /* GP2FP */
352 5, /* FP2GP */
353 2 /* FP2FP */
356 static const struct cpu_regmove_cost cortexa53_regmove_cost =
358 1, /* GP2GP */
359 /* Avoid the use of slow int<->fp moves for spilling by setting
360 their cost higher than memmov_cost. */
361 5, /* GP2FP */
362 5, /* FP2GP */
363 2 /* FP2FP */
366 static const struct cpu_regmove_cost exynosm1_regmove_cost =
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost (actual, 4 and 9). */
371 9, /* GP2FP */
372 9, /* FP2GP */
373 1 /* FP2FP */
376 static const struct cpu_regmove_cost thunderx_regmove_cost =
378 2, /* GP2GP */
379 2, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost xgene1_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 2 /* FP2FP */
394 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
396 2, /* GP2GP */
397 /* Avoid the use of int<->fp moves for spilling. */
398 6, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
405 1, /* GP2GP */
406 /* Avoid the use of int<->fp moves for spilling. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 4 /* FP2FP */
412 /* Generic costs for vector insn classes. */
413 static const struct cpu_vector_cost generic_vector_cost =
415 1, /* scalar_int_stmt_cost */
416 1, /* scalar_fp_stmt_cost */
417 1, /* scalar_load_cost */
418 1, /* scalar_store_cost */
419 1, /* vec_int_stmt_cost */
420 1, /* vec_fp_stmt_cost */
421 2, /* vec_permute_cost */
422 1, /* vec_to_scalar_cost */
423 1, /* scalar_to_vec_cost */
424 1, /* vec_align_load_cost */
425 1, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 3, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
432 /* ThunderX costs for vector insn classes. */
433 static const struct cpu_vector_cost thunderx_vector_cost =
435 1, /* scalar_int_stmt_cost */
436 1, /* scalar_fp_stmt_cost */
437 3, /* scalar_load_cost */
438 1, /* scalar_store_cost */
439 4, /* vec_int_stmt_cost */
440 1, /* vec_fp_stmt_cost */
441 4, /* vec_permute_cost */
442 2, /* vec_to_scalar_cost */
443 2, /* scalar_to_vec_cost */
444 3, /* vec_align_load_cost */
445 5, /* vec_unalign_load_cost */
446 5, /* vec_unalign_store_cost */
447 1, /* vec_store_cost */
448 3, /* cond_taken_branch_cost */
449 3 /* cond_not_taken_branch_cost */
452 /* Generic costs for vector insn classes. */
453 static const struct cpu_vector_cost cortexa57_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 4, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 2, /* vec_int_stmt_cost */
460 2, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 8, /* vec_to_scalar_cost */
463 8, /* scalar_to_vec_cost */
464 4, /* vec_align_load_cost */
465 4, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 static const struct cpu_vector_cost exynosm1_vector_cost =
474 1, /* scalar_int_stmt_cost */
475 1, /* scalar_fp_stmt_cost */
476 5, /* scalar_load_cost */
477 1, /* scalar_store_cost */
478 3, /* vec_int_stmt_cost */
479 3, /* vec_fp_stmt_cost */
480 3, /* vec_permute_cost */
481 3, /* vec_to_scalar_cost */
482 3, /* scalar_to_vec_cost */
483 5, /* vec_align_load_cost */
484 5, /* vec_unalign_load_cost */
485 1, /* vec_unalign_store_cost */
486 1, /* vec_store_cost */
487 1, /* cond_taken_branch_cost */
488 1 /* cond_not_taken_branch_cost */
491 /* Generic costs for vector insn classes. */
492 static const struct cpu_vector_cost xgene1_vector_cost =
494 1, /* scalar_int_stmt_cost */
495 1, /* scalar_fp_stmt_cost */
496 5, /* scalar_load_cost */
497 1, /* scalar_store_cost */
498 2, /* vec_int_stmt_cost */
499 2, /* vec_fp_stmt_cost */
500 2, /* vec_permute_cost */
501 4, /* vec_to_scalar_cost */
502 4, /* scalar_to_vec_cost */
503 10, /* vec_align_load_cost */
504 10, /* vec_unalign_load_cost */
505 2, /* vec_unalign_store_cost */
506 2, /* vec_store_cost */
507 2, /* cond_taken_branch_cost */
508 1 /* cond_not_taken_branch_cost */
511 /* Costs for vector insn classes for Vulcan. */
512 static const struct cpu_vector_cost thunderx2t99_vector_cost =
514 1, /* scalar_int_stmt_cost */
515 6, /* scalar_fp_stmt_cost */
516 4, /* scalar_load_cost */
517 1, /* scalar_store_cost */
518 5, /* vec_int_stmt_cost */
519 6, /* vec_fp_stmt_cost */
520 3, /* vec_permute_cost */
521 6, /* vec_to_scalar_cost */
522 5, /* scalar_to_vec_cost */
523 8, /* vec_align_load_cost */
524 8, /* vec_unalign_load_cost */
525 4, /* vec_unalign_store_cost */
526 4, /* vec_store_cost */
527 2, /* cond_taken_branch_cost */
528 1 /* cond_not_taken_branch_cost */
531 /* Generic costs for branch instructions. */
532 static const struct cpu_branch_cost generic_branch_cost =
534 1, /* Predictable. */
535 3 /* Unpredictable. */
538 /* Generic approximation modes. */
539 static const cpu_approx_modes generic_approx_modes =
541 AARCH64_APPROX_NONE, /* division */
542 AARCH64_APPROX_NONE, /* sqrt */
543 AARCH64_APPROX_NONE /* recip_sqrt */
546 /* Approximation modes for Exynos M1. */
547 static const cpu_approx_modes exynosm1_approx_modes =
549 AARCH64_APPROX_NONE, /* division */
550 AARCH64_APPROX_ALL, /* sqrt */
551 AARCH64_APPROX_ALL /* recip_sqrt */
554 /* Approximation modes for X-Gene 1. */
555 static const cpu_approx_modes xgene1_approx_modes =
557 AARCH64_APPROX_NONE, /* division */
558 AARCH64_APPROX_NONE, /* sqrt */
559 AARCH64_APPROX_ALL /* recip_sqrt */
562 /* Generic prefetch settings (which disable prefetch). */
563 static const cpu_prefetch_tune generic_prefetch_tune =
565 0, /* num_slots */
566 -1, /* l1_cache_size */
567 -1, /* l1_cache_line_size */
568 -1, /* l2_cache_size */
569 true, /* prefetch_dynamic_strides */
570 -1, /* minimum_stride */
571 -1 /* default_opt_level */
574 static const cpu_prefetch_tune exynosm1_prefetch_tune =
576 0, /* num_slots */
577 -1, /* l1_cache_size */
578 64, /* l1_cache_line_size */
579 -1, /* l2_cache_size */
580 true, /* prefetch_dynamic_strides */
581 -1, /* minimum_stride */
582 -1 /* default_opt_level */
585 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
587 4, /* num_slots */
588 32, /* l1_cache_size */
589 64, /* l1_cache_line_size */
590 512, /* l2_cache_size */
591 false, /* prefetch_dynamic_strides */
592 2048, /* minimum_stride */
593 3 /* default_opt_level */
596 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
598 8, /* num_slots */
599 32, /* l1_cache_size */
600 128, /* l1_cache_line_size */
601 16*1024, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 3 /* default_opt_level */
607 static const cpu_prefetch_tune thunderx_prefetch_tune =
609 8, /* num_slots */
610 32, /* l1_cache_size */
611 128, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
618 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
620 8, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 256, /* l2_cache_size */
624 true, /* prefetch_dynamic_strides */
625 -1, /* minimum_stride */
626 -1 /* default_opt_level */
629 static const struct tune_params generic_tunings =
631 &cortexa57_extra_costs,
632 &generic_addrcost_table,
633 &generic_regmove_cost,
634 &generic_vector_cost,
635 &generic_branch_cost,
636 &generic_approx_modes,
637 4, /* memmov_cost */
638 2, /* issue_rate */
639 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
640 8, /* function_align. */
641 4, /* jump_align. */
642 8, /* loop_align. */
643 2, /* int_reassoc_width. */
644 4, /* fp_reassoc_width. */
645 1, /* vec_reassoc_width. */
646 2, /* min_div_recip_mul_sf. */
647 2, /* min_div_recip_mul_df. */
648 0, /* max_case_values. */
649 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
650 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
651 &generic_prefetch_tune
654 static const struct tune_params cortexa35_tunings =
656 &cortexa53_extra_costs,
657 &generic_addrcost_table,
658 &cortexa53_regmove_cost,
659 &generic_vector_cost,
660 &generic_branch_cost,
661 &generic_approx_modes,
662 4, /* memmov_cost */
663 1, /* issue_rate */
664 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
665 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
666 16, /* function_align. */
667 4, /* jump_align. */
668 8, /* loop_align. */
669 2, /* int_reassoc_width. */
670 4, /* fp_reassoc_width. */
671 1, /* vec_reassoc_width. */
672 2, /* min_div_recip_mul_sf. */
673 2, /* min_div_recip_mul_df. */
674 0, /* max_case_values. */
675 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
676 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
677 &generic_prefetch_tune
680 static const struct tune_params cortexa53_tunings =
682 &cortexa53_extra_costs,
683 &generic_addrcost_table,
684 &cortexa53_regmove_cost,
685 &generic_vector_cost,
686 &generic_branch_cost,
687 &generic_approx_modes,
688 4, /* memmov_cost */
689 2, /* issue_rate */
690 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
691 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
692 16, /* function_align. */
693 4, /* jump_align. */
694 8, /* loop_align. */
695 2, /* int_reassoc_width. */
696 4, /* fp_reassoc_width. */
697 1, /* vec_reassoc_width. */
698 2, /* min_div_recip_mul_sf. */
699 2, /* min_div_recip_mul_df. */
700 0, /* max_case_values. */
701 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
702 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
703 &generic_prefetch_tune
706 static const struct tune_params cortexa57_tunings =
708 &cortexa57_extra_costs,
709 &generic_addrcost_table,
710 &cortexa57_regmove_cost,
711 &cortexa57_vector_cost,
712 &generic_branch_cost,
713 &generic_approx_modes,
714 4, /* memmov_cost */
715 3, /* issue_rate */
716 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
717 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
718 16, /* function_align. */
719 4, /* jump_align. */
720 8, /* loop_align. */
721 2, /* int_reassoc_width. */
722 4, /* fp_reassoc_width. */
723 1, /* vec_reassoc_width. */
724 2, /* min_div_recip_mul_sf. */
725 2, /* min_div_recip_mul_df. */
726 0, /* max_case_values. */
727 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
728 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
729 &generic_prefetch_tune
732 static const struct tune_params cortexa72_tunings =
734 &cortexa57_extra_costs,
735 &generic_addrcost_table,
736 &cortexa57_regmove_cost,
737 &cortexa57_vector_cost,
738 &generic_branch_cost,
739 &generic_approx_modes,
740 4, /* memmov_cost */
741 3, /* issue_rate */
742 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
743 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
744 16, /* function_align. */
745 4, /* jump_align. */
746 8, /* loop_align. */
747 2, /* int_reassoc_width. */
748 4, /* fp_reassoc_width. */
749 1, /* vec_reassoc_width. */
750 2, /* min_div_recip_mul_sf. */
751 2, /* min_div_recip_mul_df. */
752 0, /* max_case_values. */
753 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
754 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
755 &generic_prefetch_tune
758 static const struct tune_params cortexa73_tunings =
760 &cortexa57_extra_costs,
761 &generic_addrcost_table,
762 &cortexa57_regmove_cost,
763 &cortexa57_vector_cost,
764 &generic_branch_cost,
765 &generic_approx_modes,
766 4, /* memmov_cost. */
767 2, /* issue_rate. */
768 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
769 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
770 16, /* function_align. */
771 4, /* jump_align. */
772 8, /* loop_align. */
773 2, /* int_reassoc_width. */
774 4, /* fp_reassoc_width. */
775 1, /* vec_reassoc_width. */
776 2, /* min_div_recip_mul_sf. */
777 2, /* min_div_recip_mul_df. */
778 0, /* max_case_values. */
779 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
780 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
781 &generic_prefetch_tune
786 static const struct tune_params exynosm1_tunings =
788 &exynosm1_extra_costs,
789 &exynosm1_addrcost_table,
790 &exynosm1_regmove_cost,
791 &exynosm1_vector_cost,
792 &generic_branch_cost,
793 &exynosm1_approx_modes,
794 4, /* memmov_cost */
795 3, /* issue_rate */
796 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
797 4, /* function_align. */
798 4, /* jump_align. */
799 4, /* loop_align. */
800 2, /* int_reassoc_width. */
801 4, /* fp_reassoc_width. */
802 1, /* vec_reassoc_width. */
803 2, /* min_div_recip_mul_sf. */
804 2, /* min_div_recip_mul_df. */
805 48, /* max_case_values. */
806 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
808 &exynosm1_prefetch_tune
811 static const struct tune_params thunderxt88_tunings =
813 &thunderx_extra_costs,
814 &generic_addrcost_table,
815 &thunderx_regmove_cost,
816 &thunderx_vector_cost,
817 &generic_branch_cost,
818 &generic_approx_modes,
819 6, /* memmov_cost */
820 2, /* issue_rate */
821 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
822 8, /* function_align. */
823 8, /* jump_align. */
824 8, /* loop_align. */
825 2, /* int_reassoc_width. */
826 4, /* fp_reassoc_width. */
827 1, /* vec_reassoc_width. */
828 2, /* min_div_recip_mul_sf. */
829 2, /* min_div_recip_mul_df. */
830 0, /* max_case_values. */
831 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
832 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
833 &thunderxt88_prefetch_tune
836 static const struct tune_params thunderx_tunings =
838 &thunderx_extra_costs,
839 &generic_addrcost_table,
840 &thunderx_regmove_cost,
841 &thunderx_vector_cost,
842 &generic_branch_cost,
843 &generic_approx_modes,
844 6, /* memmov_cost */
845 2, /* issue_rate */
846 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
847 8, /* function_align. */
848 8, /* jump_align. */
849 8, /* loop_align. */
850 2, /* int_reassoc_width. */
851 4, /* fp_reassoc_width. */
852 1, /* vec_reassoc_width. */
853 2, /* min_div_recip_mul_sf. */
854 2, /* min_div_recip_mul_df. */
855 0, /* max_case_values. */
856 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
857 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
858 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
859 &thunderx_prefetch_tune
862 static const struct tune_params xgene1_tunings =
864 &xgene1_extra_costs,
865 &xgene1_addrcost_table,
866 &xgene1_regmove_cost,
867 &xgene1_vector_cost,
868 &generic_branch_cost,
869 &xgene1_approx_modes,
870 6, /* memmov_cost */
871 4, /* issue_rate */
872 AARCH64_FUSE_NOTHING, /* fusible_ops */
873 16, /* function_align. */
874 8, /* jump_align. */
875 16, /* loop_align. */
876 2, /* int_reassoc_width. */
877 4, /* fp_reassoc_width. */
878 1, /* vec_reassoc_width. */
879 2, /* min_div_recip_mul_sf. */
880 2, /* min_div_recip_mul_df. */
881 0, /* max_case_values. */
882 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
883 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
884 &generic_prefetch_tune
887 static const struct tune_params qdf24xx_tunings =
889 &qdf24xx_extra_costs,
890 &qdf24xx_addrcost_table,
891 &qdf24xx_regmove_cost,
892 &generic_vector_cost,
893 &generic_branch_cost,
894 &generic_approx_modes,
895 4, /* memmov_cost */
896 4, /* issue_rate */
897 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
898 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
899 16, /* function_align. */
900 8, /* jump_align. */
901 16, /* loop_align. */
902 2, /* int_reassoc_width. */
903 4, /* fp_reassoc_width. */
904 1, /* vec_reassoc_width. */
905 2, /* min_div_recip_mul_sf. */
906 2, /* min_div_recip_mul_df. */
907 0, /* max_case_values. */
908 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
909 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
910 &qdf24xx_prefetch_tune
913 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
914 for now. */
915 static const struct tune_params saphira_tunings =
917 &generic_extra_costs,
918 &generic_addrcost_table,
919 &generic_regmove_cost,
920 &generic_vector_cost,
921 &generic_branch_cost,
922 &generic_approx_modes,
923 4, /* memmov_cost */
924 4, /* issue_rate */
925 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
926 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
927 16, /* function_align. */
928 8, /* jump_align. */
929 16, /* loop_align. */
930 2, /* int_reassoc_width. */
931 4, /* fp_reassoc_width. */
932 1, /* vec_reassoc_width. */
933 2, /* min_div_recip_mul_sf. */
934 2, /* min_div_recip_mul_df. */
935 0, /* max_case_values. */
936 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
937 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
938 &generic_prefetch_tune
941 static const struct tune_params thunderx2t99_tunings =
943 &thunderx2t99_extra_costs,
944 &thunderx2t99_addrcost_table,
945 &thunderx2t99_regmove_cost,
946 &thunderx2t99_vector_cost,
947 &generic_branch_cost,
948 &generic_approx_modes,
949 4, /* memmov_cost. */
950 4, /* issue_rate. */
951 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
952 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
953 16, /* function_align. */
954 8, /* jump_align. */
955 16, /* loop_align. */
956 3, /* int_reassoc_width. */
957 2, /* fp_reassoc_width. */
958 2, /* vec_reassoc_width. */
959 2, /* min_div_recip_mul_sf. */
960 2, /* min_div_recip_mul_df. */
961 0, /* max_case_values. */
962 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
963 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
964 &thunderx2t99_prefetch_tune
967 /* Support for fine-grained override of the tuning structures. */
968 struct aarch64_tuning_override_function
970 const char* name;
971 void (*parse_override)(const char*, struct tune_params*);
974 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
975 static void aarch64_parse_tune_string (const char*, struct tune_params*);
977 static const struct aarch64_tuning_override_function
978 aarch64_tuning_override_functions[] =
980 { "fuse", aarch64_parse_fuse_string },
981 { "tune", aarch64_parse_tune_string },
982 { NULL, NULL }
985 /* A processor implementing AArch64. */
986 struct processor
988 const char *const name;
989 enum aarch64_processor ident;
990 enum aarch64_processor sched_core;
991 enum aarch64_arch arch;
992 unsigned architecture_version;
993 const unsigned long flags;
994 const struct tune_params *const tune;
997 /* Architectures implementing AArch64. */
998 static const struct processor all_architectures[] =
1000 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1001 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1002 #include "aarch64-arches.def"
1003 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1006 /* Processor cores implementing AArch64. */
1007 static const struct processor all_cores[] =
1009 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1010 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1011 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1012 FLAGS, &COSTS##_tunings},
1013 #include "aarch64-cores.def"
1014 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1015 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1016 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1020 /* Target specification. These are populated by the -march, -mtune, -mcpu
1021 handling code or by target attributes. */
1022 static const struct processor *selected_arch;
1023 static const struct processor *selected_cpu;
1024 static const struct processor *selected_tune;
1026 /* The current tuning set. */
1027 struct tune_params aarch64_tune_params = generic_tunings;
1029 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1031 /* An ISA extension in the co-processor and main instruction set space. */
1032 struct aarch64_option_extension
1034 const char *const name;
1035 const unsigned long flags_on;
1036 const unsigned long flags_off;
1039 typedef enum aarch64_cond_code
1041 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1042 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1043 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1045 aarch64_cc;
1047 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1049 /* The condition codes of the processor, and the inverse function. */
1050 static const char * const aarch64_condition_codes[] =
1052 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1053 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1056 /* Generate code to enable conditional branches in functions over 1 MiB. */
1057 const char *
1058 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1059 const char * branch_format)
1061 rtx_code_label * tmp_label = gen_label_rtx ();
1062 char label_buf[256];
1063 char buffer[128];
1064 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1065 CODE_LABEL_NUMBER (tmp_label));
1066 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1067 rtx dest_label = operands[pos_label];
1068 operands[pos_label] = tmp_label;
1070 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1071 output_asm_insn (buffer, operands);
1073 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1074 operands[pos_label] = dest_label;
1075 output_asm_insn (buffer, operands);
1076 return "";
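/* An illustrative sketch of the output (label names are invented): if a
   conditional branch to the label operand would exceed the +/-1 MiB
   conditional-branch range, a caller passes the inverted condition in
   BRANCH_FORMAT so that this function emits something like

       b.ne    .Lbcond7        // inverted test skips the far jump
       b       .Ltarget        // unconditional branch, +/-128 MiB range
     .Lbcond7:

   where .Lbcond7 is the internal label built from DEST and .Ltarget is the
   original destination in OPERANDS[POS_LABEL].  */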
1079 void
1080 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1082 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1083 if (TARGET_GENERAL_REGS_ONLY)
1084 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1085 else
1086 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1089 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1090 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1091 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1092 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1093 and GENERAL_REGS is lower than the memory cost (in this case the best class
1094 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1095 cost results in bad allocations with many redundant int<->FP moves which
1096 are expensive on various cores.
1097 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1098 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1099 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1100 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1101 The result of this is that it is no longer inefficient to have a higher
1102 memory move cost than the register move cost.
1105 static reg_class_t
1106 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1107 reg_class_t best_class)
1109 machine_mode mode;
1111 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1112 || !reg_class_subset_p (FP_REGS, allocno_class))
1113 return allocno_class;
1115 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1116 || !reg_class_subset_p (FP_REGS, best_class))
1117 return best_class;
1119 mode = PSEUDO_REGNO_MODE (regno);
1120 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1123 static unsigned int
1124 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1126 if (GET_MODE_UNIT_SIZE (mode) == 4)
1127 return aarch64_tune_params.min_div_recip_mul_sf;
1128 return aarch64_tune_params.min_div_recip_mul_df;
1131 /* Return the reassociation width of treeop OPC with mode MODE. */
1132 static int
1133 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1135 if (VECTOR_MODE_P (mode))
1136 return aarch64_tune_params.vec_reassoc_width;
1137 if (INTEGRAL_MODE_P (mode))
1138 return aarch64_tune_params.int_reassoc_width;
1139 /* Avoid reassociating floating point addition so we emit more FMAs. */
1140 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1141 return aarch64_tune_params.fp_reassoc_width;
1142 return 1;
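/* For example, with generic_tunings above this returns 2 for integer modes,
   1 for vector modes, 4 for a DFmode multiplication, and 1 for a DFmode
   addition (PLUS_EXPR); the last case deliberately avoids reassociating FP
   additions so that they remain candidates for FMA formation.  */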
1145 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1146 unsigned
1147 aarch64_dbx_register_number (unsigned regno)
1149 if (GP_REGNUM_P (regno))
1150 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1151 else if (regno == SP_REGNUM)
1152 return AARCH64_DWARF_SP;
1153 else if (FP_REGNUM_P (regno))
1154 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1155 else if (PR_REGNUM_P (regno))
1156 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1157 else if (regno == VG_REGNUM)
1158 return AARCH64_DWARF_VG;
1160 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1161 equivalent DWARF register. */
1162 return DWARF_FRAME_REGISTERS;
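/* Illustrative mappings, assuming the usual AArch64 DWARF numbering from
   aarch64.h (AARCH64_DWARF_R0 = 0, AARCH64_DWARF_SP = 31, AARCH64_DWARF_VG
   = 46, AARCH64_DWARF_P0 = 48, AARCH64_DWARF_V0 = 64):  x7 -> 7, sp -> 31,
   v3 -> 67, p2 -> 50, and the SVE vector-granule register VG -> 46.  Any
   other register yields DWARF_FRAME_REGISTERS, i.e. "no DWARF equivalent".  */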
1165 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1166 static bool
1167 aarch64_advsimd_struct_mode_p (machine_mode mode)
1169 return (TARGET_SIMD
1170 && (mode == OImode || mode == CImode || mode == XImode));
1173 /* Return true if MODE is an SVE predicate mode. */
1174 static bool
1175 aarch64_sve_pred_mode_p (machine_mode mode)
1177 return (TARGET_SVE
1178 && (mode == VNx16BImode
1179 || mode == VNx8BImode
1180 || mode == VNx4BImode
1181 || mode == VNx2BImode));
1184 /* Three mutually-exclusive flags describing a vector or predicate type. */
1185 const unsigned int VEC_ADVSIMD = 1;
1186 const unsigned int VEC_SVE_DATA = 2;
1187 const unsigned int VEC_SVE_PRED = 4;
1188 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1189 a structure of 2, 3 or 4 vectors. */
1190 const unsigned int VEC_STRUCT = 8;
1191 /* Useful combinations of the above. */
1192 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1193 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1195 /* Return a set of flags describing the vector properties of mode MODE.
1196 Ignore modes that are not supported by the current target. */
1197 static unsigned int
1198 aarch64_classify_vector_mode (machine_mode mode)
1200 if (aarch64_advsimd_struct_mode_p (mode))
1201 return VEC_ADVSIMD | VEC_STRUCT;
1203 if (aarch64_sve_pred_mode_p (mode))
1204 return VEC_SVE_PRED;
1206 scalar_mode inner = GET_MODE_INNER (mode);
1207 if (VECTOR_MODE_P (mode)
1208 && (inner == QImode
1209 || inner == HImode
1210 || inner == HFmode
1211 || inner == SImode
1212 || inner == SFmode
1213 || inner == DImode
1214 || inner == DFmode))
1216 if (TARGET_SVE)
1218 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1219 return VEC_SVE_DATA;
1220 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1221 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1222 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1223 return VEC_SVE_DATA | VEC_STRUCT;
1226 /* This includes V1DF but not V1DI (which doesn't exist). */
1227 if (TARGET_SIMD
1228 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1229 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1230 return VEC_ADVSIMD;
1233 return 0;
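/* Informally, and assuming the relevant target features are enabled, the
   classification above gives for example:

     V16QImode   -> VEC_ADVSIMD                 (128-bit Advanced SIMD)
     V2SFmode    -> VEC_ADVSIMD                 (64-bit Advanced SIMD)
     OImode      -> VEC_ADVSIMD | VEC_STRUCT    (pair of Q registers)
     VNx4SImode  -> VEC_SVE_DATA                (single SVE data vector)
     VNx8SImode  -> VEC_SVE_DATA | VEC_STRUCT   (two SVE data vectors)
     VNx4BImode  -> VEC_SVE_PRED                (SVE predicate)

   Unsupported modes classify as 0.  */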
1236 /* Return true if MODE is any of the data vector modes, including
1237 structure modes. */
1238 static bool
1239 aarch64_vector_data_mode_p (machine_mode mode)
1241 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1244 /* Return true if MODE is an SVE data vector mode; either a single vector
1245 or a structure of vectors. */
1246 static bool
1247 aarch64_sve_data_mode_p (machine_mode mode)
1249 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1252 /* Implement target hook TARGET_ARRAY_MODE. */
1253 static opt_machine_mode
1254 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1256 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1257 && IN_RANGE (nelems, 2, 4))
1258 return mode_for_vector (GET_MODE_INNER (mode),
1259 GET_MODE_NUNITS (mode) * nelems);
1261 return opt_machine_mode ();
1264 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1265 static bool
1266 aarch64_array_mode_supported_p (machine_mode mode,
1267 unsigned HOST_WIDE_INT nelems)
1269 if (TARGET_SIMD
1270 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1271 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1272 && (nelems >= 2 && nelems <= 4))
1273 return true;
1275 return false;
1278 /* Return the SVE predicate mode to use for elements that have
1279 ELEM_NBYTES bytes, if such a mode exists. */
1281 opt_machine_mode
1282 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1284 if (TARGET_SVE)
1286 if (elem_nbytes == 1)
1287 return VNx16BImode;
1288 if (elem_nbytes == 2)
1289 return VNx8BImode;
1290 if (elem_nbytes == 4)
1291 return VNx4BImode;
1292 if (elem_nbytes == 8)
1293 return VNx2BImode;
1295 return opt_machine_mode ();
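/* In other words, each element size selects the SVE predicate mode that has
   one predicate bit per element byte: VNx16BI for byte elements, VNx8BI for
   halfwords, VNx4BI for words and VNx2BI for doublewords.  For example, a
   predicated operation on VNx4SImode data is governed by a VNx4BImode
   predicate.  */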
1298 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1300 static opt_machine_mode
1301 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1303 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1305 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1306 machine_mode pred_mode;
1307 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1308 return pred_mode;
1311 return default_get_mask_mode (nunits, nbytes);
1314 /* Implement TARGET_HARD_REGNO_NREGS. */
1316 static unsigned int
1317 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1319 /* ??? Logically we should only need to provide a value when
1320 HARD_REGNO_MODE_OK says that the combination is valid,
1321 but at the moment we need to handle all modes. Just ignore
1322 any runtime parts for registers that can't store them. */
1323 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1324 switch (aarch64_regno_regclass (regno))
1326 case FP_REGS:
1327 case FP_LO_REGS:
1328 if (aarch64_sve_data_mode_p (mode))
1329 return exact_div (GET_MODE_SIZE (mode),
1330 BYTES_PER_SVE_VECTOR).to_constant ();
1331 return CEIL (lowest_size, UNITS_PER_VREG);
1332 case PR_REGS:
1333 case PR_LO_REGS:
1334 case PR_HI_REGS:
1335 return 1;
1336 default:
1337 return CEIL (lowest_size, UNITS_PER_WORD);
1339 gcc_unreachable ();
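/* Some illustrative results, assuming 64-bit GP registers and 128-bit
   minimum vector registers:  TImode occupies 2 GP registers but only 1 FP
   register; OImode (an Advanced SIMD pair) occupies 2 FP registers; an SVE
   data mode such as VNx4SImode occupies exactly 1 FP register regardless of
   the runtime vector length; and any predicate mode occupies 1 predicate
   register.  */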
1342 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1344 static bool
1345 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1347 if (GET_MODE_CLASS (mode) == MODE_CC)
1348 return regno == CC_REGNUM;
1350 if (regno == VG_REGNUM)
1351 /* This must have the same size as _Unwind_Word. */
1352 return mode == DImode;
1354 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1355 if (vec_flags & VEC_SVE_PRED)
1356 return PR_REGNUM_P (regno);
1358 if (PR_REGNUM_P (regno))
1359 return 0;
1361 if (regno == SP_REGNUM)
1362 /* The purpose of comparing with ptr_mode is to support the
1363 global register variable associated with the stack pointer
1364 register via the syntax of asm ("wsp") in ILP32. */
1365 return mode == Pmode || mode == ptr_mode;
1367 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1368 return mode == Pmode;
1370 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1371 return true;
1373 if (FP_REGNUM_P (regno))
1375 if (vec_flags & VEC_STRUCT)
1376 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1377 else
1378 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1381 return false;
1384 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1385 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1386 clobbers the top 64 bits when restoring the bottom 64 bits. */
1388 static bool
1389 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1391 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
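/* For example, a TFmode or V4SImode value (16 bytes) live in v8 across a
   call is only partially preserved, because the callee is only required to
   save the low 64 bits of v8-v15; a DFmode value in the same register is
   fully preserved.  */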
1394 /* Implement REGMODE_NATURAL_SIZE. */
1395 poly_uint64
1396 aarch64_regmode_natural_size (machine_mode mode)
1398 /* The natural size for SVE data modes is one SVE data vector,
1399 and similarly for predicates. We can't independently modify
1400 anything smaller than that. */
1401 /* ??? For now, only do this for variable-width SVE registers.
1402 Doing it for constant-sized registers breaks lower-subreg.c. */
1403 /* ??? And once that's fixed, we should probably have similar
1404 code for Advanced SIMD. */
1405 if (!aarch64_sve_vg.is_constant ())
1407 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1408 if (vec_flags & VEC_SVE_PRED)
1409 return BYTES_PER_SVE_PRED;
1410 if (vec_flags & VEC_SVE_DATA)
1411 return BYTES_PER_SVE_VECTOR;
1413 return UNITS_PER_WORD;
1416 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1417 machine_mode
1418 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1419 machine_mode mode)
1421 /* The predicate mode determines which bits are significant and
1422 which are "don't care". Decreasing the number of lanes would
1423 lose data while increasing the number of lanes would make bits
1424 unnecessarily significant. */
1425 if (PR_REGNUM_P (regno))
1426 return mode;
1427 if (known_ge (GET_MODE_SIZE (mode), 4))
1428 return mode;
1429 else
1430 return SImode;
1433 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1434 that strcpy from constants will be faster. */
1436 static HOST_WIDE_INT
1437 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1439 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1440 return MAX (align, BITS_PER_WORD);
1441 return align;
1444 /* Return true if calls to DECL should be treated as
1445 long-calls (i.e. called via a register). */
1446 static bool
1447 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1449 return false;
1452 /* Return true if calls to symbol-ref SYM should be treated as
1453 long-calls (i.e. called via a register). */
1454 bool
1455 aarch64_is_long_call_p (rtx sym)
1457 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1460 /* Return true if calls to symbol-ref SYM should not go through
1461 plt stubs. */
1463 bool
1464 aarch64_is_noplt_call_p (rtx sym)
1466 const_tree decl = SYMBOL_REF_DECL (sym);
1468 if (flag_pic
1469 && decl
1470 && (!flag_plt
1471 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1472 && !targetm.binds_local_p (decl))
1473 return true;
1475 return false;
1478 /* Return true if the offsets to a zero/sign-extract operation
1479 represent an expression that matches an extend operation. The
1480 operands represent the parameters from
1482 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1483 bool
1484 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1485 rtx extract_imm)
1487 HOST_WIDE_INT mult_val, extract_val;
1489 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1490 return false;
1492 mult_val = INTVAL (mult_imm);
1493 extract_val = INTVAL (extract_imm);
1495 if (extract_val > 8
1496 && extract_val < GET_MODE_BITSIZE (mode)
1497 && exact_log2 (extract_val & ~7) > 0
1498 && (extract_val & 7) <= 4
1499 && mult_val == (1 << (extract_val & 7)))
1500 return true;
1502 return false;
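/* A worked example:  for (zero_extract:DI (mult (reg) (const_int 4))
   (const_int 34) (const_int 0)) we have extract_val == 34, so
   extract_val & ~7 == 32 (a power of two), extract_val & 7 == 2, and the
   test requires mult_val == 1 << 2.  That corresponds to zero-extending a
   32-bit value and shifting it left by 2, e.g. a "uxtw #2" scaled index.  */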
1505 /* Emit an insn that's a simple single-set. Both the operands must be
1506 known to be valid. */
1507 inline static rtx_insn *
1508 emit_set_insn (rtx x, rtx y)
1510 return emit_insn (gen_rtx_SET (x, y));
1513 /* X and Y are two things to compare using CODE. Emit the compare insn and
1514 return the rtx for register 0 in the proper mode. */
1515 rtx
1516 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1518 machine_mode mode = SELECT_CC_MODE (code, x, y);
1519 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1521 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1522 return cc_reg;
1525 /* Build the SYMBOL_REF for __tls_get_addr. */
1527 static GTY(()) rtx tls_get_addr_libfunc;
1529 rtx
1530 aarch64_tls_get_addr (void)
1532 if (!tls_get_addr_libfunc)
1533 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1534 return tls_get_addr_libfunc;
1537 /* Return the TLS model to use for ADDR. */
1539 static enum tls_model
1540 tls_symbolic_operand_type (rtx addr)
1542 enum tls_model tls_kind = TLS_MODEL_NONE;
1543 if (GET_CODE (addr) == CONST)
1545 poly_int64 addend;
1546 rtx sym = strip_offset (addr, &addend);
1547 if (GET_CODE (sym) == SYMBOL_REF)
1548 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1550 else if (GET_CODE (addr) == SYMBOL_REF)
1551 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1553 return tls_kind;
1556 /* We'll allow lo_sum's in our legitimate addresses so that combine
1557 can take care of combining addresses where necessary, but for code
1558 generation purposes we'll generate the address
1559 as:
1560 RTL Absolute
1561 tmp = hi (symbol_ref); adrp x1, foo
1562 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1565 PIC TLS
1566 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1567 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1568 bl __tls_get_addr
1571 Load TLS symbol, depending on TLS mechanism and TLS access model.
1573 Global Dynamic - Traditional TLS:
1574 adrp tmp, :tlsgd:imm
1575 add dest, tmp, #:tlsgd_lo12:imm
1576 bl __tls_get_addr
1578 Global Dynamic - TLS Descriptors:
1579 adrp dest, :tlsdesc:imm
1580 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1581 add dest, dest, #:tlsdesc_lo12:imm
1582 blr tmp
1583 mrs tp, tpidr_el0
1584 add dest, dest, tp
1586 Initial Exec:
1587 mrs tp, tpidr_el0
1588 adrp tmp, :gottprel:imm
1589 ldr dest, [tmp, #:gottprel_lo12:imm]
1590 add dest, dest, tp
1592 Local Exec:
1593 mrs tp, tpidr_el0
1594 add t0, tp, #:tprel_hi12:imm, lsl #12
1595 add t0, t0, #:tprel_lo12_nc:imm
1598 static void
1599 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1600 enum aarch64_symbol_type type)
1602 switch (type)
1604 case SYMBOL_SMALL_ABSOLUTE:
1606 /* In ILP32, the mode of dest can be either SImode or DImode. */
1607 rtx tmp_reg = dest;
1608 machine_mode mode = GET_MODE (dest);
1610 gcc_assert (mode == Pmode || mode == ptr_mode);
1612 if (can_create_pseudo_p ())
1613 tmp_reg = gen_reg_rtx (mode);
1615 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1616 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1617 return;
1620 case SYMBOL_TINY_ABSOLUTE:
1621 emit_insn (gen_rtx_SET (dest, imm));
1622 return;
1624 case SYMBOL_SMALL_GOT_28K:
1626 machine_mode mode = GET_MODE (dest);
1627 rtx gp_rtx = pic_offset_table_rtx;
1628 rtx insn;
1629 rtx mem;
1631 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1632 here before rtl expansion. Tree IVOPTS will generate an rtl pattern
1633 to decide rtx costs, in which case pic_offset_table_rtx is not
1634 initialized. In that case there is no need to generate the first
1635 adrp instruction, as the final cost for global variable access is
1636 one instruction.
1637 if (gp_rtx != NULL)
1639 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1640 use the page base as the GOT base, the first page may be wasted;
1641 in the worst case only 28K of space is left for the GOT).
1643 The generated instruction sequence for accessing a global variable is:
1646 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1648 Only one instruction is needed. But we must initialize
1649 pic_offset_table_rtx properly. We generate an initialization insn
1650 for every global access, and allow CSE to remove the redundant ones.
1652 The final instruction sequence will look like the following
1653 for multiple global variable accesses.
1655 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1657 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1658 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1659 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1660 ... */
1662 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1663 crtl->uses_pic_offset_table = 1;
1664 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1666 if (mode != GET_MODE (gp_rtx))
1667 gp_rtx = gen_lowpart (mode, gp_rtx);
1671 if (mode == ptr_mode)
1673 if (mode == DImode)
1674 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1675 else
1676 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1678 mem = XVECEXP (SET_SRC (insn), 0, 0);
1680 else
1682 gcc_assert (mode == Pmode);
1684 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1685 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1688 /* The operand is expected to be a MEM. Whenever the related insn
1689 pattern changes, the above code which calculates mem should be
1690 updated. */
1691 gcc_assert (GET_CODE (mem) == MEM);
1692 MEM_READONLY_P (mem) = 1;
1693 MEM_NOTRAP_P (mem) = 1;
1694 emit_insn (insn);
1695 return;
1698 case SYMBOL_SMALL_GOT_4G:
1700 /* In ILP32, the mode of dest can be either SImode or DImode,
1701 while the got entry is always of SImode size. The mode of
1702 dest depends on how dest is used: if dest is assigned to a
1703 pointer (e.g. in the memory), it has SImode; it may have
1704 DImode if dest is dereferenced to access the memory.
1705 This is why we have to handle three different ldr_got_small
1706 patterns here (two patterns for ILP32). */
1708 rtx insn;
1709 rtx mem;
1710 rtx tmp_reg = dest;
1711 machine_mode mode = GET_MODE (dest);
1713 if (can_create_pseudo_p ())
1714 tmp_reg = gen_reg_rtx (mode);
1716 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1717 if (mode == ptr_mode)
1719 if (mode == DImode)
1720 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1721 else
1722 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1724 mem = XVECEXP (SET_SRC (insn), 0, 0);
1726 else
1728 gcc_assert (mode == Pmode);
1730 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1731 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1734 gcc_assert (GET_CODE (mem) == MEM);
1735 MEM_READONLY_P (mem) = 1;
1736 MEM_NOTRAP_P (mem) = 1;
1737 emit_insn (insn);
1738 return;
1741 case SYMBOL_SMALL_TLSGD:
1743 rtx_insn *insns;
1744 machine_mode mode = GET_MODE (dest);
1745 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1747 start_sequence ();
1748 if (TARGET_ILP32)
1749 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1750 else
1751 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1752 insns = get_insns ();
1753 end_sequence ();
1755 RTL_CONST_CALL_P (insns) = 1;
1756 emit_libcall_block (insns, dest, result, imm);
1757 return;
1760 case SYMBOL_SMALL_TLSDESC:
1762 machine_mode mode = GET_MODE (dest);
1763 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1764 rtx tp;
1766 gcc_assert (mode == Pmode || mode == ptr_mode);
1768 /* In ILP32, the got entry is always of SImode size. Unlike
1769 small GOT, the dest is fixed at reg 0. */
1770 if (TARGET_ILP32)
1771 emit_insn (gen_tlsdesc_small_si (imm));
1772 else
1773 emit_insn (gen_tlsdesc_small_di (imm));
1774 tp = aarch64_load_tp (NULL);
1776 if (mode != Pmode)
1777 tp = gen_lowpart (mode, tp);
1779 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1780 if (REG_P (dest))
1781 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1782 return;
1785 case SYMBOL_SMALL_TLSIE:
1787 /* In ILP32, the mode of dest can be either SImode or DImode,
1788 while the got entry is always of SImode size. The mode of
1789 dest depends on how dest is used: if dest is assigned to a
1790 pointer (e.g. in the memory), it has SImode; it may have
1791 DImode if dest is dereferenced to access the memory.
1792 This is why we have to handle three different tlsie_small
1793 patterns here (two patterns for ILP32). */
1794 machine_mode mode = GET_MODE (dest);
1795 rtx tmp_reg = gen_reg_rtx (mode);
1796 rtx tp = aarch64_load_tp (NULL);
1798 if (mode == ptr_mode)
1800 if (mode == DImode)
1801 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1802 else
1804 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1805 tp = gen_lowpart (mode, tp);
1808 else
1810 gcc_assert (mode == Pmode);
1811 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1814 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1815 if (REG_P (dest))
1816 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1817 return;
1820 case SYMBOL_TLSLE12:
1821 case SYMBOL_TLSLE24:
1822 case SYMBOL_TLSLE32:
1823 case SYMBOL_TLSLE48:
1825 machine_mode mode = GET_MODE (dest);
1826 rtx tp = aarch64_load_tp (NULL);
1828 if (mode != Pmode)
1829 tp = gen_lowpart (mode, tp);
1831 switch (type)
1833 case SYMBOL_TLSLE12:
1834 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1835 (dest, tp, imm));
1836 break;
1837 case SYMBOL_TLSLE24:
1838 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1839 (dest, tp, imm));
1840 break;
1841 case SYMBOL_TLSLE32:
1842 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1843 (dest, imm));
1844 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1845 (dest, dest, tp));
1846 break;
1847 case SYMBOL_TLSLE48:
1848 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1849 (dest, imm));
1850 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1851 (dest, dest, tp));
1852 break;
1853 default:
1854 gcc_unreachable ();
1857 if (REG_P (dest))
1858 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1859 return;
1862 case SYMBOL_TINY_GOT:
1863 emit_insn (gen_ldr_got_tiny (dest, imm));
1864 return;
1866 case SYMBOL_TINY_TLSIE:
1868 machine_mode mode = GET_MODE (dest);
1869 rtx tp = aarch64_load_tp (NULL);
1871 if (mode == ptr_mode)
1873 if (mode == DImode)
1874 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1875 else
1877 tp = gen_lowpart (mode, tp);
1878 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1881 else
1883 gcc_assert (mode == Pmode);
1884 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1887 if (REG_P (dest))
1888 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1889 return;
1892 default:
1893 gcc_unreachable ();
1897 /* Emit a move from SRC to DEST. Assume that the move expanders can
1898 handle all moves if !can_create_pseudo_p (). The distinction is
1899 important because, unlike emit_move_insn, the move expanders know
1900 how to force Pmode objects into the constant pool even when the
1901 constant pool address is not itself legitimate. */
1902 static rtx
1903 aarch64_emit_move (rtx dest, rtx src)
1905 return (can_create_pseudo_p ()
1906 ? emit_move_insn (dest, src)
1907 : emit_move_insn_1 (dest, src));
1910 /* Apply UNOPTAB to OP and store the result in DEST. */
1912 static void
1913 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1915 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1916 if (dest != tmp)
1917 emit_move_insn (dest, tmp);
1920 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1922 static void
1923 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1925 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1926 OPTAB_DIRECT);
1927 if (dest != tmp)
1928 emit_move_insn (dest, tmp);
1931 /* Split a 128-bit move operation into two 64-bit move operations,
1932 taking care to handle partial overlap of register to register
1933 copies. Special cases are needed when moving between GP regs and
1934 FP regs. SRC can be a register, constant or memory; DST a register
1935 or memory. If either operand is memory it must not have any side
1936 effects. */
1937 void
1938 aarch64_split_128bit_move (rtx dst, rtx src)
1940 rtx dst_lo, dst_hi;
1941 rtx src_lo, src_hi;
1943 machine_mode mode = GET_MODE (dst);
1945 gcc_assert (mode == TImode || mode == TFmode);
1946 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1947 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1949 if (REG_P (dst) && REG_P (src))
1951 int src_regno = REGNO (src);
1952 int dst_regno = REGNO (dst);
1954 /* Handle FP <-> GP regs. */
1955 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1957 src_lo = gen_lowpart (word_mode, src);
1958 src_hi = gen_highpart (word_mode, src);
1960 if (mode == TImode)
1962 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1963 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1965 else
1967 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1968 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1970 return;
1972 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1974 dst_lo = gen_lowpart (word_mode, dst);
1975 dst_hi = gen_highpart (word_mode, dst);
1977 if (mode == TImode)
1979 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1980 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1982 else
1984 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1985 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1987 return;
1991 dst_lo = gen_lowpart (word_mode, dst);
1992 dst_hi = gen_highpart (word_mode, dst);
1993 src_lo = gen_lowpart (word_mode, src);
1994 src_hi = gen_highpart_mode (word_mode, mode, src);
1996 /* At most one pairing may overlap. */
1997 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1999 aarch64_emit_move (dst_hi, src_hi);
2000 aarch64_emit_move (dst_lo, src_lo);
2002 else
2004 aarch64_emit_move (dst_lo, src_lo);
2005 aarch64_emit_move (dst_hi, src_hi);
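/* Return true if a 128-bit move from SRC to DST needs to be split into
   two 64-bit moves, i.e. unless SRC is a register and both operands are
   FP registers, in which case a single 128-bit register move suffices.  */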
2009 bool
2010 aarch64_split_128bit_move_p (rtx dst, rtx src)
2012 return (! REG_P (src)
2013 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2016 /* Split a complex SIMD combine. */
2018 void
2019 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2021 machine_mode src_mode = GET_MODE (src1);
2022 machine_mode dst_mode = GET_MODE (dst);
2024 gcc_assert (VECTOR_MODE_P (dst_mode));
2025 gcc_assert (register_operand (dst, dst_mode)
2026 && register_operand (src1, src_mode)
2027 && register_operand (src2, src_mode));
2029 rtx (*gen) (rtx, rtx, rtx);
2031 switch (src_mode)
2033 case E_V8QImode:
2034 gen = gen_aarch64_simd_combinev8qi;
2035 break;
2036 case E_V4HImode:
2037 gen = gen_aarch64_simd_combinev4hi;
2038 break;
2039 case E_V2SImode:
2040 gen = gen_aarch64_simd_combinev2si;
2041 break;
2042 case E_V4HFmode:
2043 gen = gen_aarch64_simd_combinev4hf;
2044 break;
2045 case E_V2SFmode:
2046 gen = gen_aarch64_simd_combinev2sf;
2047 break;
2048 case E_DImode:
2049 gen = gen_aarch64_simd_combinedi;
2050 break;
2051 case E_DFmode:
2052 gen = gen_aarch64_simd_combinedf;
2053 break;
2054 default:
2055 gcc_unreachable ();
2058 emit_insn (gen (dst, src1, src2));
2059 return;
2062 /* Split a complex SIMD move. */
2064 void
2065 aarch64_split_simd_move (rtx dst, rtx src)
2067 machine_mode src_mode = GET_MODE (src);
2068 machine_mode dst_mode = GET_MODE (dst);
2070 gcc_assert (VECTOR_MODE_P (dst_mode));
2072 if (REG_P (dst) && REG_P (src))
2074 rtx (*gen) (rtx, rtx);
2076 gcc_assert (VECTOR_MODE_P (src_mode));
2078 switch (src_mode)
2080 case E_V16QImode:
2081 gen = gen_aarch64_split_simd_movv16qi;
2082 break;
2083 case E_V8HImode:
2084 gen = gen_aarch64_split_simd_movv8hi;
2085 break;
2086 case E_V4SImode:
2087 gen = gen_aarch64_split_simd_movv4si;
2088 break;
2089 case E_V2DImode:
2090 gen = gen_aarch64_split_simd_movv2di;
2091 break;
2092 case E_V8HFmode:
2093 gen = gen_aarch64_split_simd_movv8hf;
2094 break;
2095 case E_V4SFmode:
2096 gen = gen_aarch64_split_simd_movv4sf;
2097 break;
2098 case E_V2DFmode:
2099 gen = gen_aarch64_split_simd_movv2df;
2100 break;
2101 default:
2102 gcc_unreachable ();
2105 emit_insn (gen (dst, src));
2106 return;
2110 bool
2111 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2112 machine_mode ymode, rtx y)
2114 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2115 gcc_assert (r != NULL);
2116 return rtx_equal_p (x, r);
2120 static rtx
2121 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2123 if (can_create_pseudo_p ())
2124 return force_reg (mode, value);
2125 else
2127 gcc_assert (x);
2128 aarch64_emit_move (x, value);
2129 return x;
2133 /* Return true if we can move VALUE into a register using a single
2134 CNT[BHWD] instruction. */
2136 static bool
2137 aarch64_sve_cnt_immediate_p (poly_int64 value)
2139 HOST_WIDE_INT factor = value.coeffs[0];
2140 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2141 return (value.coeffs[1] == factor
2142 && IN_RANGE (factor, 2, 16 * 16)
2143 && (factor & 1) == 0
2144 && factor <= 16 * (factor & -factor));
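/* As a concrete illustration of the test above: CNTD produces the value
   (2, 2), so (6, 6) is accepted (CNTD with MUL #3) and (256, 256) is
   accepted (CNTB with MUL #16), but (34, 34) is rejected because it
   would need a multiplier of 17.  */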
2147 /* Likewise for rtx X. */
2149 bool
2150 aarch64_sve_cnt_immediate_p (rtx x)
2152 poly_int64 value;
2153 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2156 /* Return the asm string for an instruction with a CNT-like vector size
2157 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2158 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2159 first part of the operands template (the part that comes before the
2160 vector size itself). FACTOR is the count per 128-bit quadword.
2161 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2162 If it is zero, we can use any element size. */
2164 static char *
2165 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2166 unsigned int factor,
2167 unsigned int nelts_per_vq)
2169 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2171 if (nelts_per_vq == 0)
2172 /* There is some overlap in the ranges of the four CNT instructions.
2173 Here we always use the smallest possible element size, so that the
2174 multiplier is 1 wherever possible. */
2175 nelts_per_vq = factor & -factor;
2176 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2177 gcc_assert (IN_RANGE (shift, 1, 4));
2178 char suffix = "dwhb"[shift - 1];
2180 factor >>= shift;
2181 unsigned int written;
2182 if (factor == 1)
2183 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2184 prefix, suffix, operands);
2185 else
2186 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2187 prefix, suffix, operands, factor);
2188 gcc_assert (written < sizeof (buffer));
2189 return buffer;
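/* For example, with PREFIX "cnt" and OPERANDS "%x0", a FACTOR of 2 and
   NELTS_PER_VQ of 0 produce "cntd\t%x0", while a FACTOR of 32 produces
   "cntb\t%x0, all, mul #2"; the exact operand text comes from the caller.  */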
2192 /* Return the asm string for an instruction with a CNT-like vector size
2193 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2194 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2195 first part of the operands template (the part that comes before the
2196 vector size itself). X is the value of the vector size operand,
2197 as a polynomial integer rtx. */
2199 char *
2200 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2201 rtx x)
2203 poly_int64 value = rtx_to_poly_int64 (x);
2204 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2205 return aarch64_output_sve_cnt_immediate (prefix, operands,
2206 value.coeffs[1], 0);
2209 /* Return true if we can add VALUE to a register using a single ADDVL
2210 or ADDPL instruction. */
2212 static bool
2213 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2215 HOST_WIDE_INT factor = value.coeffs[0];
2216 if (factor == 0 || value.coeffs[1] != factor)
2217 return false;
2218 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2219 and a value of 16 is one vector width. */
2220 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2221 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
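/* For example, (16, 16) -- one full vector length -- can be added with
   "addvl ..., #1", (2, 2) -- one predicate length -- with "addpl ..., #1",
   and (8, 8) with "addpl ..., #4".  */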
2224 /* Likewise for rtx X. */
2226 bool
2227 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2229 poly_int64 value;
2230 return (poly_int_rtx_p (x, &value)
2231 && aarch64_sve_addvl_addpl_immediate_p (value));
2234 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2235 and storing the result in operand 0. */
2237 char *
2238 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2240 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2241 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2242 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2244 /* Use INC or DEC if possible. */
2245 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2247 if (aarch64_sve_cnt_immediate_p (offset_value))
2248 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2249 offset_value.coeffs[1], 0);
2250 if (aarch64_sve_cnt_immediate_p (-offset_value))
2251 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2252 -offset_value.coeffs[1], 0);
2255 int factor = offset_value.coeffs[1];
2256 if ((factor & 15) == 0)
2257 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2258 else
2259 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2260 return buffer;
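/* For example, if DEST and BASE are the same GP register and OFFSET is
   (32, 32), the INC form above is used and the result is
   "incb\t%x0, all, mul #2"; otherwise the same offset would be printed
   as "addvl\t%x0, %x1, #2".  */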
2263 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2264 instruction. If it is, store the number of elements in each vector
2265 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2266 factor in *FACTOR_OUT (if nonnull). */
2268 bool
2269 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2270 unsigned int *nelts_per_vq_out)
2272 rtx elt;
2273 poly_int64 value;
2275 if (!const_vec_duplicate_p (x, &elt)
2276 || !poly_int_rtx_p (elt, &value))
2277 return false;
2279 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2280 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2281 /* There's no vector INCB. */
2282 return false;
2284 HOST_WIDE_INT factor = value.coeffs[0];
2285 if (value.coeffs[1] != factor)
2286 return false;
2288 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2289 if ((factor % nelts_per_vq) != 0
2290 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2291 return false;
2293 if (factor_out)
2294 *factor_out = factor;
2295 if (nelts_per_vq_out)
2296 *nelts_per_vq_out = nelts_per_vq;
2297 return true;
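/* For example, for a VNx4SI constant (four 32-bit elements per quadword),
   a duplicated (4, 4) is a valid INCW/DECW immediate with multiplier 1
   and (64, 64) is valid with multiplier 16, but (68, 68) is rejected.  */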
2300 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2301 instruction. */
2303 bool
2304 aarch64_sve_inc_dec_immediate_p (rtx x)
2306 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2309 /* Return the asm template for an SVE vector INC or DEC instruction.
2310 OPERANDS gives the operands before the vector count and X is the
2311 value of the vector count operand itself. */
2313 char *
2314 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2316 int factor;
2317 unsigned int nelts_per_vq;
2318 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2319 gcc_unreachable ();
2320 if (factor < 0)
2321 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2322 nelts_per_vq);
2323 else
2324 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2325 nelts_per_vq);
2328 static int
2329 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2330 scalar_int_mode mode)
2332 int i;
2333 unsigned HOST_WIDE_INT val, val2, mask;
2334 int one_match, zero_match;
2335 int num_insns;
2337 val = INTVAL (imm);
2339 if (aarch64_move_imm (val, mode))
2341 if (generate)
2342 emit_insn (gen_rtx_SET (dest, imm));
2343 return 1;
2346 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2347 (with XXXX non-zero). In that case check to see if the move can be done in
2348 a smaller mode. */
2349 val2 = val & 0xffffffff;
2350 if (mode == DImode
2351 && aarch64_move_imm (val2, SImode)
2352 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2354 if (generate)
2355 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2357 /* Check if we have to emit a second instruction by checking to see
2358 if any of the upper 32 bits of the original DI mode value is set. */
2359 if (val == val2)
2360 return 1;
2362 i = (val >> 48) ? 48 : 32;
2364 if (generate)
2365 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2366 GEN_INT ((val >> i) & 0xffff)));
2368 return 2;
2371 if ((val >> 32) == 0 || mode == SImode)
2373 if (generate)
2375 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2376 if (mode == SImode)
2377 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2378 GEN_INT ((val >> 16) & 0xffff)));
2379 else
2380 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2381 GEN_INT ((val >> 16) & 0xffff)));
2383 return 2;
2386 /* Remaining cases are all for DImode. */
2388 mask = 0xffff;
2389 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2390 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2391 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2392 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2394 if (zero_match != 2 && one_match != 2)
2396 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2397 For a 64-bit bitmask try whether changing 16 bits to all ones or
2398 zeroes creates a valid bitmask. To check any repeated bitmask,
2399 try using 16 bits from the other 32-bit half of val. */
2401 for (i = 0; i < 64; i += 16, mask <<= 16)
2403 val2 = val & ~mask;
2404 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2405 break;
2406 val2 = val | mask;
2407 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2408 break;
2409 val2 = val2 & ~mask;
2410 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2411 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2412 break;
2414 if (i != 64)
2416 if (generate)
2418 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2419 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2420 GEN_INT ((val >> i) & 0xffff)));
2422 return 2;
2426 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2427 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2428 otherwise skip zero bits. */
2430 num_insns = 1;
2431 mask = 0xffff;
2432 val2 = one_match > zero_match ? ~val : val;
2433 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2435 if (generate)
2436 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2437 ? (val | ~(mask << i))
2438 : (val & (mask << i)))));
2439 for (i += 16; i < 64; i += 16)
2441 if ((val2 & (mask << i)) == 0)
2442 continue;
2443 if (generate)
2444 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2445 GEN_INT ((val >> i) & 0xffff)));
2446 num_insns ++;
2449 return num_insns;
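/* As a worked example of the above: the DImode constant
   0x1234000000005678 is not a single MOV/MOVN/bitmask immediate, but its
   low 32 bits are, and bits [47:32] are zero, so it is built with two
   instructions, roughly "mov dest, 0x5678" then "movk dest, 0x1234, lsl 48".  */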
2452 /* Return whether imm is a 128-bit immediate which is simple enough to
2453 expand inline. */
2454 bool
2455 aarch64_mov128_immediate (rtx imm)
2457 if (GET_CODE (imm) == CONST_INT)
2458 return true;
2460 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2462 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2463 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2465 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2466 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
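/* For example, a TImode constant whose high half is all ones and whose
   low half is 0x1234 needs only 1 + 1 instructions and is expanded
   inline, whereas one needing four MOV/MOVKs per half (4 + 4 > 4) is not.  */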
2470 /* Return the number of temporary registers that aarch64_add_offset_1
2471 would need to add OFFSET to a register. */
2473 static unsigned int
2474 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2476 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2479 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2480 a non-polynomial OFFSET. MODE is the mode of the addition.
2481 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2482 be set and CFA adjustments added to the generated instructions.
2484 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2485 temporary if register allocation is already complete. This temporary
2486 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2487 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2488 the immediate again.
2490 Since this function may be used to adjust the stack pointer, we must
2491 ensure that it cannot cause transient stack deallocation (for example
2492 by first incrementing SP and then decrementing when adjusting by a
2493 large immediate). */
2495 static void
2496 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2497 rtx src, HOST_WIDE_INT offset, rtx temp1,
2498 bool frame_related_p, bool emit_move_imm)
2500 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2501 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2503 HOST_WIDE_INT moffset = abs_hwi (offset);
2504 rtx_insn *insn;
2506 if (!moffset)
2508 if (!rtx_equal_p (dest, src))
2510 insn = emit_insn (gen_rtx_SET (dest, src));
2511 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2513 return;
2516 /* Single instruction adjustment. */
2517 if (aarch64_uimm12_shift (moffset))
2519 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2520 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2521 return;
2524 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2525 and either:
2527 a) the offset cannot be loaded by a 16-bit move or
2528 b) there is no spare register into which we can move it. */
2529 if (moffset < 0x1000000
2530 && ((!temp1 && !can_create_pseudo_p ())
2531 || !aarch64_move_imm (moffset, mode)))
2533 HOST_WIDE_INT low_off = moffset & 0xfff;
2535 low_off = offset < 0 ? -low_off : low_off;
2536 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2537 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2538 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2539 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2540 return;
2543 /* Emit a move immediate if required and an addition/subtraction. */
2544 if (emit_move_imm)
2546 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2547 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2549 insn = emit_insn (offset < 0
2550 ? gen_sub3_insn (dest, src, temp1)
2551 : gen_add3_insn (dest, src, temp1));
2552 if (frame_related_p)
2554 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2555 rtx adj = plus_constant (mode, src, offset);
2556 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
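/* For example, an adjustment of 0x123456 (not representable as a move
   immediate or a shifted 12-bit add immediate) is emitted as the pair
   "add dest, src, #0x456" followed by "add dest, dest, #0x123000",
   both in the same direction so the stack is never transiently
   deallocated.  */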
2560 /* Return the number of temporary registers that aarch64_add_offset
2561 would need to move OFFSET into a register or add OFFSET to a register;
2562 ADD_P is true if we want the latter rather than the former. */
2564 static unsigned int
2565 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2567 /* This follows the same structure as aarch64_add_offset. */
2568 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2569 return 0;
2571 unsigned int count = 0;
2572 HOST_WIDE_INT factor = offset.coeffs[1];
2573 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2574 poly_int64 poly_offset (factor, factor);
2575 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2576 /* Need one register for the ADDVL/ADDPL result. */
2577 count += 1;
2578 else if (factor != 0)
2580 factor = abs (factor);
2581 if (factor > 16 * (factor & -factor))
2582 /* Need one register for the CNT result and one for the multiplication
2583 factor. If necessary, the second temporary can be reused for the
2584 constant part of the offset. */
2585 return 2;
2586 /* Need one register for the CNT result (which might then
2587 be shifted). */
2588 count += 1;
2590 return count + aarch64_add_offset_1_temporaries (constant);
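/* For example, adding (16, 16) to a register needs no temporaries (a
   single ADDVL), whereas moving it into a register (ADD_P false) is
   counted as needing one temporary for the CNT result.  */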
2593 /* If X can be represented as a poly_int64, return the number
2594 of temporaries that are required to add it to a register.
2595 Return -1 otherwise. */
2598 aarch64_add_offset_temporaries (rtx x)
2600 poly_int64 offset;
2601 if (!poly_int_rtx_p (x, &offset))
2602 return -1;
2603 return aarch64_offset_temporaries (true, offset);
2606 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2607 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2608 be set and CFA adjustments added to the generated instructions.
2610 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2611 temporary if register allocation is already complete. This temporary
2612 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2613 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2614 false to avoid emitting the immediate again.
2616 TEMP2, if nonnull, is a second temporary register that doesn't
2617 overlap either DEST or REG.
2619 Since this function may be used to adjust the stack pointer, we must
2620 ensure that it cannot cause transient stack deallocation (for example
2621 by first incrementing SP and then decrementing when adjusting by a
2622 large immediate). */
2624 static void
2625 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2626 poly_int64 offset, rtx temp1, rtx temp2,
2627 bool frame_related_p, bool emit_move_imm = true)
2629 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2630 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2631 gcc_assert (temp1 == NULL_RTX
2632 || !frame_related_p
2633 || !reg_overlap_mentioned_p (temp1, dest));
2634 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2636 /* Try using ADDVL or ADDPL to add the whole value. */
2637 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2639 rtx offset_rtx = gen_int_mode (offset, mode);
2640 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2641 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2642 return;
2645 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2646 SVE vector register, over and above the minimum size of 128 bits.
2647 This is equivalent to half the value returned by CNTD with a
2648 vector shape of ALL. */
2649 HOST_WIDE_INT factor = offset.coeffs[1];
2650 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2652 /* Try using ADDVL or ADDPL to add the VG-based part. */
2653 poly_int64 poly_offset (factor, factor);
2654 if (src != const0_rtx
2655 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2657 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2658 if (frame_related_p)
2660 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2661 RTX_FRAME_RELATED_P (insn) = true;
2662 src = dest;
2664 else
2666 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2667 src = aarch64_force_temporary (mode, temp1, addr);
2668 temp1 = temp2;
2669 temp2 = NULL_RTX;
2672 /* Otherwise use a CNT-based sequence. */
2673 else if (factor != 0)
2675 /* Use a subtraction if we have a negative factor. */
2676 rtx_code code = PLUS;
2677 if (factor < 0)
2679 factor = -factor;
2680 code = MINUS;
2683 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2684 into the multiplication. */
2685 rtx val;
2686 int shift = 0;
2687 if (factor & 1)
2688 /* Use a right shift by 1. */
2689 shift = -1;
2690 else
2691 factor /= 2;
2692 HOST_WIDE_INT low_bit = factor & -factor;
2693 if (factor <= 16 * low_bit)
2695 if (factor > 16 * 8)
2697 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2698 the value with the minimum multiplier and shift it into
2699 position. */
2700 int extra_shift = exact_log2 (low_bit);
2701 shift += extra_shift;
2702 factor >>= extra_shift;
2704 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2706 else
2708 /* Use CNTD, then multiply it by FACTOR. */
2709 val = gen_int_mode (poly_int64 (2, 2), mode);
2710 val = aarch64_force_temporary (mode, temp1, val);
2712 /* Go back to using a negative multiplication factor if we have
2713 no register from which to subtract. */
2714 if (code == MINUS && src == const0_rtx)
2716 factor = -factor;
2717 code = PLUS;
2719 rtx coeff1 = gen_int_mode (factor, mode);
2720 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2721 val = gen_rtx_MULT (mode, val, coeff1);
2724 if (shift > 0)
2726 /* Multiply by 1 << SHIFT. */
2727 val = aarch64_force_temporary (mode, temp1, val);
2728 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2730 else if (shift == -1)
2732 /* Divide by 2. */
2733 val = aarch64_force_temporary (mode, temp1, val);
2734 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2737 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2738 if (src != const0_rtx)
2740 val = aarch64_force_temporary (mode, temp1, val);
2741 val = gen_rtx_fmt_ee (code, mode, src, val);
2743 else if (code == MINUS)
2745 val = aarch64_force_temporary (mode, temp1, val);
2746 val = gen_rtx_NEG (mode, val);
2749 if (constant == 0 || frame_related_p)
2751 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2752 if (frame_related_p)
2754 RTX_FRAME_RELATED_P (insn) = true;
2755 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2756 gen_rtx_SET (dest, plus_constant (Pmode, src,
2757 poly_offset)));
2759 src = dest;
2760 if (constant == 0)
2761 return;
2763 else
2765 src = aarch64_force_temporary (mode, temp1, val);
2766 temp1 = temp2;
2767 temp2 = NULL_RTX;
2770 emit_move_imm = true;
2773 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2774 frame_related_p, emit_move_imm);
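/* As an example of the decomposition above: an offset of (0, 16),
   i.e. VL - 16 bytes, has FACTOR 16 and CONSTANT -16, so it is emitted
   as an ADDVL #1 followed by a subtraction of 16.  */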
2777 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2778 than a poly_int64. */
2780 void
2781 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2782 rtx offset_rtx, rtx temp1, rtx temp2)
2784 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2785 temp1, temp2, false);
2788 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2789 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2790 if TEMP1 already contains abs (DELTA). */
2792 static inline void
2793 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2795 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2796 temp1, temp2, true, emit_move_imm);
2799 /* Subtract DELTA from the stack pointer, marking the instructions
2800 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2801 if nonnull. */
2803 static inline void
2804 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2806 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2807 temp1, temp2, frame_related_p);
2810 /* Set DEST to (vec_series BASE STEP). */
2812 static void
2813 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2815 machine_mode mode = GET_MODE (dest);
2816 scalar_mode inner = GET_MODE_INNER (mode);
2818 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2819 if (!aarch64_sve_index_immediate_p (base))
2820 base = force_reg (inner, base);
2821 if (!aarch64_sve_index_immediate_p (step))
2822 step = force_reg (inner, step);
2824 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2827 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2828 integer of mode SRC_MODE. Return true on success. */
2830 static bool
2831 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2832 rtx src)
2834 /* If the constant is smaller than 128 bits, we can do the move
2835 using a vector of SRC_MODEs. */
2836 if (src_mode != TImode)
2838 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2839 GET_MODE_SIZE (src_mode));
2840 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2841 emit_move_insn (gen_lowpart (dup_mode, dest),
2842 gen_const_vec_duplicate (dup_mode, src));
2843 return true;
2846 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2847 src = force_const_mem (src_mode, src);
2848 if (!src)
2849 return false;
2851 /* Make sure that the address is legitimate. */
2852 if (!aarch64_sve_ld1r_operand_p (src))
2854 rtx addr = force_reg (Pmode, XEXP (src, 0));
2855 src = replace_equiv_address (src, addr);
2858 machine_mode mode = GET_MODE (dest);
2859 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2860 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2861 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2862 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2863 emit_insn (gen_rtx_SET (dest, src));
2864 return true;
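/* For example, duplicating a DImode constant across a VNx16QI register
   is done with an ordinary move of a VNx2DI constant duplicate; only a
   full 128-bit (TImode) value takes the LD1RQ path above.  */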
2867 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2868 isn't a simple duplicate or series. */
2870 static void
2871 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2873 machine_mode mode = GET_MODE (src);
2874 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2875 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2876 gcc_assert (npatterns > 1);
2878 if (nelts_per_pattern == 1)
2880 /* The constant is a repeating sequence of at least two elements,
2881 where the repeating elements occupy no more than 128 bits.
2882 Get an integer representation of the replicated value. */
2883 scalar_int_mode int_mode;
2884 if (BYTES_BIG_ENDIAN)
2885 /* For now, always use LD1RQ to load the value on big-endian
2886 targets, since the handling of smaller integers includes a
2887 subreg that is semantically an element reverse. */
2888 int_mode = TImode;
2889 else
2891 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2892 gcc_assert (int_bits <= 128);
2893 int_mode = int_mode_for_size (int_bits, 0).require ();
2895 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2896 if (int_value
2897 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2898 return;
2901 /* Expand each pattern individually. */
2902 rtx_vector_builder builder;
2903 auto_vec<rtx, 16> vectors (npatterns);
2904 for (unsigned int i = 0; i < npatterns; ++i)
2906 builder.new_vector (mode, 1, nelts_per_pattern);
2907 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2908 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2909 vectors.quick_push (force_reg (mode, builder.build ()));
2912 /* Use permutes to interleave the separate vectors. */
2913 while (npatterns > 1)
2915 npatterns /= 2;
2916 for (unsigned int i = 0; i < npatterns; ++i)
2918 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2919 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2920 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2921 vectors[i] = tmp;
2924 gcc_assert (vectors[0] == dest);
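/* For example, with four patterns P0..P3 the loop above first forms
   ZIP1 (P0, P2) and ZIP1 (P1, P3) and then zips those two results into
   DEST, recreating the original element order
   P0[0], P1[0], P2[0], P3[0], P0[1], ...  */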
2927 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2928 is a pattern that can be used to set DEST to a replicated scalar
2929 element. */
2931 void
2932 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2933 rtx (*gen_vec_duplicate) (rtx, rtx))
2935 machine_mode mode = GET_MODE (dest);
2937 /* Check on what type of symbol it is. */
2938 scalar_int_mode int_mode;
2939 if ((GET_CODE (imm) == SYMBOL_REF
2940 || GET_CODE (imm) == LABEL_REF
2941 || GET_CODE (imm) == CONST
2942 || GET_CODE (imm) == CONST_POLY_INT)
2943 && is_a <scalar_int_mode> (mode, &int_mode))
2945 rtx mem;
2946 poly_int64 offset;
2947 HOST_WIDE_INT const_offset;
2948 enum aarch64_symbol_type sty;
2950 /* If we have (const (plus symbol offset)), separate out the offset
2951 before we start classifying the symbol. */
2952 rtx base = strip_offset (imm, &offset);
2954 /* We must always add an offset involving VL separately, rather than
2955 folding it into the relocation. */
2956 if (!offset.is_constant (&const_offset))
2958 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2959 emit_insn (gen_rtx_SET (dest, imm));
2960 else
2962 /* Do arithmetic on 32-bit values if the result is smaller
2963 than that. */
2964 if (partial_subreg_p (int_mode, SImode))
2966 /* It is invalid to do symbol calculations in modes
2967 narrower than SImode. */
2968 gcc_assert (base == const0_rtx);
2969 dest = gen_lowpart (SImode, dest);
2970 int_mode = SImode;
2972 if (base != const0_rtx)
2974 base = aarch64_force_temporary (int_mode, dest, base);
2975 aarch64_add_offset (int_mode, dest, base, offset,
2976 NULL_RTX, NULL_RTX, false);
2978 else
2979 aarch64_add_offset (int_mode, dest, base, offset,
2980 dest, NULL_RTX, false);
2982 return;
2985 sty = aarch64_classify_symbol (base, const_offset);
2986 switch (sty)
2988 case SYMBOL_FORCE_TO_MEM:
2989 if (const_offset != 0
2990 && targetm.cannot_force_const_mem (int_mode, imm))
2992 gcc_assert (can_create_pseudo_p ());
2993 base = aarch64_force_temporary (int_mode, dest, base);
2994 aarch64_add_offset (int_mode, dest, base, const_offset,
2995 NULL_RTX, NULL_RTX, false);
2996 return;
2999 mem = force_const_mem (ptr_mode, imm);
3000 gcc_assert (mem);
3002 /* If we aren't generating PC relative literals, then
3003 we need to expand the literal pool access carefully.
3004 This is something that needs to be done in a number
3005 of places, so could well live as a separate function. */
3006 if (!aarch64_pcrelative_literal_loads)
3008 gcc_assert (can_create_pseudo_p ());
3009 base = gen_reg_rtx (ptr_mode);
3010 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3011 if (ptr_mode != Pmode)
3012 base = convert_memory_address (Pmode, base);
3013 mem = gen_rtx_MEM (ptr_mode, base);
3016 if (int_mode != ptr_mode)
3017 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3019 emit_insn (gen_rtx_SET (dest, mem));
3021 return;
3023 case SYMBOL_SMALL_TLSGD:
3024 case SYMBOL_SMALL_TLSDESC:
3025 case SYMBOL_SMALL_TLSIE:
3026 case SYMBOL_SMALL_GOT_28K:
3027 case SYMBOL_SMALL_GOT_4G:
3028 case SYMBOL_TINY_GOT:
3029 case SYMBOL_TINY_TLSIE:
3030 if (const_offset != 0)
3032 gcc_assert (can_create_pseudo_p ());
3033 base = aarch64_force_temporary (int_mode, dest, base);
3034 aarch64_add_offset (int_mode, dest, base, const_offset,
3035 NULL_RTX, NULL_RTX, false);
3036 return;
3038 /* FALLTHRU */
3040 case SYMBOL_SMALL_ABSOLUTE:
3041 case SYMBOL_TINY_ABSOLUTE:
3042 case SYMBOL_TLSLE12:
3043 case SYMBOL_TLSLE24:
3044 case SYMBOL_TLSLE32:
3045 case SYMBOL_TLSLE48:
3046 aarch64_load_symref_appropriately (dest, imm, sty);
3047 return;
3049 default:
3050 gcc_unreachable ();
3054 if (!CONST_INT_P (imm))
3056 rtx base, step, value;
3057 if (GET_CODE (imm) == HIGH
3058 || aarch64_simd_valid_immediate (imm, NULL))
3059 emit_insn (gen_rtx_SET (dest, imm));
3060 else if (const_vec_series_p (imm, &base, &step))
3061 aarch64_expand_vec_series (dest, base, step);
3062 else if (const_vec_duplicate_p (imm, &value))
3064 /* If the constant is out of range of an SVE vector move,
3065 load it from memory if we can, otherwise move it into
3066 a register and use a DUP. */
3067 scalar_mode inner_mode = GET_MODE_INNER (mode);
3068 rtx op = force_const_mem (inner_mode, value);
3069 if (!op)
3070 op = force_reg (inner_mode, value);
3071 else if (!aarch64_sve_ld1r_operand_p (op))
3073 rtx addr = force_reg (Pmode, XEXP (op, 0));
3074 op = replace_equiv_address (op, addr);
3076 emit_insn (gen_vec_duplicate (dest, op));
3078 else if (GET_CODE (imm) == CONST_VECTOR
3079 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3080 aarch64_expand_sve_const_vector (dest, imm);
3081 else
3083 rtx mem = force_const_mem (mode, imm);
3084 gcc_assert (mem);
3085 emit_move_insn (dest, mem);
3088 return;
3091 aarch64_internal_mov_immediate (dest, imm, true,
3092 as_a <scalar_int_mode> (mode));
3095 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3096 that is known to contain PTRUE. */
3098 void
3099 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3101 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3102 gen_rtvec (2, pred, src),
3103 UNSPEC_MERGE_PTRUE)));
3106 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3107 operand is in memory. In this case we need to use the predicated LD1
3108 and ST1 instead of LDR and STR, both for correctness on big-endian
3109 targets and because LD1 and ST1 support a wider range of addressing modes.
3110 PRED_MODE is the mode of the predicate.
3112 See the comment at the head of aarch64-sve.md for details about the
3113 big-endian handling. */
3115 void
3116 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3118 machine_mode mode = GET_MODE (dest);
3119 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3120 if (!register_operand (src, mode)
3121 && !register_operand (dest, mode))
3123 rtx tmp = gen_reg_rtx (mode);
3124 if (MEM_P (src))
3125 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3126 else
3127 emit_move_insn (tmp, src);
3128 src = tmp;
3130 aarch64_emit_sve_pred_move (dest, ptrue, src);
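/* For example, a memory-to-memory SVE copy is expanded as a predicated
   load (LD1) into a fresh register followed by a predicated store (ST1)
   to the destination.  */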
3133 /* Called only on big-endian targets. See whether an SVE vector move
3134 from SRC to DEST is effectively a REV[BHW] instruction, because at
3135 least one operand is a subreg of an SVE vector that has wider or
3136 narrower elements. Return true and emit the instruction if so.
3138 For example:
3140 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3142 represents a VIEW_CONVERT between the following vectors, viewed
3143 in memory order:
3145 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3146 R1: { [0], [1], [2], [3], ... }
3148 The high part of lane X in R2 should therefore correspond to lane X*2
3149 of R1, but the register representations are:
3151 msb lsb
3152 R2: ...... [1].high [1].low [0].high [0].low
3153 R1: ...... [3] [2] [1] [0]
3155 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3156 We therefore need a reverse operation to swap the high and low values
3157 around.
3159 This is purely an optimization. Without it we would spill the
3160 subreg operand to the stack in one mode and reload it in the
3161 other mode, which has the same effect as the REV. */
3163 bool
3164 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3166 gcc_assert (BYTES_BIG_ENDIAN);
3167 if (GET_CODE (dest) == SUBREG)
3168 dest = SUBREG_REG (dest);
3169 if (GET_CODE (src) == SUBREG)
3170 src = SUBREG_REG (src);
3172 /* The optimization handles two single SVE REGs with different element
3173 sizes. */
3174 if (!REG_P (dest)
3175 || !REG_P (src)
3176 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3177 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3178 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3179 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3180 return false;
3182 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3183 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3184 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3185 UNSPEC_REV_SUBREG);
3186 emit_insn (gen_rtx_SET (dest, unspec));
3187 return true;
3190 /* Return a copy of X with mode MODE, without changing its other
3191 attributes. Unlike gen_lowpart, this doesn't care whether the
3192 mode change is valid. */
3194 static rtx
3195 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3197 if (GET_MODE (x) == mode)
3198 return x;
3200 x = shallow_copy_rtx (x);
3201 set_mode_and_regno (x, mode, REGNO (x));
3202 return x;
3205 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3206 operands. */
3208 void
3209 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3211 /* Decide which REV operation we need. The mode with narrower elements
3212 determines the mode of the operands and the mode with the wider
3213 elements determines the reverse width. */
3214 machine_mode mode_with_wider_elts = GET_MODE (dest);
3215 machine_mode mode_with_narrower_elts = GET_MODE (src);
3216 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3217 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3218 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3220 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3221 unsigned int unspec;
3222 if (wider_bytes == 8)
3223 unspec = UNSPEC_REV64;
3224 else if (wider_bytes == 4)
3225 unspec = UNSPEC_REV32;
3226 else if (wider_bytes == 2)
3227 unspec = UNSPEC_REV16;
3228 else
3229 gcc_unreachable ();
3230 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3232 /* Emit:
3234 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3235 UNSPEC_MERGE_PTRUE))
3237 with the appropriate modes. */
3238 ptrue = gen_lowpart (pred_mode, ptrue);
3239 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3240 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3241 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3242 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3243 UNSPEC_MERGE_PTRUE);
3244 emit_insn (gen_rtx_SET (dest, src));
3247 static bool
3248 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3249 tree exp ATTRIBUTE_UNUSED)
3251 /* Currently, always true. */
3252 return true;
3255 /* Implement TARGET_PASS_BY_REFERENCE. */
3257 static bool
3258 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3259 machine_mode mode,
3260 const_tree type,
3261 bool named ATTRIBUTE_UNUSED)
3263 HOST_WIDE_INT size;
3264 machine_mode dummymode;
3265 int nregs;
3267 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3268 if (mode == BLKmode && type)
3269 size = int_size_in_bytes (type);
3270 else
3271 /* No frontends can create types with variable-sized modes, so we
3272 shouldn't be asked to pass or return them. */
3273 size = GET_MODE_SIZE (mode).to_constant ();
3275 /* Aggregates are passed by reference based on their size. */
3276 if (type && AGGREGATE_TYPE_P (type))
3278 size = int_size_in_bytes (type);
3281 /* Variable sized arguments are always passed by reference. */
3282 if (size < 0)
3283 return true;
3285 /* Can this be a candidate to be passed in fp/simd register(s)? */
3286 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3287 &dummymode, &nregs,
3288 NULL))
3289 return false;
3291 /* Arguments which are variable sized or larger than 2 registers are
3292 passed by reference unless they are a homogeneous floating-point
3293 aggregate. */
3294 return size > 2 * UNITS_PER_WORD;
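/* For example, a plain 24-byte struct of three pointers is passed by
   reference, whereas a homogeneous aggregate of four doubles (32 bytes)
   is not, because it is caught by the SIMD/FP candidate check above.  */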
3297 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3298 static bool
3299 aarch64_return_in_msb (const_tree valtype)
3301 machine_mode dummy_mode;
3302 int dummy_int;
3304 /* Never happens in little-endian mode. */
3305 if (!BYTES_BIG_ENDIAN)
3306 return false;
3308 /* Only composite types smaller than or equal to 16 bytes can
3309 be potentially returned in registers. */
3310 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3311 || int_size_in_bytes (valtype) <= 0
3312 || int_size_in_bytes (valtype) > 16)
3313 return false;
3315 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3316 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3317 is always passed/returned in the least significant bits of fp/simd
3318 register(s). */
3319 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3320 &dummy_mode, &dummy_int, NULL))
3321 return false;
3323 return true;
3326 /* Implement TARGET_FUNCTION_VALUE.
3327 Define how to find the value returned by a function. */
3329 static rtx
3330 aarch64_function_value (const_tree type, const_tree func,
3331 bool outgoing ATTRIBUTE_UNUSED)
3333 machine_mode mode;
3334 int unsignedp;
3335 int count;
3336 machine_mode ag_mode;
3338 mode = TYPE_MODE (type);
3339 if (INTEGRAL_TYPE_P (type))
3340 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3342 if (aarch64_return_in_msb (type))
3344 HOST_WIDE_INT size = int_size_in_bytes (type);
3346 if (size % UNITS_PER_WORD != 0)
3348 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3349 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3353 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3354 &ag_mode, &count, NULL))
3356 if (!aarch64_composite_type_p (type, mode))
3358 gcc_assert (count == 1 && mode == ag_mode);
3359 return gen_rtx_REG (mode, V0_REGNUM);
3361 else
3363 int i;
3364 rtx par;
3366 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3367 for (i = 0; i < count; i++)
3369 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3370 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3371 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3372 XVECEXP (par, 0, i) = tmp;
3374 return par;
3377 else
3378 return gen_rtx_REG (mode, R0_REGNUM);
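/* For example, a homogeneous aggregate of two doubles is returned in d0
   and d1 via the PARALLEL built above, whereas a plain 16-byte struct is
   returned in x0 and x1.  */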
3381 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3382 Return true if REGNO is the number of a hard register in which the values
3383 of called function may come back. */
3385 static bool
3386 aarch64_function_value_regno_p (const unsigned int regno)
3388 /* A maximum of 16 bytes can be returned in the general registers. Examples
3389 of 16-byte return values are: 128-bit integers and 16-byte small
3390 structures (excluding homogeneous floating-point aggregates). */
3391 if (regno == R0_REGNUM || regno == R1_REGNUM)
3392 return true;
3394 /* Up to four fp/simd registers can return a function value, e.g. a
3395 homogeneous floating-point aggregate having four members. */
3396 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3397 return TARGET_FLOAT;
3399 return false;
3402 /* Implement TARGET_RETURN_IN_MEMORY.
3404 If the type T of the result of a function is such that
3405 void func (T arg)
3406 would require that arg be passed as a value in a register (or set of
3407 registers) according to the parameter passing rules, then the result
3408 is returned in the same registers as would be used for such an
3409 argument. */
3411 static bool
3412 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3414 HOST_WIDE_INT size;
3415 machine_mode ag_mode;
3416 int count;
3418 if (!AGGREGATE_TYPE_P (type)
3419 && TREE_CODE (type) != COMPLEX_TYPE
3420 && TREE_CODE (type) != VECTOR_TYPE)
3421 /* Simple scalar types are always returned in registers. */
3422 return false;
3424 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3425 type,
3426 &ag_mode,
3427 &count,
3428 NULL))
3429 return false;
3431 /* Types larger than 2 registers are returned in memory. */
3432 size = int_size_in_bytes (type);
3433 return (size < 0 || size > 2 * UNITS_PER_WORD);
3436 static bool
3437 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3438 const_tree type, int *nregs)
3440 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3441 return aarch64_vfp_is_call_or_return_candidate (mode,
3442 type,
3443 &pcum->aapcs_vfp_rmode,
3444 nregs,
3445 NULL);
3448 /* Given MODE and TYPE of a function argument, return the alignment in
3449 bits. The idea is to suppress any stronger alignment requested by
3450 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3451 This is a helper function for local use only. */
3453 static unsigned int
3454 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3456 if (!type)
3457 return GET_MODE_ALIGNMENT (mode);
3459 if (integer_zerop (TYPE_SIZE (type)))
3460 return 0;
3462 gcc_assert (TYPE_MODE (type) == mode);
3464 if (!AGGREGATE_TYPE_P (type))
3465 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3467 if (TREE_CODE (type) == ARRAY_TYPE)
3468 return TYPE_ALIGN (TREE_TYPE (type));
3470 unsigned int alignment = 0;
3471 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3472 if (TREE_CODE (field) == FIELD_DECL)
3473 alignment = std::max (alignment, DECL_ALIGN (field));
3475 return alignment;
3478 /* Layout a function argument according to the AAPCS64 rules. The rule
3479 numbers refer to the rule numbers in the AAPCS64. */
3481 static void
3482 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3483 const_tree type,
3484 bool named ATTRIBUTE_UNUSED)
3486 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3487 int ncrn, nvrn, nregs;
3488 bool allocate_ncrn, allocate_nvrn;
3489 HOST_WIDE_INT size;
3491 /* We need to do this once per argument. */
3492 if (pcum->aapcs_arg_processed)
3493 return;
3495 pcum->aapcs_arg_processed = true;
3497 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3498 if (type)
3499 size = int_size_in_bytes (type);
3500 else
3501 /* No frontends can create types with variable-sized modes, so we
3502 shouldn't be asked to pass or return them. */
3503 size = GET_MODE_SIZE (mode).to_constant ();
3504 size = ROUND_UP (size, UNITS_PER_WORD);
3506 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3507 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3508 mode,
3509 type,
3510 &nregs);
3512 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3513 The following code thus handles passing by SIMD/FP registers first. */
3515 nvrn = pcum->aapcs_nvrn;
3517 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3518 and homogeneous short-vector aggregates (HVA). */
3519 if (allocate_nvrn)
3521 if (!TARGET_FLOAT)
3522 aarch64_err_no_fpadvsimd (mode, "argument");
3524 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3526 pcum->aapcs_nextnvrn = nvrn + nregs;
3527 if (!aarch64_composite_type_p (type, mode))
3529 gcc_assert (nregs == 1);
3530 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3532 else
3534 rtx par;
3535 int i;
3536 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3537 for (i = 0; i < nregs; i++)
3539 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3540 V0_REGNUM + nvrn + i);
3541 rtx offset = gen_int_mode
3542 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3543 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3544 XVECEXP (par, 0, i) = tmp;
3546 pcum->aapcs_reg = par;
3548 return;
3550 else
3552 /* C.3 NSRN is set to 8. */
3553 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3554 goto on_stack;
3558 ncrn = pcum->aapcs_ncrn;
3559 nregs = size / UNITS_PER_WORD;
3561 /* C6 - C9, though the sign and zero extension semantics are
3562 handled elsewhere. This is the case where the argument fits
3563 entirely in general registers. */
3564 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3567 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3569 /* C.8 if the argument has an alignment of 16 then the NGRN is
3570 rounded up to the next even number. */
3571 if (nregs == 2
3572 && ncrn % 2
3573 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3574 comparison is there because for > 16 * BITS_PER_UNIT
3575 alignment nregs should be > 2 and therefore it should be
3576 passed by reference rather than value. */
3577 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3579 ++ncrn;
3580 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3583 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3584 A reg is still generated for it, but the caller should be smart
3585 enough not to use it. */
3586 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3587 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3588 else
3590 rtx par;
3591 int i;
3593 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3594 for (i = 0; i < nregs; i++)
3596 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3597 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3598 GEN_INT (i * UNITS_PER_WORD));
3599 XVECEXP (par, 0, i) = tmp;
3601 pcum->aapcs_reg = par;
3604 pcum->aapcs_nextncrn = ncrn + nregs;
3605 return;
3608 /* C.11 */
3609 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3611 /* The argument is passed on the stack; record the needed number of words for
3612 this argument and align the total size if necessary. */
3613 on_stack:
3614 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3616 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3617 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3618 16 / UNITS_PER_WORD);
3619 return;
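/* For example, under rule C.8 above, a 16-byte argument with 16-byte
   alignment that arrives when the next core register number is odd
   (say x1 is next) is moved up to start at an even register (x2/x3).  */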
3622 /* Implement TARGET_FUNCTION_ARG. */
3624 static rtx
3625 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3626 const_tree type, bool named)
3628 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3629 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3631 if (mode == VOIDmode)
3632 return NULL_RTX;
3634 aarch64_layout_arg (pcum_v, mode, type, named);
3635 return pcum->aapcs_reg;
3638 void
3639 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3640 const_tree fntype ATTRIBUTE_UNUSED,
3641 rtx libname ATTRIBUTE_UNUSED,
3642 const_tree fndecl ATTRIBUTE_UNUSED,
3643 unsigned n_named ATTRIBUTE_UNUSED)
3645 pcum->aapcs_ncrn = 0;
3646 pcum->aapcs_nvrn = 0;
3647 pcum->aapcs_nextncrn = 0;
3648 pcum->aapcs_nextnvrn = 0;
3649 pcum->pcs_variant = ARM_PCS_AAPCS64;
3650 pcum->aapcs_reg = NULL_RTX;
3651 pcum->aapcs_arg_processed = false;
3652 pcum->aapcs_stack_words = 0;
3653 pcum->aapcs_stack_size = 0;
3655 if (!TARGET_FLOAT
3656 && fndecl && TREE_PUBLIC (fndecl)
3657 && fntype && fntype != error_mark_node)
3659 const_tree type = TREE_TYPE (fntype);
3660 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3661 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3662 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3663 &mode, &nregs, NULL))
3664 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3666 return;
3669 static void
3670 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3671 machine_mode mode,
3672 const_tree type,
3673 bool named)
3675 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3676 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3678 aarch64_layout_arg (pcum_v, mode, type, named);
3679 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3680 != (pcum->aapcs_stack_words != 0));
3681 pcum->aapcs_arg_processed = false;
3682 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3683 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3684 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3685 pcum->aapcs_stack_words = 0;
3686 pcum->aapcs_reg = NULL_RTX;
3690 bool
3691 aarch64_function_arg_regno_p (unsigned regno)
3693 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3694 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3697 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3698 PARM_BOUNDARY bits of alignment, but will be given anything up
3699 to STACK_BOUNDARY bits if the type requires it. This makes sure
3700 that both before and after the layout of each argument, the Next
3701 Stacked Argument Address (NSAA) will have a minimum alignment of
3702 8 bytes. */
3704 static unsigned int
3705 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3707 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3708 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3711 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3713 static fixed_size_mode
3714 aarch64_get_reg_raw_mode (int regno)
3716 if (TARGET_SVE && FP_REGNUM_P (regno))
3717 /* Don't use the SVE part of the register for __builtin_apply and
3718 __builtin_return. The SVE registers aren't used by the normal PCS,
3719 so using them there would be a waste of time. The PCS extensions
3720 for SVE types are fundamentally incompatible with the
3721 __builtin_return/__builtin_apply interface. */
3722 return as_a <fixed_size_mode> (V16QImode);
3723 return default_get_reg_raw_mode (regno);
3726 /* Implement TARGET_FUNCTION_ARG_PADDING.
3728 Small aggregate types are placed in the lowest memory address.
3730 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3732 static pad_direction
3733 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3735 /* On little-endian targets, the least significant byte of every stack
3736 argument is passed at the lowest byte address of the stack slot. */
3737 if (!BYTES_BIG_ENDIAN)
3738 return PAD_UPWARD;
3740 /* Otherwise, integral, floating-point and pointer types are padded downward:
3741 the least significant byte of a stack argument is passed at the highest
3742 byte address of the stack slot. */
3743 if (type
3744 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3745 || POINTER_TYPE_P (type))
3746 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3747 return PAD_DOWNWARD;
3749 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3750 return PAD_UPWARD;
3753 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3755 It specifies padding for the last (possibly the only)
3756 element of a block move between registers and memory. Assuming
3757 the block is in memory, padding upward means that the last
3758 element is padded after its most significant byte, while with
3759 downward padding the last element is padded on its least
3760 significant byte side.
3762 Small aggregates and small complex types are always padded
3763 upwards.
3765 We don't need to worry about homogeneous floating-point or
3766 short-vector aggregates; their move is not affected by the
3767 padding direction determined here. Regardless of endianness,
3768 each element of such an aggregate is put in the least
3769 significant bits of a fp/simd register.
3771 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3772 register has useful data, and return the opposite if the most
3773 significant byte does. */
3775 bool
3776 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3777 bool first ATTRIBUTE_UNUSED)
3780 /* Small composite types are always padded upward. */
3781 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3783 HOST_WIDE_INT size;
3784 if (type)
3785 size = int_size_in_bytes (type);
3786 else
3787 /* No frontends can create types with variable-sized modes, so we
3788 shouldn't be asked to pass or return them. */
3789 size = GET_MODE_SIZE (mode).to_constant ();
3790 if (size < 2 * UNITS_PER_WORD)
3791 return true;
3794 /* Otherwise, use the default padding. */
3795 return !BYTES_BIG_ENDIAN;
3798 static scalar_int_mode
3799 aarch64_libgcc_cmp_return_mode (void)
3801 return SImode;
3804 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3806 /* We use the 12-bit shifted immediate arithmetic instructions, so values
3807 must be a multiple of (1 << 12), i.e. 4096. */
3808 #define ARITH_FACTOR 4096
3810 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3811 #error Cannot use simple address calculation for stack probing
3812 #endif
3814 /* The pair of scratch registers used for stack probing. */
3815 #define PROBE_STACK_FIRST_REG 9
3816 #define PROBE_STACK_SECOND_REG 10
3818 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3819 inclusive. These are offsets from the current stack pointer. */
3821 static void
3822 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3824 HOST_WIDE_INT size;
3825 if (!poly_size.is_constant (&size))
3827 sorry ("stack probes for SVE frames");
3828 return;
3831 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3833 /* See the same assertion on PROBE_INTERVAL above. */
3834 gcc_assert ((first % ARITH_FACTOR) == 0);
3836 /* See if we have a constant small number of probes to generate. If so,
3837 that's the easy case. */
3838 if (size <= PROBE_INTERVAL)
3840 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3842 emit_set_insn (reg1,
3843 plus_constant (Pmode,
3844 stack_pointer_rtx, -(first + base)));
3845 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3848 /* The run-time loop is made up of 8 insns in the generic case, while the
3849 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3850 else if (size <= 4 * PROBE_INTERVAL)
3852 HOST_WIDE_INT i, rem;
3854 emit_set_insn (reg1,
3855 plus_constant (Pmode,
3856 stack_pointer_rtx,
3857 -(first + PROBE_INTERVAL)));
3858 emit_stack_probe (reg1);
3860 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3861 it exceeds SIZE. If only two probes are needed, this will not
3862 generate any code. Then probe at FIRST + SIZE. */
3863 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3865 emit_set_insn (reg1,
3866 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3867 emit_stack_probe (reg1);
3870 rem = size - (i - PROBE_INTERVAL);
3871 if (rem > 256)
3873 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3875 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3876 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3878 else
3879 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3882 /* Otherwise, do the same as above, but in a loop. Note that we must be
3883 extra careful with variables wrapping around because we might be at
3884 the very top (or the very bottom) of the address space and we have
3885 to be able to handle this case properly; in particular, we use an
3886 equality test for the loop condition. */
3887 else
3889 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3891 /* Step 1: round SIZE to the previous multiple of the interval. */
3893 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3896 /* Step 2: compute initial and final value of the loop counter. */
3898 /* TEST_ADDR = SP + FIRST. */
3899 emit_set_insn (reg1,
3900 plus_constant (Pmode, stack_pointer_rtx, -first));
3902 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3903 HOST_WIDE_INT adjustment = - (first + rounded_size);
3904 if (! aarch64_uimm12_shift (adjustment))
3906 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3907 true, Pmode);
3908 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3910 else
3911 emit_set_insn (reg2,
3912 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3914 /* Step 3: the loop
3918 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3919 probe at TEST_ADDR
3921 while (TEST_ADDR != LAST_ADDR)
3923 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3924 until it is equal to ROUNDED_SIZE. */
3926 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3929 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3930 that SIZE is equal to ROUNDED_SIZE. */
3932 if (size != rounded_size)
3934 HOST_WIDE_INT rem = size - rounded_size;
3936 if (rem > 256)
3938 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3940 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3941 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3943 else
3944 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3948 /* Make sure nothing is scheduled before we are done. */
3949 emit_insn (gen_blockage ());
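/* Illustrative sketch, assuming the default PROBE_INTERVAL of 4096
   (STACK_CHECK_PROBE_INTERVAL_EXP == 12): for FIRST == 4096 and a constant
   POLY_SIZE of 4096 the single-probe case above emits, schematically,

	sub	x9, sp, #8192
	str	xzr, [x9]

   probing the word at SP - 8192, with x9 being PROBE_STACK_FIRST_REG.  */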
3952 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3953 absolute addresses. */
3955 const char *
3956 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3958 static int labelno = 0;
3959 char loop_lab[32];
3960 rtx xops[2];
3962 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3964 /* Loop. */
3965 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3967 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3968 xops[0] = reg1;
3969 xops[1] = GEN_INT (PROBE_INTERVAL);
3970 output_asm_insn ("sub\t%0, %0, %1", xops);
3972 /* Probe at TEST_ADDR. */
3973 output_asm_insn ("str\txzr, [%0]", xops);
3975 /* Test if TEST_ADDR == LAST_ADDR. */
3976 xops[1] = reg2;
3977 output_asm_insn ("cmp\t%0, %1", xops);
3979 /* Branch. */
3980 fputs ("\tb.ne\t", asm_out_file);
3981 assemble_name_raw (asm_out_file, loop_lab);
3982 fputc ('\n', asm_out_file);
3984 return "";
3987 /* Determine whether a frame chain needs to be generated. */
3988 static bool
3989 aarch64_needs_frame_chain (void)
3991 /* Force a frame chain for EH returns so the return address is at FP+8. */
3992 if (frame_pointer_needed || crtl->calls_eh_return)
3993 return true;
3995 /* A leaf function cannot have calls or write LR. */
3996 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
3998 /* Don't use a frame chain in leaf functions if leaf frame pointers
3999 are disabled. */
4000 if (flag_omit_leaf_frame_pointer && is_leaf)
4001 return false;
4003 return aarch64_use_frame_pointer;
4006 /* Mark the registers that need to be saved by the callee and calculate
4007 the size of the callee-saved registers area and frame record (both FP
4008 and LR may be omitted). */
4009 static void
4010 aarch64_layout_frame (void)
4012 HOST_WIDE_INT offset = 0;
4013 int regno, last_fp_reg = INVALID_REGNUM;
4015 if (reload_completed && cfun->machine->frame.laid_out)
4016 return;
4018 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4020 #define SLOT_NOT_REQUIRED (-2)
4021 #define SLOT_REQUIRED (-1)
4023 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4024 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4026 /* First mark all the registers that really need to be saved... */
4027 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4028 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4030 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4031 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4033 /* ... that includes the eh data registers (if needed)... */
4034 if (crtl->calls_eh_return)
4035 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4036 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4037 = SLOT_REQUIRED;
4039 /* ... and any callee saved register that dataflow says is live. */
4040 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4041 if (df_regs_ever_live_p (regno)
4042 && (regno == R30_REGNUM
4043 || !call_used_regs[regno]))
4044 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4046 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4047 if (df_regs_ever_live_p (regno)
4048 && !call_used_regs[regno])
4050 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4051 last_fp_reg = regno;
4054 if (cfun->machine->frame.emit_frame_chain)
4056 /* FP and LR are placed in the linkage record. */
4057 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4058 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4059 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4060 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4061 offset = 2 * UNITS_PER_WORD;
4064 /* Now assign stack slots for them. */
4065 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4066 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4068 cfun->machine->frame.reg_offset[regno] = offset;
4069 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4070 cfun->machine->frame.wb_candidate1 = regno;
4071 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4072 cfun->machine->frame.wb_candidate2 = regno;
4073 offset += UNITS_PER_WORD;
4076 HOST_WIDE_INT max_int_offset = offset;
4077 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4078 bool has_align_gap = offset != max_int_offset;
4080 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4081 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4083 /* If there is an alignment gap between integer and fp callee-saves,
4084 allocate the last fp register to it if possible. */
4085 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4087 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4088 break;
4091 cfun->machine->frame.reg_offset[regno] = offset;
4092 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4093 cfun->machine->frame.wb_candidate1 = regno;
4094 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4095 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4096 cfun->machine->frame.wb_candidate2 = regno;
4097 offset += UNITS_PER_WORD;
4100 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4102 cfun->machine->frame.saved_regs_size = offset;
4104 HOST_WIDE_INT varargs_and_saved_regs_size
4105 = offset + cfun->machine->frame.saved_varargs_size;
4107 cfun->machine->frame.hard_fp_offset
4108 = aligned_upper_bound (varargs_and_saved_regs_size
4109 + get_frame_size (),
4110 STACK_BOUNDARY / BITS_PER_UNIT);
4112 /* Both these values are already aligned. */
4113 gcc_assert (multiple_p (crtl->outgoing_args_size,
4114 STACK_BOUNDARY / BITS_PER_UNIT));
4115 cfun->machine->frame.frame_size
4116 = (cfun->machine->frame.hard_fp_offset
4117 + crtl->outgoing_args_size);
4119 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4121 cfun->machine->frame.initial_adjust = 0;
4122 cfun->machine->frame.final_adjust = 0;
4123 cfun->machine->frame.callee_adjust = 0;
4124 cfun->machine->frame.callee_offset = 0;
4126 HOST_WIDE_INT max_push_offset = 0;
4127 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4128 max_push_offset = 512;
4129 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4130 max_push_offset = 256;
4132 HOST_WIDE_INT const_size, const_fp_offset;
4133 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4134 && const_size < max_push_offset
4135 && known_eq (crtl->outgoing_args_size, 0))
4137 /* Simple, small frame with no outgoing arguments:
4138 stp reg1, reg2, [sp, -frame_size]!
4139 stp reg3, reg4, [sp, 16] */
4140 cfun->machine->frame.callee_adjust = const_size;
4142 else if (known_lt (crtl->outgoing_args_size
4143 + cfun->machine->frame.saved_regs_size, 512)
4144 && !(cfun->calls_alloca
4145 && known_lt (cfun->machine->frame.hard_fp_offset,
4146 max_push_offset)))
4148 /* Frame with small outgoing arguments:
4149 sub sp, sp, frame_size
4150 stp reg1, reg2, [sp, outgoing_args_size]
4151 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4152 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4153 cfun->machine->frame.callee_offset
4154 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4156 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4157 && const_fp_offset < max_push_offset)
4159 /* Frame with large outgoing arguments but a small local area:
4160 stp reg1, reg2, [sp, -hard_fp_offset]!
4161 stp reg3, reg4, [sp, 16]
4162 sub sp, sp, outgoing_args_size */
4163 cfun->machine->frame.callee_adjust = const_fp_offset;
4164 cfun->machine->frame.final_adjust
4165 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4167 else
4169 /* Frame with large local area and outgoing arguments using frame pointer:
4170 sub sp, sp, hard_fp_offset
4171 stp x29, x30, [sp, 0]
4172 add x29, sp, 0
4173 stp reg3, reg4, [sp, 16]
4174 sub sp, sp, outgoing_args_size */
4175 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4176 cfun->machine->frame.final_adjust
4177 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4180 cfun->machine->frame.laid_out = true;
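/* Worked example (illustrative only): a function that needs a frame chain,
   saves only x29/x30, has 32 bytes of locals and no outgoing arguments gets
   saved_regs_size == 16, hard_fp_offset == 48 and frame_size == 48.  Since
   48 < max_push_offset and there are no outgoing arguments, the first case
   applies and callee_adjust == 48, so the prologue can use a single

	stp	x29, x30, [sp, #-48]!

   followed by setting x29 to sp to establish the frame chain, with the
   locals living above the saved pair at [sp, 16] ... [sp, 47].  */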
4183 /* Return true if the register REGNO is saved on entry to
4184 the current function. */
4186 static bool
4187 aarch64_register_saved_on_entry (int regno)
4189 return cfun->machine->frame.reg_offset[regno] >= 0;
4192 /* Return the next register at or after REGNO, up to LIMIT, that the
4193 callee needs to save. */
4195 static unsigned
4196 aarch64_next_callee_save (unsigned regno, unsigned limit)
4198 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4199 regno ++;
4200 return regno;
4203 /* Push register number REGNO of mode MODE to the stack with write-back,
4204 adjusting the stack pointer by ADJUSTMENT. */
4206 static void
4207 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4208 HOST_WIDE_INT adjustment)
4210 rtx base_rtx = stack_pointer_rtx;
4211 rtx insn, reg, mem;
4213 reg = gen_rtx_REG (mode, regno);
4214 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4215 plus_constant (Pmode, base_rtx, -adjustment));
4216 mem = gen_frame_mem (mode, mem);
4218 insn = emit_move_insn (mem, reg);
4219 RTX_FRAME_RELATED_P (insn) = 1;
4222 /* Generate and return an instruction to store the pair of registers
4223 REG and REG2 of mode MODE to location BASE with write-back adjusting
4224 the stack location BASE by ADJUSTMENT. */
4226 static rtx
4227 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4228 HOST_WIDE_INT adjustment)
4230 switch (mode)
4232 case E_DImode:
4233 return gen_storewb_pairdi_di (base, base, reg, reg2,
4234 GEN_INT (-adjustment),
4235 GEN_INT (UNITS_PER_WORD - adjustment));
4236 case E_DFmode:
4237 return gen_storewb_pairdf_di (base, base, reg, reg2,
4238 GEN_INT (-adjustment),
4239 GEN_INT (UNITS_PER_WORD - adjustment));
4240 default:
4241 gcc_unreachable ();
4245 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4246 stack pointer by ADJUSTMENT. */
4248 static void
4249 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4251 rtx_insn *insn;
4252 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4254 if (regno2 == INVALID_REGNUM)
4255 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4257 rtx reg1 = gen_rtx_REG (mode, regno1);
4258 rtx reg2 = gen_rtx_REG (mode, regno2);
4260 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4261 reg2, adjustment));
4262 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4263 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4264 RTX_FRAME_RELATED_P (insn) = 1;
4267 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
4268 adjusting it by ADJUSTMENT afterwards. */
4270 static rtx
4271 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4272 HOST_WIDE_INT adjustment)
4274 switch (mode)
4276 case E_DImode:
4277 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4278 GEN_INT (UNITS_PER_WORD));
4279 case E_DFmode:
4280 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4281 GEN_INT (UNITS_PER_WORD));
4282 default:
4283 gcc_unreachable ();
4287 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4288 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4289 into CFI_OPS. */
4291 static void
4292 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4293 rtx *cfi_ops)
4295 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4296 rtx reg1 = gen_rtx_REG (mode, regno1);
4298 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4300 if (regno2 == INVALID_REGNUM)
4302 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4303 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4304 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4306 else
4308 rtx reg2 = gen_rtx_REG (mode, regno2);
4309 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4310 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4311 reg2, adjustment));
4315 /* Generate and return a store pair instruction of mode MODE to store
4316 register REG1 to MEM1 and register REG2 to MEM2. */
4318 static rtx
4319 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4320 rtx reg2)
4322 switch (mode)
4324 case E_DImode:
4325 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4327 case E_DFmode:
4328 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4330 default:
4331 gcc_unreachable ();
4335 /* Generate and return a load pair instruction of mode MODE to load register
4336 REG1 from MEM1 and register REG2 from MEM2. */
4338 static rtx
4339 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4340 rtx mem2)
4342 switch (mode)
4344 case E_DImode:
4345 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4347 case E_DFmode:
4348 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4350 default:
4351 gcc_unreachable ();
4355 /* Return TRUE if return address signing should be enabled for the current
4356 function, otherwise return FALSE. */
4358 bool
4359 aarch64_return_address_signing_enabled (void)
4361 /* This function should only be called after the frame is laid out. */
4362 gcc_assert (cfun->machine->frame.laid_out);
4364 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4365 if its LR is pushed onto the stack. */
4366 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4367 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4368 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4371 /* Emit code to save the callee-saved registers from register number START
4372 to LIMIT to the stack at the location starting at offset START_OFFSET,
4373 skipping any write-back candidates if SKIP_WB is true. */
4375 static void
4376 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4377 unsigned start, unsigned limit, bool skip_wb)
4379 rtx_insn *insn;
4380 unsigned regno;
4381 unsigned regno2;
4383 for (regno = aarch64_next_callee_save (start, limit);
4384 regno <= limit;
4385 regno = aarch64_next_callee_save (regno + 1, limit))
4387 rtx reg, mem;
4388 poly_int64 offset;
4390 if (skip_wb
4391 && (regno == cfun->machine->frame.wb_candidate1
4392 || regno == cfun->machine->frame.wb_candidate2))
4393 continue;
4395 if (cfun->machine->reg_is_wrapped_separately[regno])
4396 continue;
4398 reg = gen_rtx_REG (mode, regno);
4399 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4400 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4401 offset));
4403 regno2 = aarch64_next_callee_save (regno + 1, limit);
4405 if (regno2 <= limit
4406 && !cfun->machine->reg_is_wrapped_separately[regno2]
4407 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4408 == cfun->machine->frame.reg_offset[regno2]))
4411 rtx reg2 = gen_rtx_REG (mode, regno2);
4412 rtx mem2;
4414 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4415 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4416 offset));
4417 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4418 reg2));
4420 /* The first part of a frame-related parallel insn is
4421 always assumed to be relevant to the frame
4422 calculations; subsequent parts are only
4423 frame-related if explicitly marked. */
4424 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4425 regno = regno2;
4427 else
4428 insn = emit_move_insn (mem, reg);
4430 RTX_FRAME_RELATED_P (insn) = 1;
4434 /* Emit code to restore the callee registers of mode MODE from register
4435 number START up to and including LIMIT. Restore from the stack offset
4436 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4437 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4439 static void
4440 aarch64_restore_callee_saves (machine_mode mode,
4441 poly_int64 start_offset, unsigned start,
4442 unsigned limit, bool skip_wb, rtx *cfi_ops)
4444 rtx base_rtx = stack_pointer_rtx;
4445 unsigned regno;
4446 unsigned regno2;
4447 poly_int64 offset;
4449 for (regno = aarch64_next_callee_save (start, limit);
4450 regno <= limit;
4451 regno = aarch64_next_callee_save (regno + 1, limit))
4453 if (cfun->machine->reg_is_wrapped_separately[regno])
4454 continue;
4456 rtx reg, mem;
4458 if (skip_wb
4459 && (regno == cfun->machine->frame.wb_candidate1
4460 || regno == cfun->machine->frame.wb_candidate2))
4461 continue;
4463 reg = gen_rtx_REG (mode, regno);
4464 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4465 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4467 regno2 = aarch64_next_callee_save (regno + 1, limit);
4469 if (regno2 <= limit
4470 && !cfun->machine->reg_is_wrapped_separately[regno2]
4471 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4472 == cfun->machine->frame.reg_offset[regno2]))
4474 rtx reg2 = gen_rtx_REG (mode, regno2);
4475 rtx mem2;
4477 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4478 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4479 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4481 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4482 regno = regno2;
4484 else
4485 emit_move_insn (reg, mem);
4486 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4490 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4491 of MODE. */
4493 static inline bool
4494 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4496 HOST_WIDE_INT multiple;
4497 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4498 && IN_RANGE (multiple, -8, 7));
4501 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4502 of MODE. */
4504 static inline bool
4505 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4507 HOST_WIDE_INT multiple;
4508 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4509 && IN_RANGE (multiple, 0, 63));
4512 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4513 of MODE. */
4515 bool
4516 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4518 HOST_WIDE_INT multiple;
4519 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4520 && IN_RANGE (multiple, -64, 63));
4523 /* Return true if OFFSET is a signed 9-bit value. */
4525 static inline bool
4526 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4527 poly_int64 offset)
4529 HOST_WIDE_INT const_offset;
4530 return (offset.is_constant (&const_offset)
4531 && IN_RANGE (const_offset, -256, 255));
4534 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4535 of MODE. */
4537 static inline bool
4538 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4540 HOST_WIDE_INT multiple;
4541 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4542 && IN_RANGE (multiple, -256, 255));
4545 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4546 of MODE. */
4548 static inline bool
4549 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4551 HOST_WIDE_INT multiple;
4552 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4553 && IN_RANGE (multiple, 0, 4095));
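/* For example (illustrative): with MODE == DImode the 12-bit unsigned scaled
   form above accepts byte offsets 0, 8, 16, ..., 32760 (4095 * 8), matching
   the LDR/STR "[base, #imm]" addressing range for 8-byte accesses.  */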
4556 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4558 static sbitmap
4559 aarch64_get_separate_components (void)
4561 aarch64_layout_frame ();
4563 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4564 bitmap_clear (components);
4566 /* The registers we need saved to the frame. */
4567 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4568 if (aarch64_register_saved_on_entry (regno))
4570 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4571 if (!frame_pointer_needed)
4572 offset += cfun->machine->frame.frame_size
4573 - cfun->machine->frame.hard_fp_offset;
4574 /* Check that we can access the stack slot of the register with one
4575 direct load with no adjustments needed. */
4576 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4577 bitmap_set_bit (components, regno);
4580 /* Don't mess with the hard frame pointer. */
4581 if (frame_pointer_needed)
4582 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4584 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4585 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4586 /* If aarch64_layout_frame has chosen registers to store/restore with
4587 writeback don't interfere with them to avoid having to output explicit
4588 stack adjustment instructions. */
4589 if (reg2 != INVALID_REGNUM)
4590 bitmap_clear_bit (components, reg2);
4591 if (reg1 != INVALID_REGNUM)
4592 bitmap_clear_bit (components, reg1);
4594 bitmap_clear_bit (components, LR_REGNUM);
4595 bitmap_clear_bit (components, SP_REGNUM);
4597 return components;
4600 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4602 static sbitmap
4603 aarch64_components_for_bb (basic_block bb)
4605 bitmap in = DF_LIVE_IN (bb);
4606 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4607 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4609 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4610 bitmap_clear (components);
4612 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4613 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4614 if ((!call_used_regs[regno])
4615 && (bitmap_bit_p (in, regno)
4616 || bitmap_bit_p (gen, regno)
4617 || bitmap_bit_p (kill, regno)))
4619 unsigned regno2, offset, offset2;
4620 bitmap_set_bit (components, regno);
4622 /* If there is a callee-save at an adjacent offset, add it as well
4623 to increase the use of LDP/STP. */
4624 offset = cfun->machine->frame.reg_offset[regno];
4625 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4627 if (regno2 <= LAST_SAVED_REGNUM)
4629 offset2 = cfun->machine->frame.reg_offset[regno2];
4630 if ((offset & ~8) == (offset2 & ~8))
4631 bitmap_set_bit (components, regno2);
4635 return components;
4638 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4639 Nothing to do for aarch64. */
4641 static void
4642 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4646 /* Return the next set bit in BMP from START onwards. Return the total number
4647 of bits in BMP if no set bit is found at or after START. */
4649 static unsigned int
4650 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4652 unsigned int nbits = SBITMAP_SIZE (bmp);
4653 if (start == nbits)
4654 return start;
4656 gcc_assert (start < nbits);
4657 for (unsigned int i = start; i < nbits; i++)
4658 if (bitmap_bit_p (bmp, i))
4659 return i;
4661 return nbits;
4664 /* Do the work for aarch64_emit_prologue_components and
4665 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4666 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4667 for these components or the epilogue sequence. That is, it determines
4668 whether we should emit stores or loads and what kind of CFA notes to attach
4669 to the insns. Otherwise the logic for the two sequences is very
4670 similar. */
4672 static void
4673 aarch64_process_components (sbitmap components, bool prologue_p)
4675 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4676 ? HARD_FRAME_POINTER_REGNUM
4677 : STACK_POINTER_REGNUM);
4679 unsigned last_regno = SBITMAP_SIZE (components);
4680 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4681 rtx_insn *insn = NULL;
4683 while (regno != last_regno)
4685 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
4686 so DFmode for the vector registers is enough. */
4687 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4688 rtx reg = gen_rtx_REG (mode, regno);
4689 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4690 if (!frame_pointer_needed)
4691 offset += cfun->machine->frame.frame_size
4692 - cfun->machine->frame.hard_fp_offset;
4693 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4694 rtx mem = gen_frame_mem (mode, addr);
4696 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4697 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4698 /* No more registers to handle after REGNO.
4699 Emit a single save/restore and exit. */
4700 if (regno2 == last_regno)
4702 insn = emit_insn (set);
4703 RTX_FRAME_RELATED_P (insn) = 1;
4704 if (prologue_p)
4705 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4706 else
4707 add_reg_note (insn, REG_CFA_RESTORE, reg);
4708 break;
4711 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4712 /* The next register is not of the same class or its offset is not
4713 mergeable with the current one into a pair. */
4714 if (!satisfies_constraint_Ump (mem)
4715 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4716 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4717 GET_MODE_SIZE (mode)))
4719 insn = emit_insn (set);
4720 RTX_FRAME_RELATED_P (insn) = 1;
4721 if (prologue_p)
4722 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4723 else
4724 add_reg_note (insn, REG_CFA_RESTORE, reg);
4726 regno = regno2;
4727 continue;
4730 /* REGNO2 can be saved/restored in a pair with REGNO. */
4731 rtx reg2 = gen_rtx_REG (mode, regno2);
4732 if (!frame_pointer_needed)
4733 offset2 += cfun->machine->frame.frame_size
4734 - cfun->machine->frame.hard_fp_offset;
4735 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4736 rtx mem2 = gen_frame_mem (mode, addr2);
4737 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4738 : gen_rtx_SET (reg2, mem2);
4740 if (prologue_p)
4741 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4742 else
4743 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4745 RTX_FRAME_RELATED_P (insn) = 1;
4746 if (prologue_p)
4748 add_reg_note (insn, REG_CFA_OFFSET, set);
4749 add_reg_note (insn, REG_CFA_OFFSET, set2);
4751 else
4753 add_reg_note (insn, REG_CFA_RESTORE, reg);
4754 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4757 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4761 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4763 static void
4764 aarch64_emit_prologue_components (sbitmap components)
4766 aarch64_process_components (components, true);
4769 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4771 static void
4772 aarch64_emit_epilogue_components (sbitmap components)
4774 aarch64_process_components (components, false);
4777 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4779 static void
4780 aarch64_set_handled_components (sbitmap components)
4782 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4783 if (bitmap_bit_p (components, regno))
4784 cfun->machine->reg_is_wrapped_separately[regno] = true;
4787 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4788 is saved at BASE + OFFSET. */
4790 static void
4791 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4792 rtx base, poly_int64 offset)
4794 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4795 add_reg_note (insn, REG_CFA_EXPRESSION,
4796 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4799 /* AArch64 stack frames generated by this compiler look like:
4801 +-------------------------------+
4803 | incoming stack arguments |
4805 +-------------------------------+
4806 | | <-- incoming stack pointer (aligned)
4807 | callee-allocated save area |
4808 | for register varargs |
4810 +-------------------------------+
4811 | local variables | <-- frame_pointer_rtx
4813 +-------------------------------+
4814 | padding0 | \
4815 +-------------------------------+ |
4816 | callee-saved registers | | frame.saved_regs_size
4817 +-------------------------------+ |
4818 | LR' | |
4819 +-------------------------------+ |
4820 | FP' | / <- hard_frame_pointer_rtx (aligned)
4821 +-------------------------------+
4822 | dynamic allocation |
4823 +-------------------------------+
4824 | padding |
4825 +-------------------------------+
4826 | outgoing stack arguments | <-- arg_pointer
4828 +-------------------------------+
4829 | | <-- stack_pointer_rtx (aligned)
4831 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4832 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4833 unchanged. */
4835 /* Generate the prologue instructions for entry into a function.
4836 Establish the stack frame by decreasing the stack pointer with a
4837 properly calculated size and, if necessary, create a frame record
4838 filled with the values of LR and previous frame pointer. The
4839 current FP is also set up if it is in use. */
4841 void
4842 aarch64_expand_prologue (void)
4844 aarch64_layout_frame ();
4846 poly_int64 frame_size = cfun->machine->frame.frame_size;
4847 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4848 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4849 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4850 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4851 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4852 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4853 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4854 rtx_insn *insn;
4856 /* Sign return address for functions. */
4857 if (aarch64_return_address_signing_enabled ())
4859 insn = emit_insn (gen_pacisp ());
4860 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4861 RTX_FRAME_RELATED_P (insn) = 1;
4864 if (flag_stack_usage_info)
4865 current_function_static_stack_size = constant_lower_bound (frame_size);
4867 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4869 if (crtl->is_leaf && !cfun->calls_alloca)
4871 if (maybe_gt (frame_size, PROBE_INTERVAL)
4872 && maybe_gt (frame_size, get_stack_check_protect ()))
4873 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4874 (frame_size
4875 - get_stack_check_protect ()));
4877 else if (maybe_gt (frame_size, 0))
4878 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4881 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4882 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4884 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4886 if (callee_adjust != 0)
4887 aarch64_push_regs (reg1, reg2, callee_adjust);
4889 if (emit_frame_chain)
4891 poly_int64 reg_offset = callee_adjust;
4892 if (callee_adjust == 0)
4894 reg1 = R29_REGNUM;
4895 reg2 = R30_REGNUM;
4896 reg_offset = callee_offset;
4897 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4899 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4900 stack_pointer_rtx, callee_offset,
4901 ip1_rtx, ip0_rtx, frame_pointer_needed);
4902 if (frame_pointer_needed && !frame_size.is_constant ())
4904 /* Variable-sized frames need to describe the save slot
4905 address using DW_CFA_expression rather than DW_CFA_offset.
4906 This means that, without taking further action, the
4907 locations of the registers that we've already saved would
4908 remain based on the stack pointer even after we redefine
4909 the CFA based on the frame pointer. We therefore need new
4910 DW_CFA_expressions to re-express the save slots with addresses
4911 based on the frame pointer. */
4912 rtx_insn *insn = get_last_insn ();
4913 gcc_assert (RTX_FRAME_RELATED_P (insn));
4915 /* Add an explicit CFA definition if this was previously
4916 implicit. */
4917 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4919 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4920 callee_offset);
4921 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4922 gen_rtx_SET (hard_frame_pointer_rtx, src));
4925 /* Change the save slot expressions for the registers that
4926 we've already saved. */
4927 reg_offset -= callee_offset;
4928 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4929 reg_offset + UNITS_PER_WORD);
4930 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4931 reg_offset);
4933 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4936 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4937 callee_adjust != 0 || emit_frame_chain);
4938 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4939 callee_adjust != 0 || emit_frame_chain);
4940 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4943 /* Return TRUE if we can use a simple_return insn.
4945 This function checks whether the callee-saved stack is empty, which
4946 means no restore actions are needed. The pro_and_epilogue pass uses
4947 this to check whether the shrink-wrapping optimization is feasible. */
4949 bool
4950 aarch64_use_return_insn_p (void)
4952 if (!reload_completed)
4953 return false;
4955 if (crtl->profile)
4956 return false;
4958 aarch64_layout_frame ();
4960 return known_eq (cfun->machine->frame.frame_size, 0);
4963 /* Generate the epilogue instructions for returning from a function.
4964 This is almost exactly the reverse of the prologue sequence, except
4965 that we need to insert barriers to avoid scheduling loads that read
4966 from a deallocated stack, and we optimize the unwind records by
4967 emitting them all together if possible. */
4968 void
4969 aarch64_expand_epilogue (bool for_sibcall)
4971 aarch64_layout_frame ();
4973 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4974 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4975 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4976 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4977 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4978 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4979 rtx cfi_ops = NULL;
4980 rtx_insn *insn;
4981 /* A stack clash protection prologue may not have left IP0_REGNUM or
4982 IP1_REGNUM in a usable state. The same is true for allocations
4983 with an SVE component, since we then need both temporary registers
4984 for each allocation. */
4985 bool can_inherit_p = (initial_adjust.is_constant ()
4986 && final_adjust.is_constant ()
4987 && !flag_stack_clash_protection);
4989 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4990 bool need_barrier_p
4991 = maybe_ne (get_frame_size ()
4992 + cfun->machine->frame.saved_varargs_size, 0);
4994 /* Emit a barrier to prevent loads from a deallocated stack. */
4995 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4996 || cfun->calls_alloca
4997 || crtl->calls_eh_return)
4999 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5000 need_barrier_p = false;
5003 /* Restore the stack pointer from the frame pointer if it may not
5004 be the same as the stack pointer. */
5005 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5006 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5007 if (frame_pointer_needed
5008 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5009 /* If writeback is used when restoring callee-saves, the CFA
5010 is restored on the instruction doing the writeback. */
5011 aarch64_add_offset (Pmode, stack_pointer_rtx,
5012 hard_frame_pointer_rtx, -callee_offset,
5013 ip1_rtx, ip0_rtx, callee_adjust == 0);
5014 else
5015 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5016 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5018 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5019 callee_adjust != 0, &cfi_ops);
5020 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5021 callee_adjust != 0, &cfi_ops);
5023 if (need_barrier_p)
5024 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5026 if (callee_adjust != 0)
5027 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5029 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5031 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5032 insn = get_last_insn ();
5033 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5034 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5035 RTX_FRAME_RELATED_P (insn) = 1;
5036 cfi_ops = NULL;
5039 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5040 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5042 if (cfi_ops)
5044 /* Emit delayed restores and reset the CFA to be SP. */
5045 insn = get_last_insn ();
5046 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5047 REG_NOTES (insn) = cfi_ops;
5048 RTX_FRAME_RELATED_P (insn) = 1;
5051 /* We prefer to emit the combined return/authenticate instruction RETAA;
5052 however, there are three cases in which we must instead emit an explicit
5053 authentication instruction.
5055 1) Sibcalls don't return in a normal way, so if we're about to call one
5056 we must authenticate.
5058 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5059 generating code for !TARGET_ARMV8_3 we can't use it and must
5060 explicitly authenticate.
5062 3) On an eh_return path we make extra stack adjustments to update the
5063 canonical frame address to be the exception handler's CFA. We want
5064 to authenticate using the CFA of the function which calls eh_return. */
5066 if (aarch64_return_address_signing_enabled ()
5067 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5069 insn = emit_insn (gen_autisp ());
5070 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5071 RTX_FRAME_RELATED_P (insn) = 1;
5074 /* Stack adjustment for exception handler. */
5075 if (crtl->calls_eh_return)
5077 /* We need to unwind the stack by the offset computed by
5078 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5079 to be SP; letting the CFA move during this adjustment
5080 is just as correct as retaining the CFA from the body
5081 of the function. Therefore, do nothing special. */
5082 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5085 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5086 if (!for_sibcall)
5087 emit_jump_insn (ret_rtx);
5090 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5091 normally or return to a previous frame after unwinding.
5093 An EH return uses a single shared return sequence. The epilogue is
5094 exactly like a normal epilogue except that it has an extra input
5095 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5096 that must be applied after the frame has been destroyed. An extra label
5097 is inserted before the epilogue which initializes this register to zero,
5098 and this is the entry point for a normal return.
5100 An actual EH return updates the return address, initializes the stack
5101 adjustment and jumps directly into the epilogue (bypassing the zeroing
5102 of the adjustment). Since the return address is typically saved on the
5103 stack when a function makes a call, the saved LR must be updated outside
5104 the epilogue.
5106 This poses problems as the store is generated well before the epilogue,
5107 so the offset of LR is not known yet. Also optimizations will remove the
5108 store as it appears dead, even after the epilogue is generated (as the
5109 base or offset for loading LR is different in many cases).
5111 To avoid these problems this implementation forces the frame pointer
5112 in eh_return functions so that the location of LR is fixed and known early.
5113 It also marks the store volatile, so no optimization is permitted to
5114 remove the store. */
5115 rtx
5116 aarch64_eh_return_handler_rtx (void)
5118 rtx tmp = gen_frame_mem (Pmode,
5119 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5121 /* Mark the store volatile, so no optimization is permitted to remove it. */
5122 MEM_VOLATILE_P (tmp) = true;
5123 return tmp;
5126 /* Output code to add DELTA to the first argument, and then jump
5127 to FUNCTION. Used for C++ multiple inheritance. */
5128 static void
5129 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5130 HOST_WIDE_INT delta,
5131 HOST_WIDE_INT vcall_offset,
5132 tree function)
5134 /* The this pointer is always in x0. Note that this differs from
5135 Arm where the this pointer may be bumped to r1 if r0 is required
5136 to return a pointer to an aggregate. On AArch64 a result value
5137 pointer will be in x8. */
5138 int this_regno = R0_REGNUM;
5139 rtx this_rtx, temp0, temp1, addr, funexp;
5140 rtx_insn *insn;
5142 reload_completed = 1;
5143 emit_note (NOTE_INSN_PROLOGUE_END);
5145 this_rtx = gen_rtx_REG (Pmode, this_regno);
5146 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5147 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5149 if (vcall_offset == 0)
5150 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5151 else
5153 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5155 addr = this_rtx;
5156 if (delta != 0)
5158 if (delta >= -256 && delta < 256)
5159 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5160 plus_constant (Pmode, this_rtx, delta));
5161 else
5162 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5163 temp1, temp0, false);
5166 if (Pmode == ptr_mode)
5167 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5168 else
5169 aarch64_emit_move (temp0,
5170 gen_rtx_ZERO_EXTEND (Pmode,
5171 gen_rtx_MEM (ptr_mode, addr)));
5173 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5174 addr = plus_constant (Pmode, temp0, vcall_offset);
5175 else
5177 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5178 Pmode);
5179 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5182 if (Pmode == ptr_mode)
5183 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5184 else
5185 aarch64_emit_move (temp1,
5186 gen_rtx_SIGN_EXTEND (Pmode,
5187 gen_rtx_MEM (ptr_mode, addr)));
5189 emit_insn (gen_add2_insn (this_rtx, temp1));
5192 /* Generate a tail call to the target function. */
5193 if (!TREE_USED (function))
5195 assemble_external (function);
5196 TREE_USED (function) = 1;
5198 funexp = XEXP (DECL_RTL (function), 0);
5199 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5200 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5201 SIBLING_CALL_P (insn) = 1;
5203 insn = get_insns ();
5204 shorten_branches (insn);
5205 final_start_function (insn, file, 1);
5206 final (insn, file, 1);
5207 final_end_function ();
5209 /* Stop pretending to be a post-reload pass. */
5210 reload_completed = 0;
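/* Illustrative example (not from the original source): for a non-virtual
   thunk with DELTA == 8 and VCALL_OFFSET == 0 the code above effectively
   emits

	add	x0, x0, 8
	b	<function>

   i.e. it bumps the incoming 'this' pointer in x0 and tail-calls the
   target.  */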
5213 static bool
5214 aarch64_tls_referenced_p (rtx x)
5216 if (!TARGET_HAVE_TLS)
5217 return false;
5218 subrtx_iterator::array_type array;
5219 FOR_EACH_SUBRTX (iter, array, x, ALL)
5221 const_rtx x = *iter;
5222 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5223 return true;
5224 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5225 TLS offsets, not real symbol references. */
5226 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5227 iter.skip_subrtxes ();
5229 return false;
5233 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5234 a left shift of 0 or 12 bits. */
5235 bool
5236 aarch64_uimm12_shift (HOST_WIDE_INT val)
5238 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5239 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
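/* For instance (illustrative): 0xfff and 0xfff000 both satisfy the test
   above (12 bits at shift 0 or 12), so they can be used directly in
   ADD/SUB "#imm12 {, lsl #12}" forms, whereas 0x1001 does not because its
   set bits straddle the two 12-bit fields.  */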
5244 /* Return true if val is an immediate that can be loaded into a
5245 register by a MOVZ instruction. */
5246 static bool
5247 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5249 if (GET_MODE_SIZE (mode) > 4)
5251 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5252 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5253 return 1;
5255 else
5257 /* Ignore sign extension. */
5258 val &= (HOST_WIDE_INT) 0xffffffff;
5260 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5261 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5264 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5265 64-bit (DImode) integer. */
5267 static unsigned HOST_WIDE_INT
5268 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5270 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5271 while (size < 64)
5273 val &= (HOST_WIDE_INT_1U << size) - 1;
5274 val |= val << size;
5275 size *= 2;
5277 return val;
5280 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5282 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5284 0x0000000100000001ull,
5285 0x0001000100010001ull,
5286 0x0101010101010101ull,
5287 0x1111111111111111ull,
5288 0x5555555555555555ull,
5292 /* Return true if val is a valid bitmask immediate. */
5294 bool
5295 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5297 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5298 int bits;
5300 /* Check for a single sequence of one bits and return quickly if so.
5301 The special cases of all ones and all zeroes return false.
5302 val = aarch64_replicate_bitmask_imm (val_in, mode);
5303 tmp = val + (val & -val);
5305 if (tmp == (tmp & -tmp))
5306 return (val + 1) > 1;
5308 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5309 if (mode == SImode)
5310 val = (val << 32) | (val & 0xffffffff);
5312 /* Invert if the immediate doesn't start with a zero bit - this means we
5313 only need to search for sequences of one bits. */
5314 if (val & 1)
5315 val = ~val;
5317 /* Find the first set bit and set tmp to val with the first sequence of one
5318 bits removed. Return success if there is a single sequence of ones. */
5319 first_one = val & -val;
5320 tmp = val & (val + first_one);
5322 if (tmp == 0)
5323 return true;
5325 /* Find the next set bit and compute the difference in bit position. */
5326 next_one = tmp & -tmp;
5327 bits = clz_hwi (first_one) - clz_hwi (next_one);
5328 mask = val ^ tmp;
5330 /* Check the bit position difference is a power of 2, and that the first
5331 sequence of one bits fits within 'bits' bits. */
5332 if ((mask >> bits) != 0 || bits != (bits & -bits))
5333 return false;
5335 /* Check the sequence of one bits is repeated 64/bits times. */
5336 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
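/* Worked example (illustrative): 0x00ff00ff00ff00ffULL in DImode is
   accepted -- a run of eight ones repeated with period 16 -- while
   0x0000ff00ff000000ULL is rejected because its two runs of ones do not
   form a repeating pattern across the full 64 bits (the
   "(mask >> bits) != 0" check above fails).  */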
5339 /* Create a mask of ones covering the range from the lowest to the highest
5340 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
5342 unsigned HOST_WIDE_INT
5343 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5345 int lowest_bit_set = ctz_hwi (val_in);
5346 int highest_bit_set = floor_log2 (val_in);
5347 gcc_assert (val_in != 0);
5349 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5350 (HOST_WIDE_INT_1U << lowest_bit_set));
5353 /* Create a constant in which all bits outside the range from the lowest
5354 set bit to the highest set bit of VAL_IN are set to 1. */
5356 unsigned HOST_WIDE_INT
5357 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5359 return val_in | ~aarch64_and_split_imm1 (val_in);
5362 /* Return true if VAL_IN should be expanded as two 'and' bitmask immediates. */
5364 bool
5365 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5367 scalar_int_mode int_mode;
5368 if (!is_a <scalar_int_mode> (mode, &int_mode))
5369 return false;
5371 if (aarch64_bitmask_imm (val_in, int_mode))
5372 return false;
5374 if (aarch64_move_imm (val_in, int_mode))
5375 return false;
5377 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5379 return aarch64_bitmask_imm (imm2, int_mode);
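/* Worked example (illustrative): VAL_IN == 0x0000ff00ff000000 is neither a
   bitmask nor a MOV immediate, but aarch64_and_split_imm1 gives
   0x0000ffffff000000 and aarch64_and_split_imm2 gives 0xffffff00ffffffff,
   both valid bitmask immediates, and (x & imm1) & imm2 == x & VAL_IN, so
   the AND can be done as two AND-immediate instructions.  */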
5382 /* Return true if val is an immediate that can be loaded into a
5383 register in a single instruction. */
5384 bool
5385 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5387 scalar_int_mode int_mode;
5388 if (!is_a <scalar_int_mode> (mode, &int_mode))
5389 return false;
5391 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5392 return 1;
5393 return aarch64_bitmask_imm (val, int_mode);
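/* For instance (illustrative): 0x2a is a MOVZ immediate,
   0x5555555555555555 is a bitmask immediate (so a single MOV/ORR
   suffices), while 0x0000ff00ff000000 is neither and needs the
   multi-instruction sequence built by aarch64_internal_mov_immediate.  */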
5396 static bool
5397 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5399 rtx base, offset;
5401 if (GET_CODE (x) == HIGH)
5402 return true;
5404 /* There's no way to calculate VL-based values using relocations. */
5405 subrtx_iterator::array_type array;
5406 FOR_EACH_SUBRTX (iter, array, x, ALL)
5407 if (GET_CODE (*iter) == CONST_POLY_INT)
5408 return true;
5410 split_const (x, &base, &offset);
5411 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5413 if (aarch64_classify_symbol (base, INTVAL (offset))
5414 != SYMBOL_FORCE_TO_MEM)
5415 return true;
5416 else
5417 /* Avoid generating a 64-bit relocation in ILP32; leave it
5418 to aarch64_expand_mov_immediate to handle properly. */
5419 return mode != ptr_mode;
5422 return aarch64_tls_referenced_p (x);
5425 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5426 The expansion for a table switch is quite expensive due to the number
5427 of instructions, the table lookup and the hard-to-predict indirect jump.
5428 When optimizing for speed at -O3 and above, use the per-core tuning if
5429 set; otherwise use tables for more than 16 cases as a tradeoff between size and
5430 performance. When optimizing for size, use the default setting. */
5432 static unsigned int
5433 aarch64_case_values_threshold (void)
5435 /* Use the specified limit for the number of cases before using jump
5436 tables at higher optimization levels. */
5437 if (optimize > 2
5438 && selected_cpu->tune->max_case_values != 0)
5439 return selected_cpu->tune->max_case_values;
5440 else
5441 return optimize_size ? default_case_values_threshold () : 17;
5444 /* Return true if register REGNO is a valid index register.
5445 STRICT_P is true if REG_OK_STRICT is in effect. */
5447 bool
5448 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5450 if (!HARD_REGISTER_NUM_P (regno))
5452 if (!strict_p)
5453 return true;
5455 if (!reg_renumber)
5456 return false;
5458 regno = reg_renumber[regno];
5460 return GP_REGNUM_P (regno);
5463 /* Return true if register REGNO is a valid base register.
5464 STRICT_P is true if REG_OK_STRICT is in effect. */
5466 bool
5467 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5469 if (!HARD_REGISTER_NUM_P (regno))
5471 if (!strict_p)
5472 return true;
5474 if (!reg_renumber)
5475 return false;
5477 regno = reg_renumber[regno];
5480 /* The fake registers will be eliminated to either the stack or
5481 hard frame pointer, both of which are usually valid base registers.
5482 Reload deals with the cases where the eliminated form isn't valid. */
5483 return (GP_REGNUM_P (regno)
5484 || regno == SP_REGNUM
5485 || regno == FRAME_POINTER_REGNUM
5486 || regno == ARG_POINTER_REGNUM);
5489 /* Return true if X is a valid base register.
5490 STRICT_P is true if REG_OK_STRICT is in effect. */
5492 static bool
5493 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5495 if (!strict_p
5496 && GET_CODE (x) == SUBREG
5497 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5498 x = SUBREG_REG (x);
5500 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5503 /* Return true if address offset is a valid index. If it is, fill in INFO
5504 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5506 static bool
5507 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5508 machine_mode mode, bool strict_p)
5510 enum aarch64_address_type type;
5511 rtx index;
5512 int shift;
5514 /* (reg:P) */
5515 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5516 && GET_MODE (x) == Pmode)
5518 type = ADDRESS_REG_REG;
5519 index = x;
5520 shift = 0;
5522 /* (sign_extend:DI (reg:SI)) */
5523 else if ((GET_CODE (x) == SIGN_EXTEND
5524 || GET_CODE (x) == ZERO_EXTEND)
5525 && GET_MODE (x) == DImode
5526 && GET_MODE (XEXP (x, 0)) == SImode)
5528 type = (GET_CODE (x) == SIGN_EXTEND)
5529 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5530 index = XEXP (x, 0);
5531 shift = 0;
5533 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5534 else if (GET_CODE (x) == MULT
5535 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5536 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5537 && GET_MODE (XEXP (x, 0)) == DImode
5538 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5539 && CONST_INT_P (XEXP (x, 1)))
5541 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5542 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5543 index = XEXP (XEXP (x, 0), 0);
5544 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5546 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5547 else if (GET_CODE (x) == ASHIFT
5548 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5549 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5550 && GET_MODE (XEXP (x, 0)) == DImode
5551 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5552 && CONST_INT_P (XEXP (x, 1)))
5554 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5555 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5556 index = XEXP (XEXP (x, 0), 0);
5557 shift = INTVAL (XEXP (x, 1));
5559 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5560 else if ((GET_CODE (x) == SIGN_EXTRACT
5561 || GET_CODE (x) == ZERO_EXTRACT)
5562 && GET_MODE (x) == DImode
5563 && GET_CODE (XEXP (x, 0)) == MULT
5564 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5565 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5567 type = (GET_CODE (x) == SIGN_EXTRACT)
5568 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5569 index = XEXP (XEXP (x, 0), 0);
5570 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5571 if (INTVAL (XEXP (x, 1)) != 32 + shift
5572 || INTVAL (XEXP (x, 2)) != 0)
5573 shift = -1;
5575 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5576 (const_int 0xffffffff<<shift)) */
5577 else if (GET_CODE (x) == AND
5578 && GET_MODE (x) == DImode
5579 && GET_CODE (XEXP (x, 0)) == MULT
5580 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5581 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5582 && CONST_INT_P (XEXP (x, 1)))
5584 type = ADDRESS_REG_UXTW;
5585 index = XEXP (XEXP (x, 0), 0);
5586 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5587 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5588 shift = -1;
5590 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5591 else if ((GET_CODE (x) == SIGN_EXTRACT
5592 || GET_CODE (x) == ZERO_EXTRACT)
5593 && GET_MODE (x) == DImode
5594 && GET_CODE (XEXP (x, 0)) == ASHIFT
5595 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5596 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5598 type = (GET_CODE (x) == SIGN_EXTRACT)
5599 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5600 index = XEXP (XEXP (x, 0), 0);
5601 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5602 if (INTVAL (XEXP (x, 1)) != 32 + shift
5603 || INTVAL (XEXP (x, 2)) != 0)
5604 shift = -1;
5606 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5607 (const_int 0xffffffff<<shift)) */
5608 else if (GET_CODE (x) == AND
5609 && GET_MODE (x) == DImode
5610 && GET_CODE (XEXP (x, 0)) == ASHIFT
5611 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5612 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5613 && CONST_INT_P (XEXP (x, 1)))
5615 type = ADDRESS_REG_UXTW;
5616 index = XEXP (XEXP (x, 0), 0);
5617 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5618 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5619 shift = -1;
5621 /* (mult:P (reg:P) (const_int scale)) */
5622 else if (GET_CODE (x) == MULT
5623 && GET_MODE (x) == Pmode
5624 && GET_MODE (XEXP (x, 0)) == Pmode
5625 && CONST_INT_P (XEXP (x, 1)))
5627 type = ADDRESS_REG_REG;
5628 index = XEXP (x, 0);
5629 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5631 /* (ashift:P (reg:P) (const_int shift)) */
5632 else if (GET_CODE (x) == ASHIFT
5633 && GET_MODE (x) == Pmode
5634 && GET_MODE (XEXP (x, 0)) == Pmode
5635 && CONST_INT_P (XEXP (x, 1)))
5637 type = ADDRESS_REG_REG;
5638 index = XEXP (x, 0);
5639 shift = INTVAL (XEXP (x, 1));
5641 else
5642 return false;
5644 if (!strict_p
5645 && GET_CODE (index) == SUBREG
5646 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5647 index = SUBREG_REG (index);
5649 if (aarch64_sve_data_mode_p (mode))
5651 if (type != ADDRESS_REG_REG
5652 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5653 return false;
5655 else
5657 if (shift != 0
5658 && !(IN_RANGE (shift, 1, 3)
5659 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5660 return false;
5663 if (REG_P (index)
5664 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5666 info->type = type;
5667 info->offset = index;
5668 info->shift = shift;
5669 return true;
5672 return false;
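/* A few illustrative index forms that the function above accepts,
   assuming a 64-bit Pmode (register numbers are only examples):

     (reg:DI x1)                                -> ADDRESS_REG_REG, shift 0
     (ashift:DI (reg:DI x1) (const_int 3))      -> ADDRESS_REG_REG, shift 3
     (sign_extend:DI (reg:SI w1))               -> ADDRESS_REG_SXTW, shift 0
     (mult:DI (zero_extend:DI (reg:SI w1))
              (const_int 4))                    -> ADDRESS_REG_UXTW, shift 2

   These correspond to assembly index operands such as [x0, x1],
   [x0, x1, lsl #3], [x0, w1, sxtw] and [x0, w1, uxtw #2].  A non-zero
   shift is only accepted when it matches the access size (or the element
   size for SVE data modes).  */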
5675 /* Return true if MODE is one of the modes for which we
5676 support LDP/STP operations. */
5678 static bool
5679 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5681 return mode == SImode || mode == DImode
5682 || mode == SFmode || mode == DFmode
5683 || (aarch64_vector_mode_supported_p (mode)
5684 && known_eq (GET_MODE_SIZE (mode), 8));
5687 /* Return true if REGNO is a virtual pointer register, or an eliminable
5688 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5689 include stack_pointer or hard_frame_pointer. */
5690 static bool
5691 virt_or_elim_regno_p (unsigned regno)
5693 return ((regno >= FIRST_VIRTUAL_REGISTER
5694 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5695 || regno == FRAME_POINTER_REGNUM
5696 || regno == ARG_POINTER_REGNUM);
5699 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5700 If it is, fill in INFO appropriately. STRICT_P is true if
5701 REG_OK_STRICT is in effect. */
5703 static bool
5704 aarch64_classify_address (struct aarch64_address_info *info,
5705 rtx x, machine_mode mode, bool strict_p,
5706 aarch64_addr_query_type type = ADDR_QUERY_M)
5708 enum rtx_code code = GET_CODE (x);
5709 rtx op0, op1;
5710 poly_int64 offset;
5712 HOST_WIDE_INT const_size;
5714 /* On BE, we use load/store pair for all large int mode load/stores.
5715 TI/TFmode may also use a load/store pair. */
5716 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5717 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5718 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5719 || mode == TImode
5720 || mode == TFmode
5721 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5723 bool allow_reg_index_p = (!load_store_pair_p
5724 && (known_lt (GET_MODE_SIZE (mode), 16)
5725 || vec_flags == VEC_ADVSIMD
5726 || vec_flags == VEC_SVE_DATA));
5728 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5729 [Rn, #offset, MUL VL]. */
5730 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5731 && (code != REG && code != PLUS))
5732 return false;
5734 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5735 REG addressing. */
5736 if (advsimd_struct_p
5737 && !BYTES_BIG_ENDIAN
5738 && (code != POST_INC && code != REG))
5739 return false;
5741 gcc_checking_assert (GET_MODE (x) == VOIDmode
5742 || SCALAR_INT_MODE_P (GET_MODE (x)));
5744 switch (code)
5746 case REG:
5747 case SUBREG:
5748 info->type = ADDRESS_REG_IMM;
5749 info->base = x;
5750 info->offset = const0_rtx;
5751 info->const_offset = 0;
5752 return aarch64_base_register_rtx_p (x, strict_p);
5754 case PLUS:
5755 op0 = XEXP (x, 0);
5756 op1 = XEXP (x, 1);
5758 if (! strict_p
5759 && REG_P (op0)
5760 && virt_or_elim_regno_p (REGNO (op0))
5761 && poly_int_rtx_p (op1, &offset))
5763 info->type = ADDRESS_REG_IMM;
5764 info->base = op0;
5765 info->offset = op1;
5766 info->const_offset = offset;
5768 return true;
5771 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5772 && aarch64_base_register_rtx_p (op0, strict_p)
5773 && poly_int_rtx_p (op1, &offset))
5775 info->type = ADDRESS_REG_IMM;
5776 info->base = op0;
5777 info->offset = op1;
5778 info->const_offset = offset;
5780 /* TImode and TFmode values are allowed in both pairs of X
5781 registers and individual Q registers. The available
5782 address modes are:
5783 X,X: 7-bit signed scaled offset
5784 Q: 9-bit signed offset
5785 We conservatively require an offset representable in either mode.
5786 When performing the check for pairs of X registers i.e. LDP/STP
5787 pass down DImode since that is the natural size of the LDP/STP
5788 instruction memory accesses. */
5789 if (mode == TImode || mode == TFmode)
5790 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5791 && (offset_9bit_signed_unscaled_p (mode, offset)
5792 || offset_12bit_unsigned_scaled_p (mode, offset)));
5794 /* A 7-bit offset check because OImode will emit an ldp/stp
5795 instruction (only big endian will get here).
5796 For ldp/stp instructions, the offset is scaled for the size of a
5797 single element of the pair. */
5798 if (mode == OImode)
5799 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5801 /* Three 9/12-bit offset checks because CImode will emit three
5802 ldr/str instructions (only big endian will get here). */
5803 if (mode == CImode)
5804 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5805 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5806 || offset_12bit_unsigned_scaled_p (V16QImode,
5807 offset + 32)));
5809 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5810 instructions (only big endian will get here). */
5811 if (mode == XImode)
5812 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5813 && aarch64_offset_7bit_signed_scaled_p (TImode,
5814 offset + 32));
5816 /* Make "m" use the LD1 offset range for SVE data modes, so
5817 that pre-RTL optimizers like ivopts will work to that
5818 instead of the wider LDR/STR range. */
5819 if (vec_flags == VEC_SVE_DATA)
5820 return (type == ADDR_QUERY_M
5821 ? offset_4bit_signed_scaled_p (mode, offset)
5822 : offset_9bit_signed_scaled_p (mode, offset));
5824 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5826 poly_int64 end_offset = (offset
5827 + GET_MODE_SIZE (mode)
5828 - BYTES_PER_SVE_VECTOR);
5829 return (type == ADDR_QUERY_M
5830 ? offset_4bit_signed_scaled_p (mode, offset)
5831 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5832 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5833 end_offset)));
5836 if (vec_flags == VEC_SVE_PRED)
5837 return offset_9bit_signed_scaled_p (mode, offset);
5839 if (load_store_pair_p)
5840 return ((known_eq (GET_MODE_SIZE (mode), 4)
5841 || known_eq (GET_MODE_SIZE (mode), 8))
5842 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5843 else
5844 return (offset_9bit_signed_unscaled_p (mode, offset)
5845 || offset_12bit_unsigned_scaled_p (mode, offset));
5848 if (allow_reg_index_p)
5850 /* Look for base + (scaled/extended) index register. */
5851 if (aarch64_base_register_rtx_p (op0, strict_p)
5852 && aarch64_classify_index (info, op1, mode, strict_p))
5854 info->base = op0;
5855 return true;
5857 if (aarch64_base_register_rtx_p (op1, strict_p)
5858 && aarch64_classify_index (info, op0, mode, strict_p))
5860 info->base = op1;
5861 return true;
5865 return false;
5867 case POST_INC:
5868 case POST_DEC:
5869 case PRE_INC:
5870 case PRE_DEC:
5871 info->type = ADDRESS_REG_WB;
5872 info->base = XEXP (x, 0);
5873 info->offset = NULL_RTX;
5874 return aarch64_base_register_rtx_p (info->base, strict_p);
5876 case POST_MODIFY:
5877 case PRE_MODIFY:
5878 info->type = ADDRESS_REG_WB;
5879 info->base = XEXP (x, 0);
5880 if (GET_CODE (XEXP (x, 1)) == PLUS
5881 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5882 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5883 && aarch64_base_register_rtx_p (info->base, strict_p))
5885 info->offset = XEXP (XEXP (x, 1), 1);
5886 info->const_offset = offset;
5888 /* TImode and TFmode values are allowed in both pairs of X
5889 registers and individual Q registers. The available
5890 address modes are:
5891 X,X: 7-bit signed scaled offset
5892 Q: 9-bit signed offset
5893 We conservatively require an offset representable in either mode. */
5895 if (mode == TImode || mode == TFmode)
5896 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5897 && offset_9bit_signed_unscaled_p (mode, offset));
5899 if (load_store_pair_p)
5900 return ((known_eq (GET_MODE_SIZE (mode), 4)
5901 || known_eq (GET_MODE_SIZE (mode), 8))
5902 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5903 else
5904 return offset_9bit_signed_unscaled_p (mode, offset);
5906 return false;
5908 case CONST:
5909 case SYMBOL_REF:
5910 case LABEL_REF:
5911 /* load literal: pc-relative constant pool entry. Only supported
5912 for SI mode or larger. */
5913 info->type = ADDRESS_SYMBOLIC;
5915 if (!load_store_pair_p
5916 && GET_MODE_SIZE (mode).is_constant (&const_size)
5917 && const_size >= 4)
5919 rtx sym, addend;
5921 split_const (x, &sym, &addend);
5922 return ((GET_CODE (sym) == LABEL_REF
5923 || (GET_CODE (sym) == SYMBOL_REF
5924 && CONSTANT_POOL_ADDRESS_P (sym)
5925 && aarch64_pcrelative_literal_loads)));
5927 return false;
5929 case LO_SUM:
5930 info->type = ADDRESS_LO_SUM;
5931 info->base = XEXP (x, 0);
5932 info->offset = XEXP (x, 1);
5933 if (allow_reg_index_p
5934 && aarch64_base_register_rtx_p (info->base, strict_p))
5936 rtx sym, offs;
5937 split_const (info->offset, &sym, &offs);
5938 if (GET_CODE (sym) == SYMBOL_REF
5939 && (aarch64_classify_symbol (sym, INTVAL (offs))
5940 == SYMBOL_SMALL_ABSOLUTE))
5942 /* The symbol and offset must be aligned to the access size. */
5943 unsigned int align;
5945 if (CONSTANT_POOL_ADDRESS_P (sym))
5946 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5947 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5949 tree exp = SYMBOL_REF_DECL (sym);
5950 align = TYPE_ALIGN (TREE_TYPE (exp));
5951 align = aarch64_constant_alignment (exp, align);
5953 else if (SYMBOL_REF_DECL (sym))
5954 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5955 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5956 && SYMBOL_REF_BLOCK (sym) != NULL)
5957 align = SYMBOL_REF_BLOCK (sym)->alignment;
5958 else
5959 align = BITS_PER_UNIT;
5961 poly_int64 ref_size = GET_MODE_SIZE (mode);
5962 if (known_eq (ref_size, 0))
5963 ref_size = GET_MODE_SIZE (DImode);
5965 return (multiple_p (INTVAL (offs), ref_size)
5966 && multiple_p (align / BITS_PER_UNIT, ref_size));
5969 return false;
5971 default:
5972 return false;
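/* Putting the pieces together, some addresses this classification accepts
   for a DImode access (shown in assembly syntax rather than RTL; the
   register names and symbol are only examples):

     [x0]                 ADDRESS_REG_IMM, offset 0
     [x0, #256]           ADDRESS_REG_IMM, 12-bit unsigned scaled offset
     [x0, x1, lsl #3]     ADDRESS_REG_REG via aarch64_classify_index
     [x0, #:lo12:sym]     ADDRESS_LO_SUM
     [x0, #16]!           ADDRESS_REG_WB (PRE_MODIFY/PRE_INC forms)

   SVE data and predicate modes are deliberately restricted to [Rn] and
   the [Rn, #imm, mul vl] / [Rn, Rm, lsl #shift] forms described above.  */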
5976 /* Return true if the address X is valid for a PRFM instruction.
5977 STRICT_P is true if we should do strict checking with
5978 aarch64_classify_address. */
5980 bool
5981 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5983 struct aarch64_address_info addr;
5985 /* PRFM accepts the same addresses as DImode... */
5986 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5987 if (!res)
5988 return false;
5990 /* ... except writeback forms. */
5991 return addr.type != ADDRESS_REG_WB;
5994 bool
5995 aarch64_symbolic_address_p (rtx x)
5997 rtx offset;
5999 split_const (x, &x, &offset);
6000 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6003 /* Classify the base of symbolic expression X. */
6005 enum aarch64_symbol_type
6006 aarch64_classify_symbolic_expression (rtx x)
6008 rtx offset;
6010 split_const (x, &x, &offset);
6011 return aarch64_classify_symbol (x, INTVAL (offset));
6015 /* Return TRUE if X is a legitimate address for accessing memory in
6016 mode MODE. */
6017 static bool
6018 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6020 struct aarch64_address_info addr;
6022 return aarch64_classify_address (&addr, x, mode, strict_p);
6025 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6026 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6027 bool
6028 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6029 aarch64_addr_query_type type)
6031 struct aarch64_address_info addr;
6033 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6036 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6038 static bool
6039 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6040 poly_int64 orig_offset,
6041 machine_mode mode)
6043 HOST_WIDE_INT size;
6044 if (GET_MODE_SIZE (mode).is_constant (&size))
6046 HOST_WIDE_INT const_offset, second_offset;
6048 /* A general SVE offset is A * VQ + B. Remove the A component from
6049 coefficient 0 in order to get the constant B. */
6050 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6052 /* Split an out-of-range address displacement into a base and
6053 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6054 range otherwise to increase opportunities for sharing the base
6055 address of different sizes. Unaligned accesses use the signed
6056 9-bit range, TImode/TFmode use the intersection of signed
6057 scaled 7-bit and signed 9-bit offset. */
6058 if (mode == TImode || mode == TFmode)
6059 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6060 else if ((const_offset & (size - 1)) != 0)
6061 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6062 else
6063 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6065 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6066 return false;
6068 /* Split the offset into second_offset and the rest. */
6069 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6070 *offset2 = gen_int_mode (second_offset, Pmode);
6071 return true;
6073 else
6075 /* Get the mode we should use as the basis of the range. For structure
6076 modes this is the mode of one vector. */
6077 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6078 machine_mode step_mode
6079 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6081 /* Get the "mul vl" multiplier we'd like to use. */
6082 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6083 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6084 if (vec_flags & VEC_SVE_DATA)
6085 /* LDR supports a 9-bit range, but the move patterns for
6086 structure modes require all vectors to be in range of the
6087 same base. The simplest way of accommodating that while still
6088 promoting reuse of anchor points between different modes is
6089 to use an 8-bit range unconditionally. */
6090 vnum = ((vnum + 128) & 255) - 128;
6091 else
6092 /* Predicates are only handled singly, so we might as well use
6093 the full range. */
6094 vnum = ((vnum + 256) & 511) - 256;
6095 if (vnum == 0)
6096 return false;
6098 /* Convert the "mul vl" multiplier into a byte offset. */
6099 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6100 if (known_eq (second_offset, orig_offset))
6101 return false;
6103 /* Split the offset into second_offset and the rest. */
6104 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6105 *offset2 = gen_int_mode (second_offset, Pmode);
6106 return true;
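/* Worked example for the constant-size path above: a DImode access at
   X + 0x20018 has const_offset 0x20018, which is a multiple of the access
   size, so second_offset = 0x20018 & 0x3ffc = 0x18.  The displacement is
   therefore split into OFFSET1 = 0x20000, which is added to the base and
   is a good candidate for sharing between accesses, and OFFSET2 = 0x18,
   which fits the 12-bit scaled range.  */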
6110 /* Return the binary representation of floating point constant VALUE in INTVAL.
6111 If the value cannot be converted, return false without setting INTVAL.
6112 The conversion is done in the given MODE. */
6113 bool
6114 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6117 /* We make a general exception for 0. */
6118 if (aarch64_float_const_zero_rtx_p (value))
6120 *intval = 0;
6121 return true;
6124 scalar_float_mode mode;
6125 if (GET_CODE (value) != CONST_DOUBLE
6126 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6127 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6128 /* Only support up to DF mode. */
6129 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6130 return false;
6132 unsigned HOST_WIDE_INT ival = 0;
6134 long res[2];
6135 real_to_target (res,
6136 CONST_DOUBLE_REAL_VALUE (value),
6137 REAL_MODE_FORMAT (mode));
6139 if (mode == DFmode)
6141 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6142 ival = zext_hwi (res[order], 32);
6143 ival |= (zext_hwi (res[1 - order], 32) << 32);
6145 else
6146 ival = zext_hwi (res[0], 32);
6148 *intval = ival;
6149 return true;
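/* For instance, the DFmode constant 1.0 is returned as the bit pattern
   0x3ff0000000000000 and the SFmode constant 1.0 as 0x3f800000; the
   special-cased zero simply yields 0.  */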
6152 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6153 single MOV(+MOVK) followed by an FMOV. */
6154 bool
6155 aarch64_float_const_rtx_p (rtx x)
6157 machine_mode mode = GET_MODE (x);
6158 if (mode == VOIDmode)
6159 return false;
6161 /* Determine whether it's cheaper to write float constants as
6162 mov/movk pairs rather than as adrp/ldr literal loads. */
6163 unsigned HOST_WIDE_INT ival;
6165 if (GET_CODE (x) == CONST_DOUBLE
6166 && SCALAR_FLOAT_MODE_P (mode)
6167 && aarch64_reinterpret_float_as_int (x, &ival))
6169 scalar_int_mode imode = (mode == HFmode
6170 ? SImode
6171 : int_mode_for_mode (mode).require ());
6172 int num_instr = aarch64_internal_mov_immediate
6173 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6174 return num_instr < 3;
6177 return false;
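/* Continuing the example above: DFmode 1.0 reinterprets to
   0x3ff0000000000000, which a single MOVZ (#0x3ff0, lsl #48) can
   materialize.  That is fewer than three integer instructions, so the
   MOV+FMOV sequence is considered cheaper than an ADRP/LDR literal load
   and the function returns true.  */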
6180 /* Return TRUE if rtx X is the immediate constant 0.0. */
6181 bool
6182 aarch64_float_const_zero_rtx_p (rtx x)
6184 if (GET_MODE (x) == VOIDmode)
6185 return false;
6187 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6188 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6189 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6192 /* Return TRUE if rtx X is an immediate constant that fits in a single
6193 MOVI immediate operation. */
6194 bool
6195 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6197 if (!TARGET_SIMD)
6198 return false;
6200 machine_mode vmode;
6201 scalar_int_mode imode;
6202 unsigned HOST_WIDE_INT ival;
6204 if (GET_CODE (x) == CONST_DOUBLE
6205 && SCALAR_FLOAT_MODE_P (mode))
6207 if (!aarch64_reinterpret_float_as_int (x, &ival))
6208 return false;
6210 /* We make a general exception for 0. */
6211 if (aarch64_float_const_zero_rtx_p (x))
6212 return true;
6214 imode = int_mode_for_mode (mode).require ();
6216 else if (GET_CODE (x) == CONST_INT
6217 && is_a <scalar_int_mode> (mode, &imode))
6218 ival = INTVAL (x);
6219 else
6220 return false;
6222 /* Use a 64-bit mode for everything except DI/DF mode, where we use
6223 a 128-bit vector mode. */
6224 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6226 vmode = aarch64_simd_container_mode (imode, width);
6227 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6229 return aarch64_simd_valid_immediate (v_op, NULL);
6233 /* Return the fixed registers used for condition codes. */
6235 static bool
6236 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6238 *p1 = CC_REGNUM;
6239 *p2 = INVALID_REGNUM;
6240 return true;
6243 /* This function is used by the call expanders of the machine description.
6244 RESULT is the register in which the result is returned. It's NULL for
6245 "call" and "sibcall".
6246 MEM is the location of the function call.
6247 SIBCALL indicates whether this function call is a normal call or a sibling call.
6248 A different pattern is generated accordingly. */
6250 void
6251 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6253 rtx call, callee, tmp;
6254 rtvec vec;
6255 machine_mode mode;
6257 gcc_assert (MEM_P (mem));
6258 callee = XEXP (mem, 0);
6259 mode = GET_MODE (callee);
6260 gcc_assert (mode == Pmode);
6262 /* Decide if we should generate indirect calls by loading the
6263 address of the callee into a register before performing
6264 the branch-and-link. */
6265 if (SYMBOL_REF_P (callee)
6266 ? (aarch64_is_long_call_p (callee)
6267 || aarch64_is_noplt_call_p (callee))
6268 : !REG_P (callee))
6269 XEXP (mem, 0) = force_reg (mode, callee);
6271 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6273 if (result != NULL_RTX)
6274 call = gen_rtx_SET (result, call);
6276 if (sibcall)
6277 tmp = ret_rtx;
6278 else
6279 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6281 vec = gen_rtvec (2, call, tmp);
6282 call = gen_rtx_PARALLEL (VOIDmode, vec);
6284 aarch64_emit_call_insn (call);
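/* The emitted pattern is a PARALLEL of the call and either a (return)
   (for sibcalls) or a clobber of LR.  For a normal call to a hypothetical
   function "foo" returning in x0 it is roughly:

     (parallel [(set (reg:DI x0)
                     (call (mem (symbol_ref "foo")) (const_int 0)))
                (clobber (reg:DI LR_REGNUM))])

   Long calls, and -fno-plt calls to external symbols, force the callee
   address into a register first, so a BLR is emitted instead of a BL.  */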
6287 /* Emit call insn with PAT and do aarch64-specific handling. */
6289 void
6290 aarch64_emit_call_insn (rtx pat)
6292 rtx insn = emit_call_insn (pat);
6294 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6295 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6296 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6299 machine_mode
6300 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6302 /* All floating point compares return CCFP if it is an equality
6303 comparison, and CCFPE otherwise. */
6304 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6306 switch (code)
6308 case EQ:
6309 case NE:
6310 case UNORDERED:
6311 case ORDERED:
6312 case UNLT:
6313 case UNLE:
6314 case UNGT:
6315 case UNGE:
6316 case UNEQ:
6317 return CCFPmode;
6319 case LT:
6320 case LE:
6321 case GT:
6322 case GE:
6323 case LTGT:
6324 return CCFPEmode;
6326 default:
6327 gcc_unreachable ();
6331 /* Equality comparisons of short modes against zero can be performed
6332 using the TST instruction with the appropriate bitmask. */
6333 if (y == const0_rtx && REG_P (x)
6334 && (code == EQ || code == NE)
6335 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6336 return CC_NZmode;
6338 /* Similarly, comparisons of zero_extends from shorter modes can
6339 be performed using an ANDS with an immediate mask. */
6340 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6341 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6342 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6343 && (code == EQ || code == NE))
6344 return CC_NZmode;
6346 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6347 && y == const0_rtx
6348 && (code == EQ || code == NE || code == LT || code == GE)
6349 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6350 || GET_CODE (x) == NEG
6351 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6352 && CONST_INT_P (XEXP (x, 2)))))
6353 return CC_NZmode;
6355 /* A compare with a shifted operand. Because of canonicalization,
6356 the comparison will have to be swapped when we emit the assembly
6357 code. */
6358 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6359 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6360 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6361 || GET_CODE (x) == LSHIFTRT
6362 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6363 return CC_SWPmode;
6365 /* Similarly for a negated operand, but we can only do this for
6366 equalities. */
6367 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6368 && (REG_P (y) || GET_CODE (y) == SUBREG)
6369 && (code == EQ || code == NE)
6370 && GET_CODE (x) == NEG)
6371 return CC_Zmode;
6373 /* A test for unsigned overflow. */
6374 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6375 && code == NE
6376 && GET_CODE (x) == PLUS
6377 && GET_CODE (y) == ZERO_EXTEND)
6378 return CC_Cmode;
6380 /* For everything else, return CCmode. */
6381 return CCmode;
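/* Two examples of the special CC modes chosen above: comparing
   (ashift:SI (reg) (const_int 2)) against a register yields CC_SWPmode,
   because the shifted operand must become the second operand of the
   eventual CMP and the condition has to be swapped when it is printed;
   and an equality test of a QImode or HImode register against zero
   yields CC_NZmode, allowing the comparison to be implemented as a TST
   with an immediate mask.  */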
6384 static int
6385 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6387 int
6388 aarch64_get_condition_code (rtx x)
6390 machine_mode mode = GET_MODE (XEXP (x, 0));
6391 enum rtx_code comp_code = GET_CODE (x);
6393 if (GET_MODE_CLASS (mode) != MODE_CC)
6394 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6395 return aarch64_get_condition_code_1 (mode, comp_code);
6398 static int
6399 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6401 switch (mode)
6403 case E_CCFPmode:
6404 case E_CCFPEmode:
6405 switch (comp_code)
6407 case GE: return AARCH64_GE;
6408 case GT: return AARCH64_GT;
6409 case LE: return AARCH64_LS;
6410 case LT: return AARCH64_MI;
6411 case NE: return AARCH64_NE;
6412 case EQ: return AARCH64_EQ;
6413 case ORDERED: return AARCH64_VC;
6414 case UNORDERED: return AARCH64_VS;
6415 case UNLT: return AARCH64_LT;
6416 case UNLE: return AARCH64_LE;
6417 case UNGT: return AARCH64_HI;
6418 case UNGE: return AARCH64_PL;
6419 default: return -1;
6421 break;
6423 case E_CCmode:
6424 switch (comp_code)
6426 case NE: return AARCH64_NE;
6427 case EQ: return AARCH64_EQ;
6428 case GE: return AARCH64_GE;
6429 case GT: return AARCH64_GT;
6430 case LE: return AARCH64_LE;
6431 case LT: return AARCH64_LT;
6432 case GEU: return AARCH64_CS;
6433 case GTU: return AARCH64_HI;
6434 case LEU: return AARCH64_LS;
6435 case LTU: return AARCH64_CC;
6436 default: return -1;
6438 break;
6440 case E_CC_SWPmode:
6441 switch (comp_code)
6443 case NE: return AARCH64_NE;
6444 case EQ: return AARCH64_EQ;
6445 case GE: return AARCH64_LE;
6446 case GT: return AARCH64_LT;
6447 case LE: return AARCH64_GE;
6448 case LT: return AARCH64_GT;
6449 case GEU: return AARCH64_LS;
6450 case GTU: return AARCH64_CC;
6451 case LEU: return AARCH64_CS;
6452 case LTU: return AARCH64_HI;
6453 default: return -1;
6455 break;
6457 case E_CC_NZmode:
6458 switch (comp_code)
6460 case NE: return AARCH64_NE;
6461 case EQ: return AARCH64_EQ;
6462 case GE: return AARCH64_PL;
6463 case LT: return AARCH64_MI;
6464 default: return -1;
6466 break;
6468 case E_CC_Zmode:
6469 switch (comp_code)
6471 case NE: return AARCH64_NE;
6472 case EQ: return AARCH64_EQ;
6473 default: return -1;
6475 break;
6477 case E_CC_Cmode:
6478 switch (comp_code)
6480 case NE: return AARCH64_CS;
6481 case EQ: return AARCH64_CC;
6482 default: return -1;
6484 break;
6486 default:
6487 return -1;
6490 return -1;
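/* For example, (gt (reg:CC_SWP CC_REGNUM) (const_int 0)) maps to
   AARCH64_LT: because the operands of the original comparison were
   swapped when CC_SWPmode was selected, the condition printed on the
   branch must be the swapped one, not the inverted one.  */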
6493 bool
6494 aarch64_const_vec_all_same_in_range_p (rtx x,
6495 HOST_WIDE_INT minval,
6496 HOST_WIDE_INT maxval)
6498 rtx elt;
6499 return (const_vec_duplicate_p (x, &elt)
6500 && CONST_INT_P (elt)
6501 && IN_RANGE (INTVAL (elt), minval, maxval));
6504 bool
6505 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6507 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6510 /* Return true if VEC is a constant in which every element is in the range
6511 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6513 static bool
6514 aarch64_const_vec_all_in_range_p (rtx vec,
6515 HOST_WIDE_INT minval,
6516 HOST_WIDE_INT maxval)
6518 if (GET_CODE (vec) != CONST_VECTOR
6519 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6520 return false;
6522 int nunits;
6523 if (!CONST_VECTOR_STEPPED_P (vec))
6524 nunits = const_vector_encoded_nelts (vec);
6525 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6526 return false;
6528 for (int i = 0; i < nunits; i++)
6530 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6531 if (!CONST_INT_P (vec_elem)
6532 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6533 return false;
6535 return true;
6538 /* N Z C V. */
6539 #define AARCH64_CC_V 1
6540 #define AARCH64_CC_C (1 << 1)
6541 #define AARCH64_CC_Z (1 << 2)
6542 #define AARCH64_CC_N (1 << 3)
6544 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6545 static const int aarch64_nzcv_codes[] =
6547 0, /* EQ, Z == 1. */
6548 AARCH64_CC_Z, /* NE, Z == 0. */
6549 0, /* CS, C == 1. */
6550 AARCH64_CC_C, /* CC, C == 0. */
6551 0, /* MI, N == 1. */
6552 AARCH64_CC_N, /* PL, N == 0. */
6553 0, /* VS, V == 1. */
6554 AARCH64_CC_V, /* VC, V == 0. */
6555 0, /* HI, C == 1 && Z == 0. */
6556 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6557 AARCH64_CC_V, /* GE, N == V. */
6558 0, /* LT, N != V. */
6559 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6560 0, /* LE, !(Z == 0 && N == V). */
6561 0, /* AL, Any. */
6562 0 /* NV, Any. */
6565 /* Print floating-point vector immediate operand X to F, negating it
6566 first if NEGATE is true. Return true on success, false if it isn't
6567 a constant we can handle. */
6569 static bool
6570 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6572 rtx elt;
6574 if (!const_vec_duplicate_p (x, &elt))
6575 return false;
6577 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6578 if (negate)
6579 r = real_value_negate (&r);
6581 /* We only handle the SVE single-bit immediates here. */
6582 if (real_equal (&r, &dconst0))
6583 asm_fprintf (f, "0.0");
6584 else if (real_equal (&r, &dconst1))
6585 asm_fprintf (f, "1.0");
6586 else if (real_equal (&r, &dconsthalf))
6587 asm_fprintf (f, "0.5");
6588 else
6589 return false;
6591 return true;
6594 /* Return the equivalent letter for size. */
6595 static char
6596 sizetochar (int size)
6598 switch (size)
6600 case 64: return 'd';
6601 case 32: return 's';
6602 case 16: return 'h';
6603 case 8 : return 'b';
6604 default: gcc_unreachable ();
6608 /* Print operand X to file F in a target specific manner according to CODE.
6609 The acceptable formatting commands given by CODE are:
6610 'c': An integer or symbol address without a preceding #
6611 sign.
6612 'C': Take the duplicated element in a vector constant
6613 and print it in hex.
6614 'D': Take the duplicated element in a vector constant
6615 and print it as an unsigned integer, in decimal.
6616 'e': Print the sign/zero-extend size as a character 8->b,
6617 16->h, 32->w.
6618 'p': Prints N such that 2^N == X (X must be a power of 2 and
6619 a const_int).
6620 'P': Print the number of non-zero bits in X (a const_int).
6621 'H': Print the higher numbered register of a pair (TImode)
6622 of regs.
6623 'm': Print a condition (eq, ne, etc).
6624 'M': Same as 'm', but invert condition.
6625 'N': Take the duplicated element in a vector constant
6626 and print the negative of it in decimal.
6627 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6628 'S/T/U/V': Print a FP/SIMD register name for a register list.
6629 The register printed is the FP/SIMD register name
6630 of X + 0/1/2/3 for S/T/U/V.
6631 'R': Print a scalar FP/SIMD register name + 1.
6632 'X': Print bottom 16 bits of integer constant in hex.
6633 'w/x': Print a general register name or the zero register
6634 (32-bit or 64-bit).
6635 '0': Print a normal operand, if it's a general register,
6636 then we assume DImode.
6637 'k': Print NZCV for conditional compare instructions.
6638 'A': Output address constant representing the first
6639 argument of X, specifying a relocation offset
6640 if appropriate.
6641 'L': Output constant address specified by X
6642 with a relocation offset if appropriate.
6643 'G': Prints address of X, specifying a PC relative
6644 relocation mode if appropriate.
6645 'y': Output address of LDP or STP - this is used for
6646 some LDP/STPs which don't use a PARALLEL in their
6647 pattern (so the mode needs to be adjusted).
6648 'z': Output address of a typical LDP or STP. */
6650 static void
6651 aarch64_print_operand (FILE *f, rtx x, int code)
6653 rtx elt;
6654 switch (code)
6656 case 'c':
6657 switch (GET_CODE (x))
6659 case CONST_INT:
6660 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6661 break;
6663 case SYMBOL_REF:
6664 output_addr_const (f, x);
6665 break;
6667 case CONST:
6668 if (GET_CODE (XEXP (x, 0)) == PLUS
6669 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6671 output_addr_const (f, x);
6672 break;
6674 /* Fall through. */
6676 default:
6677 output_operand_lossage ("unsupported operand for code '%c'", code);
6679 break;
6681 case 'e':
6683 int n;
6685 if (!CONST_INT_P (x)
6686 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6688 output_operand_lossage ("invalid operand for '%%%c'", code);
6689 return;
6692 switch (n)
6694 case 3:
6695 fputc ('b', f);
6696 break;
6697 case 4:
6698 fputc ('h', f);
6699 break;
6700 case 5:
6701 fputc ('w', f);
6702 break;
6703 default:
6704 output_operand_lossage ("invalid operand for '%%%c'", code);
6705 return;
6708 break;
6710 case 'p':
6712 int n;
6714 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6716 output_operand_lossage ("invalid operand for '%%%c'", code);
6717 return;
6720 asm_fprintf (f, "%d", n);
6722 break;
6724 case 'P':
6725 if (!CONST_INT_P (x))
6727 output_operand_lossage ("invalid operand for '%%%c'", code);
6728 return;
6731 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6732 break;
6734 case 'H':
6735 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6737 output_operand_lossage ("invalid operand for '%%%c'", code);
6738 return;
6741 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6742 break;
6744 case 'M':
6745 case 'm':
6747 int cond_code;
6748 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6749 if (x == const_true_rtx)
6751 if (code == 'M')
6752 fputs ("nv", f);
6753 return;
6756 if (!COMPARISON_P (x))
6758 output_operand_lossage ("invalid operand for '%%%c'", code);
6759 return;
6762 cond_code = aarch64_get_condition_code (x);
6763 gcc_assert (cond_code >= 0);
6764 if (code == 'M')
6765 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6766 fputs (aarch64_condition_codes[cond_code], f);
6768 break;
6770 case 'N':
6771 if (!const_vec_duplicate_p (x, &elt))
6773 output_operand_lossage ("invalid vector constant");
6774 return;
6777 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6778 asm_fprintf (f, "%wd", -INTVAL (elt));
6779 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6780 && aarch64_print_vector_float_operand (f, x, true))
6782 else
6784 output_operand_lossage ("invalid vector constant");
6785 return;
6787 break;
6789 case 'b':
6790 case 'h':
6791 case 's':
6792 case 'd':
6793 case 'q':
6794 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6796 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6797 return;
6799 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6800 break;
6802 case 'S':
6803 case 'T':
6804 case 'U':
6805 case 'V':
6806 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6808 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6809 return;
6811 asm_fprintf (f, "%c%d",
6812 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6813 REGNO (x) - V0_REGNUM + (code - 'S'));
6814 break;
6816 case 'R':
6817 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6819 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6820 return;
6822 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6823 break;
6825 case 'X':
6826 if (!CONST_INT_P (x))
6828 output_operand_lossage ("invalid operand for '%%%c'", code);
6829 return;
6831 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6832 break;
6834 case 'C':
6836 /* Print a replicated constant in hex. */
6837 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6839 output_operand_lossage ("invalid operand for '%%%c'", code);
6840 return;
6842 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6843 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6845 break;
6847 case 'D':
6849 /* Print a replicated constant in decimal, treating it as
6850 unsigned. */
6851 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6853 output_operand_lossage ("invalid operand for '%%%c'", code);
6854 return;
6856 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6857 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6859 break;
6861 case 'w':
6862 case 'x':
6863 if (x == const0_rtx
6864 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6866 asm_fprintf (f, "%czr", code);
6867 break;
6870 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6872 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6873 break;
6876 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6878 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6879 break;
6882 /* Fall through */
6884 case 0:
6885 if (x == NULL)
6887 output_operand_lossage ("missing operand");
6888 return;
6891 switch (GET_CODE (x))
6893 case REG:
6894 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6896 if (REG_NREGS (x) == 1)
6897 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6898 else
6900 char suffix
6901 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6902 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6903 REGNO (x) - V0_REGNUM, suffix,
6904 END_REGNO (x) - V0_REGNUM - 1, suffix);
6907 else
6908 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6909 break;
6911 case MEM:
6912 output_address (GET_MODE (x), XEXP (x, 0));
6913 break;
6915 case LABEL_REF:
6916 case SYMBOL_REF:
6917 output_addr_const (asm_out_file, x);
6918 break;
6920 case CONST_INT:
6921 asm_fprintf (f, "%wd", INTVAL (x));
6922 break;
6924 case CONST:
6925 if (!VECTOR_MODE_P (GET_MODE (x)))
6927 output_addr_const (asm_out_file, x);
6928 break;
6930 /* fall through */
6932 case CONST_VECTOR:
6933 if (!const_vec_duplicate_p (x, &elt))
6935 output_operand_lossage ("invalid vector constant");
6936 return;
6939 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6940 asm_fprintf (f, "%wd", INTVAL (elt));
6941 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6942 && aarch64_print_vector_float_operand (f, x, false))
6944 else
6946 output_operand_lossage ("invalid vector constant");
6947 return;
6949 break;
6951 case CONST_DOUBLE:
6952 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6953 be getting CONST_DOUBLEs holding integers. */
6954 gcc_assert (GET_MODE (x) != VOIDmode);
6955 if (aarch64_float_const_zero_rtx_p (x))
6957 fputc ('0', f);
6958 break;
6960 else if (aarch64_float_const_representable_p (x))
6962 #define buf_size 20
6963 char float_buf[buf_size] = {'\0'};
6964 real_to_decimal_for_mode (float_buf,
6965 CONST_DOUBLE_REAL_VALUE (x),
6966 buf_size, buf_size,
6967 1, GET_MODE (x));
6968 asm_fprintf (asm_out_file, "%s", float_buf);
6969 break;
6970 #undef buf_size
6972 output_operand_lossage ("invalid constant");
6973 return;
6974 default:
6975 output_operand_lossage ("invalid operand");
6976 return;
6978 break;
6980 case 'A':
6981 if (GET_CODE (x) == HIGH)
6982 x = XEXP (x, 0);
6984 switch (aarch64_classify_symbolic_expression (x))
6986 case SYMBOL_SMALL_GOT_4G:
6987 asm_fprintf (asm_out_file, ":got:");
6988 break;
6990 case SYMBOL_SMALL_TLSGD:
6991 asm_fprintf (asm_out_file, ":tlsgd:");
6992 break;
6994 case SYMBOL_SMALL_TLSDESC:
6995 asm_fprintf (asm_out_file, ":tlsdesc:");
6996 break;
6998 case SYMBOL_SMALL_TLSIE:
6999 asm_fprintf (asm_out_file, ":gottprel:");
7000 break;
7002 case SYMBOL_TLSLE24:
7003 asm_fprintf (asm_out_file, ":tprel:");
7004 break;
7006 case SYMBOL_TINY_GOT:
7007 gcc_unreachable ();
7008 break;
7010 default:
7011 break;
7013 output_addr_const (asm_out_file, x);
7014 break;
7016 case 'L':
7017 switch (aarch64_classify_symbolic_expression (x))
7019 case SYMBOL_SMALL_GOT_4G:
7020 asm_fprintf (asm_out_file, ":lo12:");
7021 break;
7023 case SYMBOL_SMALL_TLSGD:
7024 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7025 break;
7027 case SYMBOL_SMALL_TLSDESC:
7028 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7029 break;
7031 case SYMBOL_SMALL_TLSIE:
7032 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7033 break;
7035 case SYMBOL_TLSLE12:
7036 asm_fprintf (asm_out_file, ":tprel_lo12:");
7037 break;
7039 case SYMBOL_TLSLE24:
7040 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7041 break;
7043 case SYMBOL_TINY_GOT:
7044 asm_fprintf (asm_out_file, ":got:");
7045 break;
7047 case SYMBOL_TINY_TLSIE:
7048 asm_fprintf (asm_out_file, ":gottprel:");
7049 break;
7051 default:
7052 break;
7054 output_addr_const (asm_out_file, x);
7055 break;
7057 case 'G':
7058 switch (aarch64_classify_symbolic_expression (x))
7060 case SYMBOL_TLSLE24:
7061 asm_fprintf (asm_out_file, ":tprel_hi12:");
7062 break;
7063 default:
7064 break;
7066 output_addr_const (asm_out_file, x);
7067 break;
7069 case 'k':
7071 HOST_WIDE_INT cond_code;
7073 if (!CONST_INT_P (x))
7075 output_operand_lossage ("invalid operand for '%%%c'", code);
7076 return;
7079 cond_code = INTVAL (x);
7080 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7081 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7083 break;
7085 case 'y':
7086 case 'z':
7088 machine_mode mode = GET_MODE (x);
7090 if (GET_CODE (x) != MEM
7091 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7093 output_operand_lossage ("invalid operand for '%%%c'", code);
7094 return;
7097 if (code == 'y')
7098 /* LDP/STP which uses a single double-width memory operand.
7099 Adjust the mode to appear like a typical LDP/STP.
7100 Currently this is supported for 16-byte accesses only. */
7101 mode = DFmode;
7103 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7104 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7106 break;
7108 default:
7109 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7110 return;
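/* A few examples of the operand codes above in action (the register
   numbers and constants are only illustrative): %w0 on (reg:SI 0) prints
   "w0" and on const0_rtx prints "wzr"; %x0 on the stack pointer prints
   "sp" (while %w0 would print "wsp"); %C on a vector duplicating
   (const_int 17) prints "0x11"; and %d0 on an FP/SIMD register prints
   the "d" view of that register, e.g. "d5".  */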
7114 /* Print address 'x' of a memory access with mode 'mode'.
7115 TYPE is the aarch64_addr_query_type context required by aarch64_classify_address:
7116 e.g. ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for an LDP/STP. */
7117 static bool
7118 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7119 aarch64_addr_query_type type)
7121 struct aarch64_address_info addr;
7122 unsigned int size;
7124 /* Check all addresses are Pmode - including ILP32. */
7125 if (GET_MODE (x) != Pmode)
7126 output_operand_lossage ("invalid address mode");
7128 if (aarch64_classify_address (&addr, x, mode, true, type))
7129 switch (addr.type)
7131 case ADDRESS_REG_IMM:
7132 if (known_eq (addr.const_offset, 0))
7133 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7134 else if (aarch64_sve_data_mode_p (mode))
7136 HOST_WIDE_INT vnum
7137 = exact_div (addr.const_offset,
7138 BYTES_PER_SVE_VECTOR).to_constant ();
7139 asm_fprintf (f, "[%s, #%wd, mul vl]",
7140 reg_names[REGNO (addr.base)], vnum);
7142 else if (aarch64_sve_pred_mode_p (mode))
7144 HOST_WIDE_INT vnum
7145 = exact_div (addr.const_offset,
7146 BYTES_PER_SVE_PRED).to_constant ();
7147 asm_fprintf (f, "[%s, #%wd, mul vl]",
7148 reg_names[REGNO (addr.base)], vnum);
7150 else
7151 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7152 INTVAL (addr.offset));
7153 return true;
7155 case ADDRESS_REG_REG:
7156 if (addr.shift == 0)
7157 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7158 reg_names [REGNO (addr.offset)]);
7159 else
7160 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7161 reg_names [REGNO (addr.offset)], addr.shift);
7162 return true;
7164 case ADDRESS_REG_UXTW:
7165 if (addr.shift == 0)
7166 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7167 REGNO (addr.offset) - R0_REGNUM);
7168 else
7169 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7170 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7171 return true;
7173 case ADDRESS_REG_SXTW:
7174 if (addr.shift == 0)
7175 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7176 REGNO (addr.offset) - R0_REGNUM);
7177 else
7178 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7179 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7180 return true;
7182 case ADDRESS_REG_WB:
7183 /* Writeback is only supported for fixed-width modes. */
7184 size = GET_MODE_SIZE (mode).to_constant ();
7185 switch (GET_CODE (x))
7187 case PRE_INC:
7188 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7189 return true;
7190 case POST_INC:
7191 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7192 return true;
7193 case PRE_DEC:
7194 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7195 return true;
7196 case POST_DEC:
7197 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7198 return true;
7199 case PRE_MODIFY:
7200 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7201 INTVAL (addr.offset));
7202 return true;
7203 case POST_MODIFY:
7204 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7205 INTVAL (addr.offset));
7206 return true;
7207 default:
7208 break;
7210 break;
7212 case ADDRESS_LO_SUM:
7213 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7214 output_addr_const (f, addr.offset);
7215 asm_fprintf (f, "]");
7216 return true;
7218 case ADDRESS_SYMBOLIC:
7219 output_addr_const (f, x);
7220 return true;
7223 return false;
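/* Typical strings produced for the classifications above (register
   numbers, offsets and the symbol name are only examples):

     ADDRESS_REG_IMM            [x0]            [x0, 16]
     ADDRESS_REG_IMM (SVE)      [x0, #2, mul vl]
     ADDRESS_REG_REG            [x0, x1, lsl 3]
     ADDRESS_REG_SXTW           [x0, w1, sxtw 2]
     ADDRESS_REG_WB (PRE_INC)   [x0, 8]!
     ADDRESS_LO_SUM             [x0, #:lo12:sym]

   The exact offsets and shifts that are accepted depend on the access
   mode, as checked by aarch64_classify_address.  */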
7226 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7227 static bool
7228 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7230 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7233 /* Print address 'x' of a memory access with mode 'mode'. */
7234 static void
7235 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7237 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7238 output_addr_const (f, x);
7241 bool
7242 aarch64_label_mentioned_p (rtx x)
7244 const char *fmt;
7245 int i;
7247 if (GET_CODE (x) == LABEL_REF)
7248 return true;
7250 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7251 referencing instruction, but they are constant offsets, not
7252 symbols. */
7253 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7254 return false;
7256 fmt = GET_RTX_FORMAT (GET_CODE (x));
7257 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7259 if (fmt[i] == 'E')
7261 int j;
7263 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7264 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7265 return 1;
7267 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7268 return 1;
7271 return 0;
7274 /* Implement REGNO_REG_CLASS. */
7276 enum reg_class
7277 aarch64_regno_regclass (unsigned regno)
7279 if (GP_REGNUM_P (regno))
7280 return GENERAL_REGS;
7282 if (regno == SP_REGNUM)
7283 return STACK_REG;
7285 if (regno == FRAME_POINTER_REGNUM
7286 || regno == ARG_POINTER_REGNUM)
7287 return POINTER_REGS;
7289 if (FP_REGNUM_P (regno))
7290 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7292 if (PR_REGNUM_P (regno))
7293 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7295 return NO_REGS;
7298 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7299 If OFFSET is out of range, return an offset of an anchor point
7300 that is in range. Return 0 otherwise. */
7302 static HOST_WIDE_INT
7303 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7304 machine_mode mode)
7306 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7307 if (size > 16)
7308 return (offset + 0x400) & ~0x7f0;
7310 /* For offsets that aren't a multiple of the access size, the limit is
7311 -256...255. */
7312 if (offset & (size - 1))
7314 /* BLKmode typically uses LDP of X-registers. */
7315 if (mode == BLKmode)
7316 return (offset + 512) & ~0x3ff;
7317 return (offset + 0x100) & ~0x1ff;
7320 /* Small negative offsets are supported. */
7321 if (IN_RANGE (offset, -256, 0))
7322 return 0;
7324 if (mode == TImode || mode == TFmode)
7325 return (offset + 0x100) & ~0x1ff;
7327 /* Use a 12-bit offset scaled by the access size. */
7328 return offset & (~0xfff * size);
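/* Worked example: a 4-byte access at offset 0x12340 is aligned, is not a
   small negative offset and is not TImode/TFmode, so the anchor returned
   is 0x12340 & ~0x3fff = 0x10000.  The residual offset 0x2340 (0x8d0
   units of 4 bytes) still fits the 12-bit scaled range, while the
   0x10000 anchor can be shared with neighbouring accesses.  */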
7331 static rtx
7332 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7334 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7335 where mask is selected by alignment and size of the offset.
7336 We try to pick as large a range for the offset as possible to
7337 maximize the chance of a CSE. However, for aligned addresses
7338 we limit the range to 4k so that structures with different sized
7339 elements are likely to use the same base. We need to be careful
7340 not to split a CONST for some forms of address expression, otherwise
7341 it will generate sub-optimal code. */
7343 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7345 rtx base = XEXP (x, 0);
7346 rtx offset_rtx = XEXP (x, 1);
7347 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7349 if (GET_CODE (base) == PLUS)
7351 rtx op0 = XEXP (base, 0);
7352 rtx op1 = XEXP (base, 1);
7354 /* Force any scaling into a temp for CSE. */
7355 op0 = force_reg (Pmode, op0);
7356 op1 = force_reg (Pmode, op1);
7358 /* Let the pointer register be in op0. */
7359 if (REG_POINTER (op1))
7360 std::swap (op0, op1);
7362 /* If the pointer is virtual or frame related, then we know that
7363 virtual register instantiation or register elimination is going
7364 to apply a second constant. We want the two constants folded
7365 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7366 if (virt_or_elim_regno_p (REGNO (op0)))
7368 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7369 NULL_RTX, true, OPTAB_DIRECT);
7370 return gen_rtx_PLUS (Pmode, base, op1);
7373 /* Otherwise, in order to encourage CSE (and thence loop strength
7374 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7375 base = expand_binop (Pmode, add_optab, op0, op1,
7376 NULL_RTX, true, OPTAB_DIRECT);
7377 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7380 HOST_WIDE_INT size;
7381 if (GET_MODE_SIZE (mode).is_constant (&size))
7383 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7384 mode);
7385 if (base_offset != 0)
7387 base = plus_constant (Pmode, base, base_offset);
7388 base = force_operand (base, NULL_RTX);
7389 return plus_constant (Pmode, base, offset - base_offset);
7394 return x;
7397 /* Return the reload icode required for a constant pool in mode. */
7398 static enum insn_code
7399 aarch64_constant_pool_reload_icode (machine_mode mode)
7401 switch (mode)
7403 case E_SFmode:
7404 return CODE_FOR_aarch64_reload_movcpsfdi;
7406 case E_DFmode:
7407 return CODE_FOR_aarch64_reload_movcpdfdi;
7409 case E_TFmode:
7410 return CODE_FOR_aarch64_reload_movcptfdi;
7412 case E_V8QImode:
7413 return CODE_FOR_aarch64_reload_movcpv8qidi;
7415 case E_V16QImode:
7416 return CODE_FOR_aarch64_reload_movcpv16qidi;
7418 case E_V4HImode:
7419 return CODE_FOR_aarch64_reload_movcpv4hidi;
7421 case E_V8HImode:
7422 return CODE_FOR_aarch64_reload_movcpv8hidi;
7424 case E_V2SImode:
7425 return CODE_FOR_aarch64_reload_movcpv2sidi;
7427 case E_V4SImode:
7428 return CODE_FOR_aarch64_reload_movcpv4sidi;
7430 case E_V2DImode:
7431 return CODE_FOR_aarch64_reload_movcpv2didi;
7433 case E_V2DFmode:
7434 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7436 default:
7437 gcc_unreachable ();
7440 gcc_unreachable ();
7442 static reg_class_t
7443 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7444 reg_class_t rclass,
7445 machine_mode mode,
7446 secondary_reload_info *sri)
7448 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7449 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7450 comment at the head of aarch64-sve.md for more details about the
7451 big-endian handling. */
7452 if (BYTES_BIG_ENDIAN
7453 && reg_class_subset_p (rclass, FP_REGS)
7454 && !((REG_P (x) && HARD_REGISTER_P (x))
7455 || aarch64_simd_valid_immediate (x, NULL))
7456 && aarch64_sve_data_mode_p (mode))
7458 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7459 return NO_REGS;
7462 /* If we have to disable direct literal pool loads and stores because the
7463 function is too big, then we need a scratch register. */
7464 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7465 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7466 || targetm.vector_mode_supported_p (GET_MODE (x)))
7467 && !aarch64_pcrelative_literal_loads)
7469 sri->icode = aarch64_constant_pool_reload_icode (mode);
7470 return NO_REGS;
7473 /* Without the TARGET_SIMD instructions we cannot move a Q register
7474 to a Q register directly. We need a scratch. */
7475 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7476 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7477 && reg_class_subset_p (rclass, FP_REGS))
7479 if (mode == TFmode)
7480 sri->icode = CODE_FOR_aarch64_reload_movtf;
7481 else if (mode == TImode)
7482 sri->icode = CODE_FOR_aarch64_reload_movti;
7483 return NO_REGS;
7486 /* A TFmode or TImode memory access should be handled via an FP register
7487 because AArch64 has richer addressing modes for LDR/STR instructions
7488 than LDP/STP instructions. */
7489 if (TARGET_FLOAT && rclass == GENERAL_REGS
7490 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7491 return FP_REGS;
7493 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7494 return GENERAL_REGS;
7496 return NO_REGS;
7499 static bool
7500 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7502 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7504 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7505 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7506 if (frame_pointer_needed)
7507 return to == HARD_FRAME_POINTER_REGNUM;
7508 return true;
7511 poly_int64
7512 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7514 aarch64_layout_frame ();
7516 if (to == HARD_FRAME_POINTER_REGNUM)
7518 if (from == ARG_POINTER_REGNUM)
7519 return cfun->machine->frame.hard_fp_offset;
7521 if (from == FRAME_POINTER_REGNUM)
7522 return cfun->machine->frame.hard_fp_offset
7523 - cfun->machine->frame.locals_offset;
7526 if (to == STACK_POINTER_REGNUM)
7528 if (from == FRAME_POINTER_REGNUM)
7529 return cfun->machine->frame.frame_size
7530 - cfun->machine->frame.locals_offset;
7533 return cfun->machine->frame.frame_size;
7536 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7537 previous frame. */
7539 rtx
7540 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7542 if (count != 0)
7543 return const0_rtx;
7544 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7548 static void
7549 aarch64_asm_trampoline_template (FILE *f)
7551 if (TARGET_ILP32)
7553 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7554 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7556 else
7558 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7559 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7561 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7562 assemble_aligned_integer (4, const0_rtx);
7563 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7564 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
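/* On LP64 the template above expands to roughly the following, assuming
   IP1 is x17 and the static chain register is x18 (the exact register
   names depend on the target headers):

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding so the data is 8-byte aligned
	.xword	0		// overwritten with the function address
	.xword	0		// overwritten with the static chain

   aarch64_trampoline_init below fills in the two trailing double-words
   and calls __clear_cache over the block.  */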
7567 static void
7568 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7570 rtx fnaddr, mem, a_tramp;
7571 const int tramp_code_sz = 16;
7573 /* We don't need to copy the trailing D-words; we fill those in below. */
7574 emit_block_move (m_tramp, assemble_trampoline_template (),
7575 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7576 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7577 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7578 if (GET_MODE (fnaddr) != ptr_mode)
7579 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7580 emit_move_insn (mem, fnaddr);
7582 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7583 emit_move_insn (mem, chain_value);
7585 /* XXX We should really define a "clear_cache" pattern and use
7586 gen_clear_cache(). */
7587 a_tramp = XEXP (m_tramp, 0);
7588 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7589 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7590 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7591 ptr_mode);
7594 static unsigned char
7595 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7597 /* ??? Logically we should only need to provide a value when
7598 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7599 can hold MODE, but at the moment we need to handle all modes.
7600 Just ignore any runtime parts for registers that can't store them. */
7601 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7602 unsigned int nregs;
7603 switch (regclass)
7605 case TAILCALL_ADDR_REGS:
7606 case POINTER_REGS:
7607 case GENERAL_REGS:
7608 case ALL_REGS:
7609 case POINTER_AND_FP_REGS:
7610 case FP_REGS:
7611 case FP_LO_REGS:
7612 if (aarch64_sve_data_mode_p (mode)
7613 && constant_multiple_p (GET_MODE_SIZE (mode),
7614 BYTES_PER_SVE_VECTOR, &nregs))
7615 return nregs;
7616 return (aarch64_vector_data_mode_p (mode)
7617 ? CEIL (lowest_size, UNITS_PER_VREG)
7618 : CEIL (lowest_size, UNITS_PER_WORD));
7619 case STACK_REG:
7620 case PR_REGS:
7621 case PR_LO_REGS:
7622 case PR_HI_REGS:
7623 return 1;
7625 case NO_REGS:
7626 return 0;
7628 default:
7629 break;
7631 gcc_unreachable ();
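/* Worked example (added for illustration): TImode occupies 16 bytes, so
   for GENERAL_REGS the code above yields CEIL (16, UNITS_PER_WORD) == 2
   X-registers, whereas V4SImode in FP_REGS yields
   CEIL (16, UNITS_PER_VREG) == 1 Q-register.  An SVE data mode such as
   VNx4SImode instead takes the constant_multiple_p path and needs one
   Z-register per BYTES_PER_SVE_VECTOR of its size.  */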
7634 static reg_class_t
7635 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7637 if (regclass == POINTER_REGS)
7638 return GENERAL_REGS;
7640 if (regclass == STACK_REG)
7642 if (REG_P(x)
7643 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7644 return regclass;
7646 return NO_REGS;
7649 /* Register elimination can result in a request for
7650 SP+constant->FP_REGS. We cannot support such operations, which
7651 use SP as source and an FP_REG as destination, so reject them
7652 right away. */
7653 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7655 rtx lhs = XEXP (x, 0);
7657 /* Look through a possible SUBREG introduced by ILP32. */
7658 if (GET_CODE (lhs) == SUBREG)
7659 lhs = SUBREG_REG (lhs);
7661 gcc_assert (REG_P (lhs));
7662 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7663 POINTER_REGS));
7664 return NO_REGS;
7667 return regclass;
7670 void
7671 aarch64_asm_output_labelref (FILE* f, const char *name)
7673 asm_fprintf (f, "%U%s", name);
7676 static void
7677 aarch64_elf_asm_constructor (rtx symbol, int priority)
7679 if (priority == DEFAULT_INIT_PRIORITY)
7680 default_ctor_section_asm_out_constructor (symbol, priority);
7681 else
7683 section *s;
7684 /* Although priority is known to be in the range [0, 65535], so
7685 18 bytes would be enough, the compiler might not know that. To
7686 avoid a -Wformat-truncation false positive, use a larger size. */
7687 char buf[23];
7688 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7689 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7690 switch_to_section (s);
7691 assemble_align (POINTER_SIZE);
7692 assemble_aligned_integer (POINTER_BYTES, symbol);
7696 static void
7697 aarch64_elf_asm_destructor (rtx symbol, int priority)
7699 if (priority == DEFAULT_INIT_PRIORITY)
7700 default_dtor_section_asm_out_destructor (symbol, priority);
7701 else
7703 section *s;
7704 /* Although priority is known to be in the range [0, 65535], so
7705 18 bytes would be enough, the compiler might not know that. To
7706 avoid a -Wformat-truncation false positive, use a larger size. */
7707 char buf[23];
7708 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7709 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7710 switch_to_section (s);
7711 assemble_align (POINTER_SIZE);
7712 assemble_aligned_integer (POINTER_BYTES, symbol);
7716 const char*
7717 aarch64_output_casesi (rtx *operands)
7719 char buf[100];
7720 char label[100];
7721 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7722 int index;
7723 static const char *const patterns[4][2] =
7726 "ldrb\t%w3, [%0,%w1,uxtw]",
7727 "add\t%3, %4, %w3, sxtb #2"
7730 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7731 "add\t%3, %4, %w3, sxth #2"
7734 "ldr\t%w3, [%0,%w1,uxtw #2]",
7735 "add\t%3, %4, %w3, sxtw #2"
7737 /* We assume that DImode is only generated when not optimizing and
7738 that we don't really need 64-bit address offsets. That would
7739 imply an object file with 8GB of code in a single function! */
7741 "ldr\t%w3, [%0,%w1,uxtw #2]",
7742 "add\t%3, %4, %w3, sxtw #2"
7746 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7748 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7749 index = exact_log2 (GET_MODE_SIZE (mode));
7751 gcc_assert (index >= 0 && index <= 3);
7753 /* Need to implement table size reduction, by changing the code below. */
7754 output_asm_insn (patterns[index][0], operands);
7755 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7756 snprintf (buf, sizeof (buf),
7757 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7758 output_asm_insn (buf, operands);
7759 output_asm_insn (patterns[index][1], operands);
7760 output_asm_insn ("br\t%3", operands);
7761 assemble_label (asm_out_file, label);
7762 return "";
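/* Illustrative example (not part of the original code): for a jump table
   with 4-byte entries (index 2 above) the emitted dispatch sequence has
   the shape

	ldr	w3, [x0, w1, uxtw #2]	// load the table entry
	adr	x4, .LrtxN		// table base label
	add	x3, x4, w3, sxtw #2	// convert the entry to an address
	br	x3
   .LrtxN:

   where the register numbers stand for the %0..%4 operand placeholders
   used in the pattern strings above and .LrtxN is the internal label
   generated via ASM_GENERATE_INTERNAL_LABEL.  */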
7766 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7767 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7768 operator. */
7770 int
7771 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7773 if (shift >= 0 && shift <= 3)
7775 int size;
7776 for (size = 8; size <= 32; size *= 2)
7778 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7779 if (mask == bits << shift)
7780 return size;
7783 return 0;
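/* Worked example (added for illustration): aarch64_uxt_size (1, 0x1fe)
   returns 8, because 0x1fe == 0xff << 1, i.e. a byte-sized mask shifted
   left by one, which suits a UXTB-style operand.  aarch64_uxt_size
   (0, 0xffff) returns 16 (UXTH), while a mask like 0x7f matches none of
   the 8/16/32-bit patterns and gives 0.  */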
7786 /* Constant pools are per-function only when PC-relative
7787 literal loads are enabled or we are using the large memory
7788 model. */
7790 static inline bool
7791 aarch64_can_use_per_function_literal_pools_p (void)
7793 return (aarch64_pcrelative_literal_loads
7794 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7797 static bool
7798 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7800 /* We can't use blocks for constants when we're using a per-function
7801 constant pool. */
7802 return !aarch64_can_use_per_function_literal_pools_p ();
7805 /* Select appropriate section for constants depending
7806 on where we place literal pools. */
7808 static section *
7809 aarch64_select_rtx_section (machine_mode mode,
7810 rtx x,
7811 unsigned HOST_WIDE_INT align)
7813 if (aarch64_can_use_per_function_literal_pools_p ())
7814 return function_section (current_function_decl);
7816 return default_elf_select_rtx_section (mode, x, align);
7819 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7820 void
7821 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7822 HOST_WIDE_INT offset)
7824 /* When using per-function literal pools, we must ensure that any code
7825 section is aligned to the minimal instruction length, lest we get
7826 errors from the assembler re "unaligned instructions". */
7827 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7828 ASM_OUTPUT_ALIGN (f, 2);
7831 /* Costs. */
7833 /* Helper function for rtx cost calculation. Strip a shift expression
7834 from X. Returns the inner operand if successful, or the original
7835 expression on failure. */
7836 static rtx
7837 aarch64_strip_shift (rtx x)
7839 rtx op = x;
7841 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7842 we can convert both to ROR during final output. */
7843 if ((GET_CODE (op) == ASHIFT
7844 || GET_CODE (op) == ASHIFTRT
7845 || GET_CODE (op) == LSHIFTRT
7846 || GET_CODE (op) == ROTATERT
7847 || GET_CODE (op) == ROTATE)
7848 && CONST_INT_P (XEXP (op, 1)))
7849 return XEXP (op, 0);
7851 if (GET_CODE (op) == MULT
7852 && CONST_INT_P (XEXP (op, 1))
7853 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7854 return XEXP (op, 0);
7856 return x;
7859 /* Helper function for rtx cost calculation. Strip an extend
7860 expression from X. Returns the inner operand if successful, or the
7861 original expression on failure. We deal with a number of possible
7862 canonicalization variations here. If STRIP_SHIFT is true, then
7863 we can strip off a shift also. */
7864 static rtx
7865 aarch64_strip_extend (rtx x, bool strip_shift)
7867 scalar_int_mode mode;
7868 rtx op = x;
7870 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7871 return op;
7873 /* Zero and sign extraction of a widened value. */
7874 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7875 && XEXP (op, 2) == const0_rtx
7876 && GET_CODE (XEXP (op, 0)) == MULT
7877 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7878 XEXP (op, 1)))
7879 return XEXP (XEXP (op, 0), 0);
7881 /* It can also be represented (for zero-extend) as an AND with an
7882 immediate. */
7883 if (GET_CODE (op) == AND
7884 && GET_CODE (XEXP (op, 0)) == MULT
7885 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7886 && CONST_INT_P (XEXP (op, 1))
7887 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7888 INTVAL (XEXP (op, 1))) != 0)
7889 return XEXP (XEXP (op, 0), 0);
7891 /* Now handle extended register, as this may also have an optional
7892 left shift by 1..4. */
7893 if (strip_shift
7894 && GET_CODE (op) == ASHIFT
7895 && CONST_INT_P (XEXP (op, 1))
7896 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7897 op = XEXP (op, 0);
7899 if (GET_CODE (op) == ZERO_EXTEND
7900 || GET_CODE (op) == SIGN_EXTEND)
7901 op = XEXP (op, 0);
7903 if (op != x)
7904 return op;
7906 return x;
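/* For illustration (not from the original sources): with STRIP_SHIFT
   true, an operand such as
	(ashift:DI (zero_extend:DI (reg:SI x)) (const_int 2))
   is reduced to (reg:SI x) by the code above, modelling extended
   register forms like ADD Xd, Xn, Wm, UXTW #2 in which the extend and
   the small left shift come for free with the arithmetic instruction.  */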
7909 /* Return true iff CODE is a shift supported in combination
7910 with arithmetic instructions. */
7912 static bool
7913 aarch64_shift_p (enum rtx_code code)
7915 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7919 /* Return true iff X is a cheap shift without a sign extend. */
7921 static bool
7922 aarch64_cheap_mult_shift_p (rtx x)
7924 rtx op0, op1;
7926 op0 = XEXP (x, 0);
7927 op1 = XEXP (x, 1);
7929 if (!(aarch64_tune_params.extra_tuning_flags
7930 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7931 return false;
7933 if (GET_CODE (op0) == SIGN_EXTEND)
7934 return false;
7936 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7937 && UINTVAL (op1) <= 4)
7938 return true;
7940 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7941 return false;
7943 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7945 if (l2 > 0 && l2 <= 4)
7946 return true;
7948 return false;
7951 /* Helper function for rtx cost calculation. Calculate the cost of
7952 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7953 Return the calculated cost of the expression, recursing manually in to
7954 operands where needed. */
7956 static int
7957 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7959 rtx op0, op1;
7960 const struct cpu_cost_table *extra_cost
7961 = aarch64_tune_params.insn_extra_cost;
7962 int cost = 0;
7963 bool compound_p = (outer == PLUS || outer == MINUS);
7964 machine_mode mode = GET_MODE (x);
7966 gcc_checking_assert (code == MULT);
7968 op0 = XEXP (x, 0);
7969 op1 = XEXP (x, 1);
7971 if (VECTOR_MODE_P (mode))
7972 mode = GET_MODE_INNER (mode);
7974 /* Integer multiply/fma. */
7975 if (GET_MODE_CLASS (mode) == MODE_INT)
7977 /* The multiply will be canonicalized as a shift, cost it as such. */
7978 if (aarch64_shift_p (GET_CODE (x))
7979 || (CONST_INT_P (op1)
7980 && exact_log2 (INTVAL (op1)) > 0))
7982 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7983 || GET_CODE (op0) == SIGN_EXTEND;
7984 if (speed)
7986 if (compound_p)
7988 /* If the shift is considered cheap,
7989 then don't add any cost. */
7990 if (aarch64_cheap_mult_shift_p (x))
7992 else if (REG_P (op1))
7993 /* ARITH + shift-by-register. */
7994 cost += extra_cost->alu.arith_shift_reg;
7995 else if (is_extend)
7996 /* ARITH + extended register. We don't have a cost field
7997 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7998 cost += extra_cost->alu.extend_arith;
7999 else
8000 /* ARITH + shift-by-immediate. */
8001 cost += extra_cost->alu.arith_shift;
8003 else
8004 /* LSL (immediate). */
8005 cost += extra_cost->alu.shift;
8008 /* Strip extends as we will have costed them in the case above. */
8009 if (is_extend)
8010 op0 = aarch64_strip_extend (op0, true);
8012 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8014 return cost;
8017 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8018 compound and let the below cases handle it. After all, MNEG is a
8019 special-case alias of MSUB. */
8020 if (GET_CODE (op0) == NEG)
8022 op0 = XEXP (op0, 0);
8023 compound_p = true;
8026 /* Integer multiplies or FMAs have zero/sign extending variants. */
8027 if ((GET_CODE (op0) == ZERO_EXTEND
8028 && GET_CODE (op1) == ZERO_EXTEND)
8029 || (GET_CODE (op0) == SIGN_EXTEND
8030 && GET_CODE (op1) == SIGN_EXTEND))
8032 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8033 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8035 if (speed)
8037 if (compound_p)
8038 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8039 cost += extra_cost->mult[0].extend_add;
8040 else
8041 /* MUL/SMULL/UMULL. */
8042 cost += extra_cost->mult[0].extend;
8045 return cost;
8048 /* This is either an integer multiply or a MADD. In both cases
8049 we want to recurse and cost the operands. */
8050 cost += rtx_cost (op0, mode, MULT, 0, speed);
8051 cost += rtx_cost (op1, mode, MULT, 1, speed);
8053 if (speed)
8055 if (compound_p)
8056 /* MADD/MSUB. */
8057 cost += extra_cost->mult[mode == DImode].add;
8058 else
8059 /* MUL. */
8060 cost += extra_cost->mult[mode == DImode].simple;
8063 return cost;
8065 else
8067 if (speed)
8069 /* Floating-point FMA/FMUL can also support negations of the
8070 operands, unless the rounding mode is upward or downward, in
8071 which case FNMUL is different from FMUL with operand negation. */
8072 bool neg0 = GET_CODE (op0) == NEG;
8073 bool neg1 = GET_CODE (op1) == NEG;
8074 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8076 if (neg0)
8077 op0 = XEXP (op0, 0);
8078 if (neg1)
8079 op1 = XEXP (op1, 0);
8082 if (compound_p)
8083 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8084 cost += extra_cost->fp[mode == DFmode].fma;
8085 else
8086 /* FMUL/FNMUL. */
8087 cost += extra_cost->fp[mode == DFmode].mult;
8090 cost += rtx_cost (op0, mode, MULT, 0, speed);
8091 cost += rtx_cost (op1, mode, MULT, 1, speed);
8092 return cost;
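/* Costing example (added for illustration): for
	(plus:DI (mult:DI (reg x) (const_int 4)) (reg y))
   the MULT is treated above as a shift by 2 inside a compound PLUS, so
   when optimizing for speed it is charged extra_cost->alu.arith_shift
   (ADD with shifted register) plus the recursive cost of the shifted
   operand, rather than a real multiply cost (unless the tuning sets
   AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, in which case the shift is
   costed as free).  */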
8096 static int
8097 aarch64_address_cost (rtx x,
8098 machine_mode mode,
8099 addr_space_t as ATTRIBUTE_UNUSED,
8100 bool speed)
8102 enum rtx_code c = GET_CODE (x);
8103 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8104 struct aarch64_address_info info;
8105 int cost = 0;
8106 info.shift = 0;
8108 if (!aarch64_classify_address (&info, x, mode, false))
8110 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8112 /* This is a CONST or SYMBOL ref which will be split
8113 in a different way depending on the code model in use.
8114 Cost it through the generic infrastructure. */
8115 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8116 /* Divide through by the cost of one instruction to
8117 bring it to the same units as the address costs. */
8118 cost_symbol_ref /= COSTS_N_INSNS (1);
8119 /* The cost is then the cost of preparing the address,
8120 followed by an immediate (possibly 0) offset. */
8121 return cost_symbol_ref + addr_cost->imm_offset;
8123 else
8125 /* This is most likely a jump table from a case
8126 statement. */
8127 return addr_cost->register_offset;
8131 switch (info.type)
8133 case ADDRESS_LO_SUM:
8134 case ADDRESS_SYMBOLIC:
8135 case ADDRESS_REG_IMM:
8136 cost += addr_cost->imm_offset;
8137 break;
8139 case ADDRESS_REG_WB:
8140 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8141 cost += addr_cost->pre_modify;
8142 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8143 cost += addr_cost->post_modify;
8144 else
8145 gcc_unreachable ();
8147 break;
8149 case ADDRESS_REG_REG:
8150 cost += addr_cost->register_offset;
8151 break;
8153 case ADDRESS_REG_SXTW:
8154 cost += addr_cost->register_sextend;
8155 break;
8157 case ADDRESS_REG_UXTW:
8158 cost += addr_cost->register_zextend;
8159 break;
8161 default:
8162 gcc_unreachable ();
8166 if (info.shift > 0)
8168 /* For the sake of calculating the cost of the shifted register
8169 component, we can treat same sized modes in the same way. */
8170 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8171 cost += addr_cost->addr_scale_costs.hi;
8172 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8173 cost += addr_cost->addr_scale_costs.si;
8174 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8175 cost += addr_cost->addr_scale_costs.di;
8176 else
8177 /* We can't tell, or this is a 128-bit vector. */
8178 cost += addr_cost->addr_scale_costs.ti;
8181 return cost;
8184 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8185 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8186 to be taken. */
8188 int
8189 aarch64_branch_cost (bool speed_p, bool predictable_p)
8191 /* When optimizing for speed, use the cost of unpredictable branches. */
8192 const struct cpu_branch_cost *branch_costs =
8193 aarch64_tune_params.branch_costs;
8195 if (!speed_p || predictable_p)
8196 return branch_costs->predictable;
8197 else
8198 return branch_costs->unpredictable;
8201 /* Return true if the RTX X in mode MODE is a zero or sign extract
8202 usable in an ADD or SUB (extended register) instruction. */
8203 static bool
8204 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8206 /* Catch add with a sign extract.
8207 This is add_<optab><mode>_multp2. */
8208 if (GET_CODE (x) == SIGN_EXTRACT
8209 || GET_CODE (x) == ZERO_EXTRACT)
8211 rtx op0 = XEXP (x, 0);
8212 rtx op1 = XEXP (x, 1);
8213 rtx op2 = XEXP (x, 2);
8215 if (GET_CODE (op0) == MULT
8216 && CONST_INT_P (op1)
8217 && op2 == const0_rtx
8218 && CONST_INT_P (XEXP (op0, 1))
8219 && aarch64_is_extend_from_extract (mode,
8220 XEXP (op0, 1),
8221 op1))
8223 return true;
8226 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8227 No shift. */
8228 else if (GET_CODE (x) == SIGN_EXTEND
8229 || GET_CODE (x) == ZERO_EXTEND)
8230 return REG_P (XEXP (x, 0));
8232 return false;
8235 static bool
8236 aarch64_frint_unspec_p (unsigned int u)
8238 switch (u)
8240 case UNSPEC_FRINTZ:
8241 case UNSPEC_FRINTP:
8242 case UNSPEC_FRINTM:
8243 case UNSPEC_FRINTA:
8244 case UNSPEC_FRINTN:
8245 case UNSPEC_FRINTX:
8246 case UNSPEC_FRINTI:
8247 return true;
8249 default:
8250 return false;
8254 /* Return true iff X is an rtx that will match an extr instruction
8255 i.e. as described in the *extr<mode>5_insn family of patterns.
8256 RES_OP0 and RES_OP1 will be set to the operands of the shifts involved
8257 on success and will be NULL_RTX otherwise. */
8259 static bool
8260 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8262 rtx op0, op1;
8263 scalar_int_mode mode;
8264 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8265 return false;
8267 *res_op0 = NULL_RTX;
8268 *res_op1 = NULL_RTX;
8270 if (GET_CODE (x) != IOR)
8271 return false;
8273 op0 = XEXP (x, 0);
8274 op1 = XEXP (x, 1);
8276 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8277 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8279 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8280 if (GET_CODE (op1) == ASHIFT)
8281 std::swap (op0, op1);
8283 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8284 return false;
8286 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8287 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8289 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8290 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8292 *res_op0 = XEXP (op0, 0);
8293 *res_op1 = XEXP (op1, 0);
8294 return true;
8298 return false;
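/* Example of a matching rtx (added for illustration): in DImode,
	(ior:DI (ashift:DI (reg a) (const_int 48))
		(lshiftrt:DI (reg b) (const_int 16)))
   passes the checks above because 48 + 16 == 64; it corresponds to an
   EXTR Xd, Xa, Xb, #16, with *RES_OP0 set to a and *RES_OP1 to b.  */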
8301 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8302 storing it in *COST. Result is true if the total cost of the operation
8303 has now been calculated. */
8304 static bool
8305 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8307 rtx inner;
8308 rtx comparator;
8309 enum rtx_code cmpcode;
8311 if (COMPARISON_P (op0))
8313 inner = XEXP (op0, 0);
8314 comparator = XEXP (op0, 1);
8315 cmpcode = GET_CODE (op0);
8317 else
8319 inner = op0;
8320 comparator = const0_rtx;
8321 cmpcode = NE;
8324 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8326 /* Conditional branch. */
8327 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8328 return true;
8329 else
8331 if (cmpcode == NE || cmpcode == EQ)
8333 if (comparator == const0_rtx)
8335 /* TBZ/TBNZ/CBZ/CBNZ. */
8336 if (GET_CODE (inner) == ZERO_EXTRACT)
8337 /* TBZ/TBNZ. */
8338 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8339 ZERO_EXTRACT, 0, speed);
8340 else
8341 /* CBZ/CBNZ. */
8342 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8344 return true;
8347 else if (cmpcode == LT || cmpcode == GE)
8349 /* TBZ/TBNZ. */
8350 if (comparator == const0_rtx)
8351 return true;
8355 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8357 /* CCMP. */
8358 if (GET_CODE (op1) == COMPARE)
8360 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8361 if (XEXP (op1, 1) == const0_rtx)
8362 *cost += 1;
8363 if (speed)
8365 machine_mode mode = GET_MODE (XEXP (op1, 0));
8366 const struct cpu_cost_table *extra_cost
8367 = aarch64_tune_params.insn_extra_cost;
8369 if (GET_MODE_CLASS (mode) == MODE_INT)
8370 *cost += extra_cost->alu.arith;
8371 else
8372 *cost += extra_cost->fp[mode == DFmode].compare;
8374 return true;
8377 /* It's a conditional operation based on the status flags,
8378 so it must be some flavor of CSEL. */
8380 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8381 if (GET_CODE (op1) == NEG
8382 || GET_CODE (op1) == NOT
8383 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8384 op1 = XEXP (op1, 0);
8385 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8387 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8388 op1 = XEXP (op1, 0);
8389 op2 = XEXP (op2, 0);
8392 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8393 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8394 return true;
8397 /* We don't know what this is, cost all operands. */
8398 return false;
8401 /* Check whether X is a bitfield operation of the form shift + extend that
8402 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8403 operand to which the bitfield operation is applied. Otherwise return
8404 NULL_RTX. */
8406 static rtx
8407 aarch64_extend_bitfield_pattern_p (rtx x)
8409 rtx_code outer_code = GET_CODE (x);
8410 machine_mode outer_mode = GET_MODE (x);
8412 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8413 && outer_mode != SImode && outer_mode != DImode)
8414 return NULL_RTX;
8416 rtx inner = XEXP (x, 0);
8417 rtx_code inner_code = GET_CODE (inner);
8418 machine_mode inner_mode = GET_MODE (inner);
8419 rtx op = NULL_RTX;
8421 switch (inner_code)
8423 case ASHIFT:
8424 if (CONST_INT_P (XEXP (inner, 1))
8425 && (inner_mode == QImode || inner_mode == HImode))
8426 op = XEXP (inner, 0);
8427 break;
8428 case LSHIFTRT:
8429 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8430 && (inner_mode == QImode || inner_mode == HImode))
8431 op = XEXP (inner, 0);
8432 break;
8433 case ASHIFTRT:
8434 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8435 && (inner_mode == QImode || inner_mode == HImode))
8436 op = XEXP (inner, 0);
8437 break;
8438 default:
8439 break;
8442 return op;
8445 /* Return true if the mask and a shift amount from an RTX of the form
8446 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8447 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8449 bool
8450 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8451 rtx shft_amnt)
8453 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8454 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8455 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8456 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
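/* Worked example (added for illustration): in SImode, mask == 0xff00
   with shft_amnt == 8 satisfies the conditions above, since
   (0xff00 >> 8) + 1 is a power of two and the low 8 bits of the mask
   are clear; the combination (x << 8) & 0xff00 can therefore be
   emitted as a single UBFIZ w0, w1, #8, #8.  */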
8459 /* Calculate the cost of calculating X, storing it in *COST. Result
8460 is true if the total cost of the operation has now been calculated. */
8461 static bool
8462 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8463 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8465 rtx op0, op1, op2;
8466 const struct cpu_cost_table *extra_cost
8467 = aarch64_tune_params.insn_extra_cost;
8468 int code = GET_CODE (x);
8469 scalar_int_mode int_mode;
8471 /* By default, assume that everything has equivalent cost to the
8472 cheapest instruction. Any additional costs are applied as a delta
8473 above this default. */
8474 *cost = COSTS_N_INSNS (1);
8476 switch (code)
8478 case SET:
8479 /* The cost depends entirely on the operands to SET. */
8480 *cost = 0;
8481 op0 = SET_DEST (x);
8482 op1 = SET_SRC (x);
8484 switch (GET_CODE (op0))
8486 case MEM:
8487 if (speed)
8489 rtx address = XEXP (op0, 0);
8490 if (VECTOR_MODE_P (mode))
8491 *cost += extra_cost->ldst.storev;
8492 else if (GET_MODE_CLASS (mode) == MODE_INT)
8493 *cost += extra_cost->ldst.store;
8494 else if (mode == SFmode)
8495 *cost += extra_cost->ldst.storef;
8496 else if (mode == DFmode)
8497 *cost += extra_cost->ldst.stored;
8499 *cost +=
8500 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8501 0, speed));
8504 *cost += rtx_cost (op1, mode, SET, 1, speed);
8505 return true;
8507 case SUBREG:
8508 if (! REG_P (SUBREG_REG (op0)))
8509 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8511 /* Fall through. */
8512 case REG:
8513 /* The cost is one per vector-register copied. */
8514 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8516 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8517 *cost = COSTS_N_INSNS (nregs);
8519 /* const0_rtx is in general free, but we will use an
8520 instruction to set a register to 0. */
8521 else if (REG_P (op1) || op1 == const0_rtx)
8523 /* The cost is 1 per register copied. */
8524 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8525 *cost = COSTS_N_INSNS (nregs);
8527 else
8528 /* Cost is just the cost of the RHS of the set. */
8529 *cost += rtx_cost (op1, mode, SET, 1, speed);
8530 return true;
8532 case ZERO_EXTRACT:
8533 case SIGN_EXTRACT:
8534 /* Bit-field insertion. Strip any redundant widening of
8535 the RHS to meet the width of the target. */
8536 if (GET_CODE (op1) == SUBREG)
8537 op1 = SUBREG_REG (op1);
8538 if ((GET_CODE (op1) == ZERO_EXTEND
8539 || GET_CODE (op1) == SIGN_EXTEND)
8540 && CONST_INT_P (XEXP (op0, 1))
8541 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8542 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8543 op1 = XEXP (op1, 0);
8545 if (CONST_INT_P (op1))
8547 /* MOV immediate is assumed to always be cheap. */
8548 *cost = COSTS_N_INSNS (1);
8550 else
8552 /* BFM. */
8553 if (speed)
8554 *cost += extra_cost->alu.bfi;
8555 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8558 return true;
8560 default:
8561 /* We can't make sense of this, assume default cost. */
8562 *cost = COSTS_N_INSNS (1);
8563 return false;
8565 return false;
8567 case CONST_INT:
8568 /* If an instruction can incorporate a constant within the
8569 instruction, the instruction's expression avoids calling
8570 rtx_cost() on the constant. If rtx_cost() is called on a
8571 constant, then it is usually because the constant must be
8572 moved into a register by one or more instructions.
8574 The exception is constant 0, which can be expressed
8575 as XZR/WZR and is therefore free. The exception to this is
8576 if we have (set (reg) (const0_rtx)) in which case we must cost
8577 the move. However, we can catch that when we cost the SET, so
8578 we don't need to consider that here. */
8579 if (x == const0_rtx)
8580 *cost = 0;
8581 else
8583 /* To a first approximation, the cost of building any other
8584 constant is proportional to the number of instructions
8585 required to build that constant. This is true whether we
8586 are compiling for SPEED or otherwise. */
8587 if (!is_a <scalar_int_mode> (mode, &int_mode))
8588 int_mode = word_mode;
8589 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8590 (NULL_RTX, x, false, int_mode));
8592 return true;
8594 case CONST_DOUBLE:
8596 /* First determine number of instructions to do the move
8597 as an integer constant. */
8598 if (!aarch64_float_const_representable_p (x)
8599 && !aarch64_can_const_movi_rtx_p (x, mode)
8600 && aarch64_float_const_rtx_p (x))
8602 unsigned HOST_WIDE_INT ival;
8603 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8604 gcc_assert (succeed);
8606 scalar_int_mode imode = (mode == HFmode
8607 ? SImode
8608 : int_mode_for_mode (mode).require ());
8609 int ncost = aarch64_internal_mov_immediate
8610 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8611 *cost += COSTS_N_INSNS (ncost);
8612 return true;
8615 if (speed)
8617 /* mov[df,sf]_aarch64. */
8618 if (aarch64_float_const_representable_p (x))
8619 /* FMOV (scalar immediate). */
8620 *cost += extra_cost->fp[mode == DFmode].fpconst;
8621 else if (!aarch64_float_const_zero_rtx_p (x))
8623 /* This will be a load from memory. */
8624 if (mode == DFmode)
8625 *cost += extra_cost->ldst.loadd;
8626 else
8627 *cost += extra_cost->ldst.loadf;
8629 else
8630 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8631 or MOV v0.s[0], wzr - neither of which is modeled by the
8632 cost tables. Just use the default cost. */
8637 return true;
8639 case MEM:
8640 if (speed)
8642 /* For loads we want the base cost of a load, plus an
8643 approximation for the additional cost of the addressing
8644 mode. */
8645 rtx address = XEXP (x, 0);
8646 if (VECTOR_MODE_P (mode))
8647 *cost += extra_cost->ldst.loadv;
8648 else if (GET_MODE_CLASS (mode) == MODE_INT)
8649 *cost += extra_cost->ldst.load;
8650 else if (mode == SFmode)
8651 *cost += extra_cost->ldst.loadf;
8652 else if (mode == DFmode)
8653 *cost += extra_cost->ldst.loadd;
8655 *cost +=
8656 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8657 0, speed));
8660 return true;
8662 case NEG:
8663 op0 = XEXP (x, 0);
8665 if (VECTOR_MODE_P (mode))
8667 if (speed)
8669 /* FNEG. */
8670 *cost += extra_cost->vect.alu;
8672 return false;
8675 if (GET_MODE_CLASS (mode) == MODE_INT)
8677 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8678 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8680 /* CSETM. */
8681 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8682 return true;
8685 /* Cost this as SUB wzr, X. */
8686 op0 = CONST0_RTX (mode);
8687 op1 = XEXP (x, 0);
8688 goto cost_minus;
8691 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8693 /* Support (neg(fma...)) as a single instruction only if
8694 sign of zeros is unimportant. This matches the decision
8695 making in aarch64.md. */
8696 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8698 /* FNMADD. */
8699 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8700 return true;
8702 if (GET_CODE (op0) == MULT)
8704 /* FNMUL. */
8705 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8706 return true;
8708 if (speed)
8709 /* FNEG. */
8710 *cost += extra_cost->fp[mode == DFmode].neg;
8711 return false;
8714 return false;
8716 case CLRSB:
8717 case CLZ:
8718 if (speed)
8720 if (VECTOR_MODE_P (mode))
8721 *cost += extra_cost->vect.alu;
8722 else
8723 *cost += extra_cost->alu.clz;
8726 return false;
8728 case COMPARE:
8729 op0 = XEXP (x, 0);
8730 op1 = XEXP (x, 1);
8732 if (op1 == const0_rtx
8733 && GET_CODE (op0) == AND)
8735 x = op0;
8736 mode = GET_MODE (op0);
8737 goto cost_logic;
8740 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8742 /* TODO: A write to the CC flags possibly costs extra, this
8743 needs encoding in the cost tables. */
8745 mode = GET_MODE (op0);
8746 /* ANDS. */
8747 if (GET_CODE (op0) == AND)
8749 x = op0;
8750 goto cost_logic;
8753 if (GET_CODE (op0) == PLUS)
8755 /* ADDS (and CMN alias). */
8756 x = op0;
8757 goto cost_plus;
8760 if (GET_CODE (op0) == MINUS)
8762 /* SUBS. */
8763 x = op0;
8764 goto cost_minus;
8767 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8768 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8769 && CONST_INT_P (XEXP (op0, 2)))
8771 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8772 Handle it here directly rather than going to cost_logic
8773 since we know the immediate generated for the TST is valid
8774 so we can avoid creating an intermediate rtx for it only
8775 for costing purposes. */
8776 if (speed)
8777 *cost += extra_cost->alu.logical;
8779 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8780 ZERO_EXTRACT, 0, speed);
8781 return true;
8784 if (GET_CODE (op1) == NEG)
8786 /* CMN. */
8787 if (speed)
8788 *cost += extra_cost->alu.arith;
8790 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8791 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8792 return true;
8795 /* CMP.
8797 Compare can freely swap the order of operands, and
8798 canonicalization puts the more complex operation first.
8799 But the integer MINUS logic expects the shift/extend
8800 operation in op1. */
8801 if (! (REG_P (op0)
8802 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8804 op0 = XEXP (x, 1);
8805 op1 = XEXP (x, 0);
8807 goto cost_minus;
8810 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8812 /* FCMP. */
8813 if (speed)
8814 *cost += extra_cost->fp[mode == DFmode].compare;
8816 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8818 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8819 /* FCMP supports constant 0.0 for no extra cost. */
8820 return true;
8822 return false;
8825 if (VECTOR_MODE_P (mode))
8827 /* Vector compare. */
8828 if (speed)
8829 *cost += extra_cost->vect.alu;
8831 if (aarch64_float_const_zero_rtx_p (op1))
8833 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8834 cost. */
8835 return true;
8837 return false;
8839 return false;
8841 case MINUS:
8843 op0 = XEXP (x, 0);
8844 op1 = XEXP (x, 1);
8846 cost_minus:
8847 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8849 /* Detect valid immediates. */
8850 if ((GET_MODE_CLASS (mode) == MODE_INT
8851 || (GET_MODE_CLASS (mode) == MODE_CC
8852 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8853 && CONST_INT_P (op1)
8854 && aarch64_uimm12_shift (INTVAL (op1)))
8856 if (speed)
8857 /* SUB(S) (immediate). */
8858 *cost += extra_cost->alu.arith;
8859 return true;
8862 /* Look for SUB (extended register). */
8863 if (is_a <scalar_int_mode> (mode, &int_mode)
8864 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8866 if (speed)
8867 *cost += extra_cost->alu.extend_arith;
8869 op1 = aarch64_strip_extend (op1, true);
8870 *cost += rtx_cost (op1, VOIDmode,
8871 (enum rtx_code) GET_CODE (op1), 0, speed);
8872 return true;
8875 rtx new_op1 = aarch64_strip_extend (op1, false);
8877 /* Cost this as an FMA-alike operation. */
8878 if ((GET_CODE (new_op1) == MULT
8879 || aarch64_shift_p (GET_CODE (new_op1)))
8880 && code != COMPARE)
8882 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8883 (enum rtx_code) code,
8884 speed);
8885 return true;
8888 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8890 if (speed)
8892 if (VECTOR_MODE_P (mode))
8894 /* Vector SUB. */
8895 *cost += extra_cost->vect.alu;
8897 else if (GET_MODE_CLASS (mode) == MODE_INT)
8899 /* SUB(S). */
8900 *cost += extra_cost->alu.arith;
8902 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8904 /* FSUB. */
8905 *cost += extra_cost->fp[mode == DFmode].addsub;
8908 return true;
8911 case PLUS:
8913 rtx new_op0;
8915 op0 = XEXP (x, 0);
8916 op1 = XEXP (x, 1);
8918 cost_plus:
8919 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8920 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8922 /* CSINC. */
8923 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8924 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8925 return true;
8928 if (GET_MODE_CLASS (mode) == MODE_INT
8929 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8930 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8932 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8934 if (speed)
8935 /* ADD (immediate). */
8936 *cost += extra_cost->alu.arith;
8937 return true;
8940 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8942 /* Look for ADD (extended register). */
8943 if (is_a <scalar_int_mode> (mode, &int_mode)
8944 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8946 if (speed)
8947 *cost += extra_cost->alu.extend_arith;
8949 op0 = aarch64_strip_extend (op0, true);
8950 *cost += rtx_cost (op0, VOIDmode,
8951 (enum rtx_code) GET_CODE (op0), 0, speed);
8952 return true;
8955 /* Strip any extend, leave shifts behind as we will
8956 cost them through mult_cost. */
8957 new_op0 = aarch64_strip_extend (op0, false);
8959 if (GET_CODE (new_op0) == MULT
8960 || aarch64_shift_p (GET_CODE (new_op0)))
8962 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8963 speed);
8964 return true;
8967 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8969 if (speed)
8971 if (VECTOR_MODE_P (mode))
8973 /* Vector ADD. */
8974 *cost += extra_cost->vect.alu;
8976 else if (GET_MODE_CLASS (mode) == MODE_INT)
8978 /* ADD. */
8979 *cost += extra_cost->alu.arith;
8981 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8983 /* FADD. */
8984 *cost += extra_cost->fp[mode == DFmode].addsub;
8987 return true;
8990 case BSWAP:
8991 *cost = COSTS_N_INSNS (1);
8993 if (speed)
8995 if (VECTOR_MODE_P (mode))
8996 *cost += extra_cost->vect.alu;
8997 else
8998 *cost += extra_cost->alu.rev;
9000 return false;
9002 case IOR:
9003 if (aarch_rev16_p (x))
9005 *cost = COSTS_N_INSNS (1);
9007 if (speed)
9009 if (VECTOR_MODE_P (mode))
9010 *cost += extra_cost->vect.alu;
9011 else
9012 *cost += extra_cost->alu.rev;
9014 return true;
9017 if (aarch64_extr_rtx_p (x, &op0, &op1))
9019 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9020 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9021 if (speed)
9022 *cost += extra_cost->alu.shift;
9024 return true;
9026 /* Fall through. */
9027 case XOR:
9028 case AND:
9029 cost_logic:
9030 op0 = XEXP (x, 0);
9031 op1 = XEXP (x, 1);
9033 if (VECTOR_MODE_P (mode))
9035 if (speed)
9036 *cost += extra_cost->vect.alu;
9037 return true;
9040 if (code == AND
9041 && GET_CODE (op0) == MULT
9042 && CONST_INT_P (XEXP (op0, 1))
9043 && CONST_INT_P (op1)
9044 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9045 INTVAL (op1)) != 0)
9047 /* This is a UBFM/SBFM. */
9048 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9049 if (speed)
9050 *cost += extra_cost->alu.bfx;
9051 return true;
9054 if (is_int_mode (mode, &int_mode))
9056 if (CONST_INT_P (op1))
9058 /* We have a mask + shift version of a UBFIZ
9059 i.e. the *andim_ashift<mode>_bfiz pattern. */
9060 if (GET_CODE (op0) == ASHIFT
9061 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9062 XEXP (op0, 1)))
9064 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9065 (enum rtx_code) code, 0, speed);
9066 if (speed)
9067 *cost += extra_cost->alu.bfx;
9069 return true;
9071 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9073 /* We possibly get the immediate for free, this is not
9074 modelled. */
9075 *cost += rtx_cost (op0, int_mode,
9076 (enum rtx_code) code, 0, speed);
9077 if (speed)
9078 *cost += extra_cost->alu.logical;
9080 return true;
9083 else
9085 rtx new_op0 = op0;
9087 /* Handle ORN, EON, or BIC. */
9088 if (GET_CODE (op0) == NOT)
9089 op0 = XEXP (op0, 0);
9091 new_op0 = aarch64_strip_shift (op0);
9093 /* If we had a shift on op0 then this is a logical-shift-
9094 by-register/immediate operation. Otherwise, this is just
9095 a logical operation. */
9096 if (speed)
9098 if (new_op0 != op0)
9100 /* Shift by immediate. */
9101 if (CONST_INT_P (XEXP (op0, 1)))
9102 *cost += extra_cost->alu.log_shift;
9103 else
9104 *cost += extra_cost->alu.log_shift_reg;
9106 else
9107 *cost += extra_cost->alu.logical;
9110 /* In both cases we want to cost both operands. */
9111 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9112 0, speed);
9113 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9114 1, speed);
9116 return true;
9119 return false;
9121 case NOT:
9122 x = XEXP (x, 0);
9123 op0 = aarch64_strip_shift (x);
9125 if (VECTOR_MODE_P (mode))
9127 /* Vector NOT. */
9128 *cost += extra_cost->vect.alu;
9129 return false;
9132 /* MVN-shifted-reg. */
9133 if (op0 != x)
9135 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9137 if (speed)
9138 *cost += extra_cost->alu.log_shift;
9140 return true;
9142 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9143 Handle the second form here taking care that 'a' in the above can
9144 be a shift. */
9145 else if (GET_CODE (op0) == XOR)
9147 rtx newop0 = XEXP (op0, 0);
9148 rtx newop1 = XEXP (op0, 1);
9149 rtx op0_stripped = aarch64_strip_shift (newop0);
9151 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9152 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9154 if (speed)
9156 if (op0_stripped != newop0)
9157 *cost += extra_cost->alu.log_shift;
9158 else
9159 *cost += extra_cost->alu.logical;
9162 return true;
9164 /* MVN. */
9165 if (speed)
9166 *cost += extra_cost->alu.logical;
9168 return false;
9170 case ZERO_EXTEND:
9172 op0 = XEXP (x, 0);
9173 /* If a value is written in SI mode, then zero extended to DI
9174 mode, the operation will in general be free as a write to
9175 a 'w' register implicitly zeroes the upper bits of an 'x'
9176 register. However, if this is
9178 (set (reg) (zero_extend (reg)))
9180 we must cost the explicit register move. */
9181 if (mode == DImode
9182 && GET_MODE (op0) == SImode
9183 && outer == SET)
9185 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9187 /* If OP_COST is non-zero, then the cost of the zero extend
9188 is effectively the cost of the inner operation. Otherwise
9189 we have a MOV instruction and we take the cost from the MOV
9190 itself. This is true independently of whether we are
9191 optimizing for space or time. */
9192 if (op_cost)
9193 *cost = op_cost;
9195 return true;
9197 else if (MEM_P (op0))
9199 /* All loads can zero extend to any size for free. */
9200 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9201 return true;
9204 op0 = aarch64_extend_bitfield_pattern_p (x);
9205 if (op0)
9207 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9208 if (speed)
9209 *cost += extra_cost->alu.bfx;
9210 return true;
9213 if (speed)
9215 if (VECTOR_MODE_P (mode))
9217 /* UMOV. */
9218 *cost += extra_cost->vect.alu;
9220 else
9222 /* We generate an AND instead of UXTB/UXTH. */
9223 *cost += extra_cost->alu.logical;
9226 return false;
9228 case SIGN_EXTEND:
9229 if (MEM_P (XEXP (x, 0)))
9231 /* LDRSH. */
9232 if (speed)
9234 rtx address = XEXP (XEXP (x, 0), 0);
9235 *cost += extra_cost->ldst.load_sign_extend;
9237 *cost +=
9238 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9239 0, speed));
9241 return true;
9244 op0 = aarch64_extend_bitfield_pattern_p (x);
9245 if (op0)
9247 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9248 if (speed)
9249 *cost += extra_cost->alu.bfx;
9250 return true;
9253 if (speed)
9255 if (VECTOR_MODE_P (mode))
9256 *cost += extra_cost->vect.alu;
9257 else
9258 *cost += extra_cost->alu.extend;
9260 return false;
9262 case ASHIFT:
9263 op0 = XEXP (x, 0);
9264 op1 = XEXP (x, 1);
9266 if (CONST_INT_P (op1))
9268 if (speed)
9270 if (VECTOR_MODE_P (mode))
9272 /* Vector shift (immediate). */
9273 *cost += extra_cost->vect.alu;
9275 else
9277 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9278 aliases. */
9279 *cost += extra_cost->alu.shift;
9283 /* We can incorporate zero/sign extend for free. */
9284 if (GET_CODE (op0) == ZERO_EXTEND
9285 || GET_CODE (op0) == SIGN_EXTEND)
9286 op0 = XEXP (op0, 0);
9288 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9289 return true;
9291 else
9293 if (VECTOR_MODE_P (mode))
9295 if (speed)
9296 /* Vector shift (register). */
9297 *cost += extra_cost->vect.alu;
9299 else
9301 if (speed)
9302 /* LSLV. */
9303 *cost += extra_cost->alu.shift_reg;
9305 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9306 && CONST_INT_P (XEXP (op1, 1))
9307 && known_eq (INTVAL (XEXP (op1, 1)),
9308 GET_MODE_BITSIZE (mode) - 1))
9310 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9311 /* We already demanded XEXP (op1, 0) to be REG_P, so
9312 don't recurse into it. */
9313 return true;
9316 return false; /* All arguments need to be in registers. */
9319 case ROTATE:
9320 case ROTATERT:
9321 case LSHIFTRT:
9322 case ASHIFTRT:
9323 op0 = XEXP (x, 0);
9324 op1 = XEXP (x, 1);
9326 if (CONST_INT_P (op1))
9328 /* ASR (immediate) and friends. */
9329 if (speed)
9331 if (VECTOR_MODE_P (mode))
9332 *cost += extra_cost->vect.alu;
9333 else
9334 *cost += extra_cost->alu.shift;
9337 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9338 return true;
9340 else
9342 if (VECTOR_MODE_P (mode))
9344 if (speed)
9345 /* Vector shift (register). */
9346 *cost += extra_cost->vect.alu;
9348 else
9350 if (speed)
9351 /* ASR (register) and friends. */
9352 *cost += extra_cost->alu.shift_reg;
9354 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9355 && CONST_INT_P (XEXP (op1, 1))
9356 && known_eq (INTVAL (XEXP (op1, 1)),
9357 GET_MODE_BITSIZE (mode) - 1))
9359 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9360 /* We already demanded XEXP (op1, 0) to be REG_P, so
9361 don't recurse into it. */
9362 return true;
9365 return false; /* All arguments need to be in registers. */
9368 case SYMBOL_REF:
9370 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9371 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9373 /* LDR. */
9374 if (speed)
9375 *cost += extra_cost->ldst.load;
9377 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9378 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9380 /* ADRP, followed by ADD. */
9381 *cost += COSTS_N_INSNS (1);
9382 if (speed)
9383 *cost += 2 * extra_cost->alu.arith;
9385 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9386 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9388 /* ADR. */
9389 if (speed)
9390 *cost += extra_cost->alu.arith;
9393 if (flag_pic)
9395 /* One extra load instruction, after accessing the GOT. */
9396 *cost += COSTS_N_INSNS (1);
9397 if (speed)
9398 *cost += extra_cost->ldst.load;
9400 return true;
9402 case HIGH:
9403 case LO_SUM:
9404 /* ADRP/ADD (immediate). */
9405 if (speed)
9406 *cost += extra_cost->alu.arith;
9407 return true;
9409 case ZERO_EXTRACT:
9410 case SIGN_EXTRACT:
9411 /* UBFX/SBFX. */
9412 if (speed)
9414 if (VECTOR_MODE_P (mode))
9415 *cost += extra_cost->vect.alu;
9416 else
9417 *cost += extra_cost->alu.bfx;
9420 /* We can trust that the immediates used will be correct (there
9421 are no by-register forms), so we need only cost op0. */
9422 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9423 return true;
9425 case MULT:
9426 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9427 /* aarch64_rtx_mult_cost always handles recursion to its
9428 operands. */
9429 return true;
9431 case MOD:
9432 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9433 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9434 an unconditional negate. This case should only ever be reached through
9435 the set_smod_pow2_cheap check in expmed.c. */
9436 if (CONST_INT_P (XEXP (x, 1))
9437 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9438 && (mode == SImode || mode == DImode))
9440 /* We expand to 4 instructions. Reset the baseline. */
9441 *cost = COSTS_N_INSNS (4);
9443 if (speed)
9444 *cost += 2 * extra_cost->alu.logical
9445 + 2 * extra_cost->alu.arith;
9447 return true;
9450 /* Fall-through. */
9451 case UMOD:
9452 if (speed)
9454 /* Slightly prefer UMOD over SMOD. */
9455 if (VECTOR_MODE_P (mode))
9456 *cost += extra_cost->vect.alu;
9457 else if (GET_MODE_CLASS (mode) == MODE_INT)
9458 *cost += (extra_cost->mult[mode == DImode].add
9459 + extra_cost->mult[mode == DImode].idiv
9460 + (code == MOD ? 1 : 0));
9462 return false; /* All arguments need to be in registers. */
9464 case DIV:
9465 case UDIV:
9466 case SQRT:
9467 if (speed)
9469 if (VECTOR_MODE_P (mode))
9470 *cost += extra_cost->vect.alu;
9471 else if (GET_MODE_CLASS (mode) == MODE_INT)
9472 /* There is no integer SQRT, so only DIV and UDIV can get
9473 here. */
9474 *cost += (extra_cost->mult[mode == DImode].idiv
9475 /* Slightly prefer UDIV over SDIV. */
9476 + (code == DIV ? 1 : 0));
9477 else
9478 *cost += extra_cost->fp[mode == DFmode].div;
9480 return false; /* All arguments need to be in registers. */
9482 case IF_THEN_ELSE:
9483 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9484 XEXP (x, 2), cost, speed);
9486 case EQ:
9487 case NE:
9488 case GT:
9489 case GTU:
9490 case LT:
9491 case LTU:
9492 case GE:
9493 case GEU:
9494 case LE:
9495 case LEU:
9497 return false; /* All arguments must be in registers. */
9499 case FMA:
9500 op0 = XEXP (x, 0);
9501 op1 = XEXP (x, 1);
9502 op2 = XEXP (x, 2);
9504 if (speed)
9506 if (VECTOR_MODE_P (mode))
9507 *cost += extra_cost->vect.alu;
9508 else
9509 *cost += extra_cost->fp[mode == DFmode].fma;
9512 /* FMSUB, FNMADD, and FNMSUB are free. */
9513 if (GET_CODE (op0) == NEG)
9514 op0 = XEXP (op0, 0);
9516 if (GET_CODE (op2) == NEG)
9517 op2 = XEXP (op2, 0);
9519 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9520 and the by-element operand as operand 0. */
9521 if (GET_CODE (op1) == NEG)
9522 op1 = XEXP (op1, 0);
9524 /* Catch vector-by-element operations. The by-element operand can
9525 either be (vec_duplicate (vec_select (x))) or just
9526 (vec_select (x)), depending on whether we are multiplying by
9527 a vector or a scalar.
9529 Canonicalization is not very good in these cases: FMA4 will put the
9530 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9531 if (GET_CODE (op0) == VEC_DUPLICATE)
9532 op0 = XEXP (op0, 0);
9533 else if (GET_CODE (op1) == VEC_DUPLICATE)
9534 op1 = XEXP (op1, 0);
9536 if (GET_CODE (op0) == VEC_SELECT)
9537 op0 = XEXP (op0, 0);
9538 else if (GET_CODE (op1) == VEC_SELECT)
9539 op1 = XEXP (op1, 0);
9541 /* If the remaining parameters are not registers,
9542 get the cost to put them into registers. */
9543 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9544 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9545 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9546 return true;
9548 case FLOAT:
9549 case UNSIGNED_FLOAT:
9550 if (speed)
9551 *cost += extra_cost->fp[mode == DFmode].fromint;
9552 return false;
9554 case FLOAT_EXTEND:
9555 if (speed)
9557 if (VECTOR_MODE_P (mode))
9559 /* Vector widening conversion. */
9560 *cost += extra_cost->vect.alu;
9562 else
9563 *cost += extra_cost->fp[mode == DFmode].widen;
9565 return false;
9567 case FLOAT_TRUNCATE:
9568 if (speed)
9570 if (VECTOR_MODE_P (mode))
9572 /* Vector narrowing conversion. */
9573 *cost += extra_cost->vect.alu;
9575 else
9576 *cost += extra_cost->fp[mode == DFmode].narrow;
9578 return false;
9580 case FIX:
9581 case UNSIGNED_FIX:
9582 x = XEXP (x, 0);
9583 /* Strip the rounding part. They will all be implemented
9584 by the fcvt* family of instructions anyway. */
9585 if (GET_CODE (x) == UNSPEC)
9587 unsigned int uns_code = XINT (x, 1);
9589 if (uns_code == UNSPEC_FRINTA
9590 || uns_code == UNSPEC_FRINTM
9591 || uns_code == UNSPEC_FRINTN
9592 || uns_code == UNSPEC_FRINTP
9593 || uns_code == UNSPEC_FRINTZ)
9594 x = XVECEXP (x, 0, 0);
9597 if (speed)
9599 if (VECTOR_MODE_P (mode))
9600 *cost += extra_cost->vect.alu;
9601 else
9602 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9605 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9606 fixed-point fcvt. */
9607 if (GET_CODE (x) == MULT
9608 && ((VECTOR_MODE_P (mode)
9609 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9610 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9612 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9613 0, speed);
9614 return true;
9617 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9618 return true;
9620 case ABS:
9621 if (VECTOR_MODE_P (mode))
9623 /* ABS (vector). */
9624 if (speed)
9625 *cost += extra_cost->vect.alu;
9627 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9629 op0 = XEXP (x, 0);
9631 /* FABD, which is analogous to FADD. */
9632 if (GET_CODE (op0) == MINUS)
9634 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9635 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9636 if (speed)
9637 *cost += extra_cost->fp[mode == DFmode].addsub;
9639 return true;
9641 /* Simple FABS is analogous to FNEG. */
9642 if (speed)
9643 *cost += extra_cost->fp[mode == DFmode].neg;
9645 else
9647 /* Integer ABS will either be split to
9648 two arithmetic instructions, or will be an ABS
9649 (scalar), which we don't model. */
9650 *cost = COSTS_N_INSNS (2);
9651 if (speed)
9652 *cost += 2 * extra_cost->alu.arith;
9654 return false;
9656 case SMAX:
9657 case SMIN:
9658 if (speed)
9660 if (VECTOR_MODE_P (mode))
9661 *cost += extra_cost->vect.alu;
9662 else
9664 /* FMAXNM/FMINNM/FMAX/FMIN.
9665 TODO: This may not be accurate for all implementations, but
9666 we do not model this in the cost tables. */
9667 *cost += extra_cost->fp[mode == DFmode].addsub;
9670 return false;
9672 case UNSPEC:
9673 /* The floating point round to integer frint* instructions. */
9674 if (aarch64_frint_unspec_p (XINT (x, 1)))
9676 if (speed)
9677 *cost += extra_cost->fp[mode == DFmode].roundint;
9679 return false;
9682 if (XINT (x, 1) == UNSPEC_RBIT)
9684 if (speed)
9685 *cost += extra_cost->alu.rev;
9687 return false;
9689 break;
9691 case TRUNCATE:
9693 /* Decompose <su>muldi3_highpart. */
9694 if (/* (truncate:DI */
9695 mode == DImode
9696 /* (lshiftrt:TI */
9697 && GET_MODE (XEXP (x, 0)) == TImode
9698 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9699 /* (mult:TI */
9700 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9701 /* (ANY_EXTEND:TI (reg:DI))
9702 (ANY_EXTEND:TI (reg:DI))) */
9703 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9704 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9705 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9706 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9707 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9708 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9709 /* (const_int 64) */
9710 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9711 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9713 /* UMULH/SMULH. */
9714 if (speed)
9715 *cost += extra_cost->mult[mode == DImode].extend;
9716 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9717 mode, MULT, 0, speed);
9718 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9719 mode, MULT, 1, speed);
9720 return true;
9723 /* Fall through. */
9724 default:
9725 break;
9728 if (dump_file
9729 && flag_aarch64_verbose_cost)
9730 fprintf (dump_file,
9731 "\nFailed to cost RTX. Assuming default cost.\n");
9733 return true;
9736 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9737 calculated for X. This cost is stored in *COST. Returns true
9738 if the total cost of X was calculated. */
9739 static bool
9740 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9741 int param, int *cost, bool speed)
9743 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9745 if (dump_file
9746 && flag_aarch64_verbose_cost)
9748 print_rtl_single (dump_file, x);
9749 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9750 speed ? "Hot" : "Cold",
9751 *cost, result ? "final" : "partial");
9754 return result;
9757 static int
9758 aarch64_register_move_cost (machine_mode mode,
9759 reg_class_t from_i, reg_class_t to_i)
9761 enum reg_class from = (enum reg_class) from_i;
9762 enum reg_class to = (enum reg_class) to_i;
9763 const struct cpu_regmove_cost *regmove_cost
9764 = aarch64_tune_params.regmove_cost;
9766 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9767 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9768 to = GENERAL_REGS;
9770 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9771 from = GENERAL_REGS;
9773 /* Moving between GPR and stack cost is the same as GP2GP. */
9774 if ((from == GENERAL_REGS && to == STACK_REG)
9775 || (to == GENERAL_REGS && from == STACK_REG))
9776 return regmove_cost->GP2GP;
9778 /* To/From the stack register, we move via the gprs. */
9779 if (to == STACK_REG || from == STACK_REG)
9780 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9781 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9783 if (known_eq (GET_MODE_SIZE (mode), 16))
9785 /* 128-bit operations on general registers require 2 instructions. */
9786 if (from == GENERAL_REGS && to == GENERAL_REGS)
9787 return regmove_cost->GP2GP * 2;
9788 else if (from == GENERAL_REGS)
9789 return regmove_cost->GP2FP * 2;
9790 else if (to == GENERAL_REGS)
9791 return regmove_cost->FP2GP * 2;
9793 /* When AdvSIMD instructions are disabled it is not possible to move
9794 a 128-bit value directly between Q registers. This is handled in
9795 secondary reload. A general register is used as a scratch to move
9796 the upper DI value and the lower DI value is moved directly,
9797 hence the cost is the sum of three moves. */
9798 if (! TARGET_SIMD)
9799 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9801 return regmove_cost->FP2FP;
9804 if (from == GENERAL_REGS && to == GENERAL_REGS)
9805 return regmove_cost->GP2GP;
9806 else if (from == GENERAL_REGS)
9807 return regmove_cost->GP2FP;
9808 else if (to == GENERAL_REGS)
9809 return regmove_cost->FP2GP;
9811 return regmove_cost->FP2FP;
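/* As a worked example with hypothetical per-core costs GP2GP = 1,
   GP2FP = 5, FP2GP = 6 and FP2FP = 2, a 16-byte move costs:

     GPR -> GPR  2 * GP2GP = 2   (two X-register moves)
     GPR -> FPR  2 * GP2FP = 10
     FPR -> GPR  2 * FP2GP = 12
     FPR -> FPR  FP2FP = 2 with SIMD enabled, otherwise
                 GP2FP + FP2GP + FP2FP = 13 via a GPR scratch.

   Smaller modes fall through to the single-move costs below.  */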
9814 static int
9815 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9816 reg_class_t rclass ATTRIBUTE_UNUSED,
9817 bool in ATTRIBUTE_UNUSED)
9819 return aarch64_tune_params.memmov_cost;
9822 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9823 to optimize 1.0/sqrt. */
9825 static bool
9826 use_rsqrt_p (machine_mode mode)
9828 return (!flag_trapping_math
9829 && flag_unsafe_math_optimizations
9830 && ((aarch64_tune_params.approx_modes->recip_sqrt
9831 & AARCH64_APPROX_MODE (mode))
9832 || flag_mrecip_low_precision_sqrt));
9835 /* Function to decide when to use the approximate reciprocal square root
9836 builtin. */
9838 static tree
9839 aarch64_builtin_reciprocal (tree fndecl)
9841 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9843 if (!use_rsqrt_p (mode))
9844 return NULL_TREE;
9845 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9848 typedef rtx (*rsqrte_type) (rtx, rtx);
9850 /* Select reciprocal square root initial estimate insn depending on machine
9851 mode. */
9853 static rsqrte_type
9854 get_rsqrte_type (machine_mode mode)
9856 switch (mode)
9858 case E_DFmode: return gen_aarch64_rsqrtedf;
9859 case E_SFmode: return gen_aarch64_rsqrtesf;
9860 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9861 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9862 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9863 default: gcc_unreachable ();
9867 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9869 /* Select reciprocal square root series step insn depending on machine mode. */
9871 static rsqrts_type
9872 get_rsqrts_type (machine_mode mode)
9874 switch (mode)
9876 case E_DFmode: return gen_aarch64_rsqrtsdf;
9877 case E_SFmode: return gen_aarch64_rsqrtssf;
9878 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9879 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9880 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9881 default: gcc_unreachable ();
9885 /* Emit instruction sequence to compute either the approximate square root
9886 or its approximate reciprocal, depending on the flag RECP, and return
9887 whether the sequence was emitted or not. */
9889 bool
9890 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9892 machine_mode mode = GET_MODE (dst);
9894 if (GET_MODE_INNER (mode) == HFmode)
9896 gcc_assert (!recp);
9897 return false;
9900 if (!recp)
9902 if (!(flag_mlow_precision_sqrt
9903 || (aarch64_tune_params.approx_modes->sqrt
9904 & AARCH64_APPROX_MODE (mode))))
9905 return false;
9907 if (flag_finite_math_only
9908 || flag_trapping_math
9909 || !flag_unsafe_math_optimizations
9910 || optimize_function_for_size_p (cfun))
9911 return false;
9913 else
9914 /* Caller assumes we cannot fail. */
9915 gcc_assert (use_rsqrt_p (mode));
9917 machine_mode mmsk = mode_for_int_vector (mode).require ();
9918 rtx xmsk = gen_reg_rtx (mmsk);
9919 if (!recp)
9920 /* When calculating the approximate square root, compare the
9921 argument with 0.0 and create a mask. */
9922 emit_insn (gen_rtx_SET (xmsk,
9923 gen_rtx_NEG (mmsk,
9924 gen_rtx_EQ (mmsk, src,
9925 CONST0_RTX (mode)))));
9927 /* Estimate the approximate reciprocal square root. */
9928 rtx xdst = gen_reg_rtx (mode);
9929 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9931 /* Iterate over the series twice for SF and thrice for DF. */
9932 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9934 /* Optionally run one fewer iteration of the series for faster performance,
9935 at the expense of some accuracy. */
9936 if ((recp && flag_mrecip_low_precision_sqrt)
9937 || (!recp && flag_mlow_precision_sqrt))
9938 iterations--;
9940 /* Iterate over the series to calculate the approximate reciprocal square
9941 root. */
9942 rtx x1 = gen_reg_rtx (mode);
9943 while (iterations--)
9945 rtx x2 = gen_reg_rtx (mode);
9946 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9948 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9950 if (iterations > 0)
9951 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9954 if (!recp)
9956 /* Qualify the approximate reciprocal square root when the argument is
9957 0.0 by squashing the intermediary result to 0.0. */
9958 rtx xtmp = gen_reg_rtx (mmsk);
9959 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9960 gen_rtx_SUBREG (mmsk, xdst, 0)));
9961 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9963 /* Calculate the approximate square root. */
9964 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9967 /* Finalize the approximation. */
9968 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9970 return true;
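/* The sequence above is the Newton-Raphson iteration for 1/sqrt(a):
   starting from the FRSQRTE estimate x0, each step refines

     x_{n+1} = x_n * (3 - a * x_n^2) / 2

   where the (3 - a*b)/2 term is what the FRSQRTS step instruction
   computes.  For the non-reciprocal case the final multiply by the
   source gives sqrt(a) = a * 1/sqrt(a), with the mask computed above
   forcing the result to 0.0 for a zero input.  */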
9973 typedef rtx (*recpe_type) (rtx, rtx);
9975 /* Select reciprocal initial estimate insn depending on machine mode. */
9977 static recpe_type
9978 get_recpe_type (machine_mode mode)
9980 switch (mode)
9982 case E_SFmode: return (gen_aarch64_frecpesf);
9983 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9984 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9985 case E_DFmode: return (gen_aarch64_frecpedf);
9986 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9987 default: gcc_unreachable ();
9991 typedef rtx (*recps_type) (rtx, rtx, rtx);
9993 /* Select reciprocal series step insn depending on machine mode. */
9995 static recps_type
9996 get_recps_type (machine_mode mode)
9998 switch (mode)
10000 case E_SFmode: return (gen_aarch64_frecpssf);
10001 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
10002 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
10003 case E_DFmode: return (gen_aarch64_frecpsdf);
10004 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
10005 default: gcc_unreachable ();
10009 /* Emit the instruction sequence to compute the approximation for the division
10010 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10012 bool
10013 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10015 machine_mode mode = GET_MODE (quo);
10017 if (GET_MODE_INNER (mode) == HFmode)
10018 return false;
10020 bool use_approx_division_p = (flag_mlow_precision_div
10021 || (aarch64_tune_params.approx_modes->division
10022 & AARCH64_APPROX_MODE (mode)));
10024 if (!flag_finite_math_only
10025 || flag_trapping_math
10026 || !flag_unsafe_math_optimizations
10027 || optimize_function_for_size_p (cfun)
10028 || !use_approx_division_p)
10029 return false;
10031 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10032 return false;
10034 /* Estimate the approximate reciprocal. */
10035 rtx xrcp = gen_reg_rtx (mode);
10036 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10038 /* Iterate over the series twice for SF and thrice for DF. */
10039 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10041 /* Optionally run one fewer iteration of the series for faster performance,
10042 at the expense of some accuracy. */
10043 if (flag_mlow_precision_div)
10044 iterations--;
10046 /* Iterate over the series to calculate the approximate reciprocal. */
10047 rtx xtmp = gen_reg_rtx (mode);
10048 while (iterations--)
10050 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10052 if (iterations > 0)
10053 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10056 if (num != CONST1_RTX (mode))
10058 /* As the approximate reciprocal of DEN is already calculated, only
10059 calculate the approximate division when NUM is not 1.0. */
10060 rtx xnum = force_reg (mode, num);
10061 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10064 /* Finalize the approximation. */
10065 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10066 return true;
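/* Likewise, the sequence above is the Newton-Raphson iteration for the
   reciprocal 1/den: starting from the FRECPE estimate x0, each step
   refines

     x_{n+1} = x_n * (2 - den * x_n)

   where the (2 - a*b) term is what the FRECPS step instruction
   computes; the quotient is then formed as num * (1/den).  */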
10069 /* Return the number of instructions that can be issued per cycle. */
10070 static int
10071 aarch64_sched_issue_rate (void)
10073 return aarch64_tune_params.issue_rate;
10076 static int
10077 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10079 int issue_rate = aarch64_sched_issue_rate ();
10081 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10085 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10086 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10087 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10089 static int
10090 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10091 int ready_index)
10093 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10097 /* Vectorizer cost model target hooks. */
10099 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10100 static int
10101 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10102 tree vectype,
10103 int misalign ATTRIBUTE_UNUSED)
10105 unsigned elements;
10106 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10107 bool fp = false;
10109 if (vectype != NULL)
10110 fp = FLOAT_TYPE_P (vectype);
10112 switch (type_of_cost)
10114 case scalar_stmt:
10115 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10117 case scalar_load:
10118 return costs->scalar_load_cost;
10120 case scalar_store:
10121 return costs->scalar_store_cost;
10123 case vector_stmt:
10124 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10126 case vector_load:
10127 return costs->vec_align_load_cost;
10129 case vector_store:
10130 return costs->vec_store_cost;
10132 case vec_to_scalar:
10133 return costs->vec_to_scalar_cost;
10135 case scalar_to_vec:
10136 return costs->scalar_to_vec_cost;
10138 case unaligned_load:
10139 case vector_gather_load:
10140 return costs->vec_unalign_load_cost;
10142 case unaligned_store:
10143 case vector_scatter_store:
10144 return costs->vec_unalign_store_cost;
10146 case cond_branch_taken:
10147 return costs->cond_taken_branch_cost;
10149 case cond_branch_not_taken:
10150 return costs->cond_not_taken_branch_cost;
10152 case vec_perm:
10153 return costs->vec_permute_cost;
10155 case vec_promote_demote:
10156 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10158 case vec_construct:
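      /* Building a vector from scalars is costed as elements / 2 + 1,
         e.g. an estimated 4-element vector costs 4 / 2 + 1 = 3.  */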
10159 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10160 return elements / 2 + 1;
10162 default:
10163 gcc_unreachable ();
10167 /* Implement targetm.vectorize.add_stmt_cost. */
10168 static unsigned
10169 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10170 struct _stmt_vec_info *stmt_info, int misalign,
10171 enum vect_cost_model_location where)
10173 unsigned *cost = (unsigned *) data;
10174 unsigned retval = 0;
10176 if (flag_vect_cost_model)
10178 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10179 int stmt_cost =
10180 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10182 /* Statements in an inner loop relative to the loop being
10183 vectorized are weighted more heavily. The value here is
10184 arbitrary and could potentially be improved with analysis. */
10185 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10186 count *= 50; /* FIXME */
10188 retval = (unsigned) (count * stmt_cost);
10189 cost[where] += retval;
10192 return retval;
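/* For example, with a hypothetical vec_fp_stmt_cost of 2, one vector FP
   statement in the loop body adds 1 * 2 = 2 to cost[vect_body], but
   1 * 50 * 2 = 100 if it belongs to a loop nested inside the loop being
   vectorized.  */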
10195 static void initialize_aarch64_code_model (struct gcc_options *);
10197 /* Parse the TO_PARSE string and put the architecture struct that it
10198 selects into RES and the architectural features into ISA_FLAGS.
10199 Return an aarch64_parse_opt_result describing the parse result.
10200 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10202 static enum aarch64_parse_opt_result
10203 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10204 unsigned long *isa_flags)
10206 char *ext;
10207 const struct processor *arch;
10208 char *str = (char *) alloca (strlen (to_parse) + 1);
10209 size_t len;
10211 strcpy (str, to_parse);
10213 ext = strchr (str, '+');
10215 if (ext != NULL)
10216 len = ext - str;
10217 else
10218 len = strlen (str);
10220 if (len == 0)
10221 return AARCH64_PARSE_MISSING_ARG;
10224 /* Loop through the list of supported ARCHes to find a match. */
10225 for (arch = all_architectures; arch->name != NULL; arch++)
10227 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10229 unsigned long isa_temp = arch->flags;
10231 if (ext != NULL)
10233 /* TO_PARSE string contains at least one extension. */
10234 enum aarch64_parse_opt_result ext_res
10235 = aarch64_parse_extension (ext, &isa_temp);
10237 if (ext_res != AARCH64_PARSE_OK)
10238 return ext_res;
10240 /* Extension parsing was successful. Confirm the result
10241 arch and ISA flags. */
10242 *res = arch;
10243 *isa_flags = isa_temp;
10244 return AARCH64_PARSE_OK;
10248 /* ARCH name not found in list. */
10249 return AARCH64_PARSE_INVALID_ARG;
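/* For example, a string such as "armv8-a+crc" is split at the first '+':
   "armv8-a" is looked up in all_architectures and the remaining "+crc"
   is handed to aarch64_parse_extension to adjust the ISA flags.  */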
10252 /* Parse the TO_PARSE string and put the CPU it selects into RES and the
10253 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10254 describing the parse result. If there is an error parsing, RES and
10255 ISA_FLAGS are left unchanged. */
10257 static enum aarch64_parse_opt_result
10258 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10259 unsigned long *isa_flags)
10261 char *ext;
10262 const struct processor *cpu;
10263 char *str = (char *) alloca (strlen (to_parse) + 1);
10264 size_t len;
10266 strcpy (str, to_parse);
10268 ext = strchr (str, '+');
10270 if (ext != NULL)
10271 len = ext - str;
10272 else
10273 len = strlen (str);
10275 if (len == 0)
10276 return AARCH64_PARSE_MISSING_ARG;
10279 /* Loop through the list of supported CPUs to find a match. */
10280 for (cpu = all_cores; cpu->name != NULL; cpu++)
10282 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10284 unsigned long isa_temp = cpu->flags;
10287 if (ext != NULL)
10289 /* TO_PARSE string contains at least one extension. */
10290 enum aarch64_parse_opt_result ext_res
10291 = aarch64_parse_extension (ext, &isa_temp);
10293 if (ext_res != AARCH64_PARSE_OK)
10294 return ext_res;
10296 /* Extension parsing was successful. Confirm the result
10297 cpu and ISA flags. */
10298 *res = cpu;
10299 *isa_flags = isa_temp;
10300 return AARCH64_PARSE_OK;
10304 /* CPU name not found in list. */
10305 return AARCH64_PARSE_INVALID_ARG;
10308 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10309 Return an aarch64_parse_opt_result describing the parse result.
10310 If the parsing fails, RES does not change. */
10312 static enum aarch64_parse_opt_result
10313 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10315 const struct processor *cpu;
10316 char *str = (char *) alloca (strlen (to_parse) + 1);
10318 strcpy (str, to_parse);
10320 /* Loop through the list of supported CPUs to find a match. */
10321 for (cpu = all_cores; cpu->name != NULL; cpu++)
10323 if (strcmp (cpu->name, str) == 0)
10325 *res = cpu;
10326 return AARCH64_PARSE_OK;
10330 /* CPU name not found in list. */
10331 return AARCH64_PARSE_INVALID_ARG;
10334 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10335 described in FLAG. If it is, return the index bit for that fusion type.
10336 If not, report an error (printing OPTION_NAME) and return zero.
10338 static unsigned int
10339 aarch64_parse_one_option_token (const char *token,
10340 size_t length,
10341 const struct aarch64_flag_desc *flag,
10342 const char *option_name)
10344 for (; flag->name != NULL; flag++)
10346 if (length == strlen (flag->name)
10347 && !strncmp (flag->name, token, length))
10348 return flag->flag;
10351 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10352 return 0;
10355 /* Parse OPTION which is a comma-separated list of flags to enable.
10356 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10357 default state we inherit from the CPU tuning structures. OPTION_NAME
10358 gives the top-level option we are parsing in the -moverride string,
10359 for use in error messages. */
10361 static unsigned int
10362 aarch64_parse_boolean_options (const char *option,
10363 const struct aarch64_flag_desc *flags,
10364 unsigned int initial_state,
10365 const char *option_name)
10367 const char separator = '.';
10368 const char* specs = option;
10369 const char* ntoken = option;
10370 unsigned int found_flags = initial_state;
10372 while ((ntoken = strchr (specs, separator)))
10374 size_t token_length = ntoken - specs;
10375 unsigned token_ops = aarch64_parse_one_option_token (specs,
10376 token_length,
10377 flags,
10378 option_name);
10379 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10380 in the token stream, reset the supported operations. So:
10382 adrp+add.cmp+branch.none.adrp+add
10384 would have the result of turning on only adrp+add fusion. */
10385 if (!token_ops)
10386 found_flags = 0;
10388 found_flags |= token_ops;
10389 specs = ++ntoken;
10392 /* If we ended with a trailing separator, the string is ill-formed. */
10393 if (!(*specs))
10395 error ("%s string ill-formed\n", option_name);
10396 return 0;
10399 /* We still have one more token to parse. */
10400 size_t token_length = strlen (specs);
10401 unsigned token_ops = aarch64_parse_one_option_token (specs,
10402 token_length,
10403 flags,
10404 option_name);
10405 if (!token_ops)
10406 found_flags = 0;
10408 found_flags |= token_ops;
10409 return found_flags;
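/* Walking through the example above, "adrp+add.cmp+branch.none.adrp+add"
   ORs in the adrp+add and cmp+branch bits, hits "none" (a zero mask) and
   resets found_flags to 0, then ORs in adrp+add again, leaving only that
   fusion enabled.  */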
10412 /* Support for overriding instruction fusion. */
10414 static void
10415 aarch64_parse_fuse_string (const char *fuse_string,
10416 struct tune_params *tune)
10418 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10419 aarch64_fusible_pairs,
10420 tune->fusible_ops,
10421 "fuse=");
10424 /* Support for overriding other tuning flags. */
10426 static void
10427 aarch64_parse_tune_string (const char *tune_string,
10428 struct tune_params *tune)
10430 tune->extra_tuning_flags
10431 = aarch64_parse_boolean_options (tune_string,
10432 aarch64_tuning_flags,
10433 tune->extra_tuning_flags,
10434 "tune=");
10437 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10438 we understand. If it is, extract the option string and hand it off to
10439 the appropriate function. */
10441 void
10442 aarch64_parse_one_override_token (const char* token,
10443 size_t length,
10444 struct tune_params *tune)
10446 const struct aarch64_tuning_override_function *fn
10447 = aarch64_tuning_override_functions;
10449 const char *option_part = strchr (token, '=');
10450 if (!option_part)
10452 error ("tuning string missing in option (%s)", token);
10453 return;
10456 /* Get the length of the option name. */
10457 length = option_part - token;
10458 /* Skip the '=' to get to the option string. */
10459 option_part++;
10461 for (; fn->name != NULL; fn++)
10463 if (!strncmp (fn->name, token, length))
10465 fn->parse_override (option_part, tune);
10466 return;
10470 error ("unknown tuning option (%s)",token);
10471 return;
10474 /* Validate and clamp the TLS size according to the selected code model. */
10476 static void
10477 initialize_aarch64_tls_size (struct gcc_options *opts)
10479 if (aarch64_tls_size == 0)
10480 aarch64_tls_size = 24;
10482 switch (opts->x_aarch64_cmodel_var)
10484 case AARCH64_CMODEL_TINY:
10485 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10486 needs two instructions to address, so we clamp the size to 24 bits. */
10487 if (aarch64_tls_size > 24)
10488 aarch64_tls_size = 24;
10489 break;
10490 case AARCH64_CMODEL_SMALL:
10491 /* The maximum TLS size allowed under small is 4G. */
10492 if (aarch64_tls_size > 32)
10493 aarch64_tls_size = 32;
10494 break;
10495 case AARCH64_CMODEL_LARGE:
10496 /* The maximum TLS size allowed under large is 16E.
10497 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10498 if (aarch64_tls_size > 48)
10499 aarch64_tls_size = 48;
10500 break;
10501 default:
10502 gcc_unreachable ();
10505 return;
10508 /* Parse STRING looking for options in the format:
10509 string :: option:string
10510 option :: name=substring
10511 name :: {a-z}
10512 substring :: defined by option. */
10514 static void
10515 aarch64_parse_override_string (const char* input_string,
10516 struct tune_params* tune)
10518 const char separator = ':';
10519 size_t string_length = strlen (input_string) + 1;
10520 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10521 char *string = string_root;
10522 strncpy (string, input_string, string_length);
10523 string[string_length - 1] = '\0';
10525 char* ntoken = string;
10527 while ((ntoken = strchr (string, separator)))
10529 size_t token_length = ntoken - string;
10530 /* Make this substring look like a string. */
10531 *ntoken = '\0';
10532 aarch64_parse_one_override_token (string, token_length, tune);
10533 string = ++ntoken;
10536 /* One last option to parse. */
10537 aarch64_parse_one_override_token (string, strlen (string), tune);
10538 free (string_root);
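/* A complete -moverride string is therefore a ':'-separated list of
   name=value options, each value being a '.'-separated flag list, for
   example (flag names illustrative):

     -moverride=fuse=adrp+add.cmp+branch:tune=<tuning-flag>

   where "fuse=" and "tune=" dispatch to aarch64_parse_fuse_string and
   aarch64_parse_tune_string above, and <tuning-flag> stands for an entry
   in aarch64_tuning_flags.  */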
10542 static void
10543 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10545 /* PR 70044: We have to be careful about being called multiple times for the
10546 same function. This means all changes should be repeatable. */
10548 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10549 Disable the frame pointer flag so the mid-end will not use a frame
10550 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10551 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10552 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10553 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10554 if (opts->x_flag_omit_frame_pointer == 0)
10555 opts->x_flag_omit_frame_pointer = 2;
10557 /* If not optimizing for size, set the default
10558 alignment to what the target wants. */
10559 if (!opts->x_optimize_size)
10561 if (opts->x_align_loops <= 0)
10562 opts->x_align_loops = aarch64_tune_params.loop_align;
10563 if (opts->x_align_jumps <= 0)
10564 opts->x_align_jumps = aarch64_tune_params.jump_align;
10565 if (opts->x_align_functions <= 0)
10566 opts->x_align_functions = aarch64_tune_params.function_align;
10569 /* We default to no pc-relative literal loads. */
10571 aarch64_pcrelative_literal_loads = false;
10573 /* If -mpc-relative-literal-loads is set on the command line, this
10574 implies that the user asked for PC relative literal loads. */
10575 if (opts->x_pcrelative_literal_loads == 1)
10576 aarch64_pcrelative_literal_loads = true;
10578 /* In the tiny memory model it makes no sense to disallow PC relative
10579 literal pool loads. */
10580 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10581 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10582 aarch64_pcrelative_literal_loads = true;
10584 /* When enabling the lower precision Newton series for the square root, also
10585 enable it for the reciprocal square root, since the latter is an
10586 intermediary step for the former. */
10587 if (flag_mlow_precision_sqrt)
10588 flag_mrecip_low_precision_sqrt = true;
10591 /* 'Unpack' the internal tuning structs and update the options
10592 in OPTS. The caller must have set up selected_tune and selected_arch
10593 as all the other target-specific codegen decisions are
10594 derived from them. */
10596 void
10597 aarch64_override_options_internal (struct gcc_options *opts)
10599 aarch64_tune_flags = selected_tune->flags;
10600 aarch64_tune = selected_tune->sched_core;
10601 /* Make a copy of the tuning parameters attached to the core, which
10602 we may later overwrite. */
10603 aarch64_tune_params = *(selected_tune->tune);
10604 aarch64_architecture_version = selected_arch->architecture_version;
10606 if (opts->x_aarch64_override_tune_string)
10607 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10608 &aarch64_tune_params);
10610 /* This target defaults to strict volatile bitfields. */
10611 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10612 opts->x_flag_strict_volatile_bitfields = 1;
10614 initialize_aarch64_code_model (opts);
10615 initialize_aarch64_tls_size (opts);
10617 int queue_depth = 0;
10618 switch (aarch64_tune_params.autoprefetcher_model)
10620 case tune_params::AUTOPREFETCHER_OFF:
10621 queue_depth = -1;
10622 break;
10623 case tune_params::AUTOPREFETCHER_WEAK:
10624 queue_depth = 0;
10625 break;
10626 case tune_params::AUTOPREFETCHER_STRONG:
10627 queue_depth = max_insn_queue_index + 1;
10628 break;
10629 default:
10630 gcc_unreachable ();
10633 /* We don't mind passing in global_options_set here as we don't use
10634 the *options_set structs anyway. */
10635 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10636 queue_depth,
10637 opts->x_param_values,
10638 global_options_set.x_param_values);
10640 /* Set up parameters to be used in prefetching algorithm. Do not
10641 override the defaults unless we are tuning for a core we have
10642 researched values for. */
10643 if (aarch64_tune_params.prefetch->num_slots > 0)
10644 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10645 aarch64_tune_params.prefetch->num_slots,
10646 opts->x_param_values,
10647 global_options_set.x_param_values);
10648 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10649 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10650 aarch64_tune_params.prefetch->l1_cache_size,
10651 opts->x_param_values,
10652 global_options_set.x_param_values);
10653 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10654 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10655 aarch64_tune_params.prefetch->l1_cache_line_size,
10656 opts->x_param_values,
10657 global_options_set.x_param_values);
10658 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10659 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10660 aarch64_tune_params.prefetch->l2_cache_size,
10661 opts->x_param_values,
10662 global_options_set.x_param_values);
10663 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10664 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10666 opts->x_param_values,
10667 global_options_set.x_param_values);
10668 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10669 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10670 aarch64_tune_params.prefetch->minimum_stride,
10671 opts->x_param_values,
10672 global_options_set.x_param_values);
10674 /* Use the alternative scheduling-pressure algorithm by default. */
10675 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10676 opts->x_param_values,
10677 global_options_set.x_param_values);
10679 /* Enable software prefetching at the specified optimization level for
10680 CPUs that have prefetch tuning data. Lower the optimization level
10681 threshold by 1 when profiling is enabled. */
10682 if (opts->x_flag_prefetch_loop_arrays < 0
10683 && !opts->x_optimize_size
10684 && aarch64_tune_params.prefetch->default_opt_level >= 0
10685 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10686 opts->x_flag_prefetch_loop_arrays = 1;
10688 aarch64_override_options_after_change_1 (opts);
10691 /* Print a hint with a suggestion for a core or architecture name that
10692 most closely resembles what the user passed in STR. ARCH is true if
10693 the user is asking for an architecture name. ARCH is false if the user
10694 is asking for a core name. */
10696 static void
10697 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10699 auto_vec<const char *> candidates;
10700 const struct processor *entry = arch ? all_architectures : all_cores;
10701 for (; entry->name != NULL; entry++)
10702 candidates.safe_push (entry->name);
10704 #ifdef HAVE_LOCAL_CPU_DETECT
10705 /* Also add "native" as a possible value. */
10706 if (arch)
10707 candidates.safe_push ("native");
10708 #endif
10710 char *s;
10711 const char *hint = candidates_list_and_hint (str, s, candidates);
10712 if (hint)
10713 inform (input_location, "valid arguments are: %s;"
10714 " did you mean %qs?", s, hint);
10715 else
10716 inform (input_location, "valid arguments are: %s", s);
10718 XDELETEVEC (s);
10721 /* Print a hint with a suggestion for a core name that most closely resembles
10722 what the user passed in STR. */
10724 inline static void
10725 aarch64_print_hint_for_core (const char *str)
10727 aarch64_print_hint_for_core_or_arch (str, false);
10730 /* Print a hint with a suggestion for an architecture name that most closely
10731 resembles what the user passed in STR. */
10733 inline static void
10734 aarch64_print_hint_for_arch (const char *str)
10736 aarch64_print_hint_for_core_or_arch (str, true);
10739 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10740 specified in STR and throw errors if appropriate. Put the results if
10741 they are valid in RES and ISA_FLAGS. Return whether the option is
10742 valid. */
10744 static bool
10745 aarch64_validate_mcpu (const char *str, const struct processor **res,
10746 unsigned long *isa_flags)
10748 enum aarch64_parse_opt_result parse_res
10749 = aarch64_parse_cpu (str, res, isa_flags);
10751 if (parse_res == AARCH64_PARSE_OK)
10752 return true;
10754 switch (parse_res)
10756 case AARCH64_PARSE_MISSING_ARG:
10757 error ("missing cpu name in %<-mcpu=%s%>", str);
10758 break;
10759 case AARCH64_PARSE_INVALID_ARG:
10760 error ("unknown value %qs for -mcpu", str);
10761 aarch64_print_hint_for_core (str);
10762 break;
10763 case AARCH64_PARSE_INVALID_FEATURE:
10764 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10765 break;
10766 default:
10767 gcc_unreachable ();
10770 return false;
10773 /* Validate a command-line -march option. Parse the arch and extensions
10774 (if any) specified in STR and throw errors if appropriate. Put the
10775 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10776 option is valid. */
10778 static bool
10779 aarch64_validate_march (const char *str, const struct processor **res,
10780 unsigned long *isa_flags)
10782 enum aarch64_parse_opt_result parse_res
10783 = aarch64_parse_arch (str, res, isa_flags);
10785 if (parse_res == AARCH64_PARSE_OK)
10786 return true;
10788 switch (parse_res)
10790 case AARCH64_PARSE_MISSING_ARG:
10791 error ("missing arch name in %<-march=%s%>", str);
10792 break;
10793 case AARCH64_PARSE_INVALID_ARG:
10794 error ("unknown value %qs for -march", str);
10795 aarch64_print_hint_for_arch (str);
10796 break;
10797 case AARCH64_PARSE_INVALID_FEATURE:
10798 error ("invalid feature modifier in %<-march=%s%>", str);
10799 break;
10800 default:
10801 gcc_unreachable ();
10804 return false;
10807 /* Validate a command-line -mtune option. Parse the cpu
10808 specified in STR and throw errors if appropriate. Put the
10809 result, if it is valid, in RES. Return whether the option is
10810 valid. */
10812 static bool
10813 aarch64_validate_mtune (const char *str, const struct processor **res)
10815 enum aarch64_parse_opt_result parse_res
10816 = aarch64_parse_tune (str, res);
10818 if (parse_res == AARCH64_PARSE_OK)
10819 return true;
10821 switch (parse_res)
10823 case AARCH64_PARSE_MISSING_ARG:
10824 error ("missing cpu name in %<-mtune=%s%>", str);
10825 break;
10826 case AARCH64_PARSE_INVALID_ARG:
10827 error ("unknown value %qs for -mtune", str);
10828 aarch64_print_hint_for_core (str);
10829 break;
10830 default:
10831 gcc_unreachable ();
10833 return false;
10836 /* Return the CPU corresponding to the enum CPU.
10837 If it doesn't specify a cpu, return the default. */
10839 static const struct processor *
10840 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10842 if (cpu != aarch64_none)
10843 return &all_cores[cpu];
10845 /* The & 0x3f is to extract the bottom 6 bits that encode the
10846 default cpu as selected by the --with-cpu GCC configure option
10847 in config.gcc.
10848 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10849 flags mechanism should be reworked to make it more sane. */
10850 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
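/* That is, TARGET_CPU_DEFAULT packs the configure-time default as
   (default_isa_flags << 6) | default_cpu_index: the low 6 bits select
   the entry in all_cores here, and the remaining bits provide the
   default ISA flags in aarch64_override_options below.  */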
10853 /* Return the architecture corresponding to the enum ARCH.
10854 If it doesn't specify a valid architecture, return the default. */
10856 static const struct processor *
10857 aarch64_get_arch (enum aarch64_arch arch)
10859 if (arch != aarch64_no_arch)
10860 return &all_architectures[arch];
10862 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10864 return &all_architectures[cpu->arch];
10867 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10869 static poly_uint16
10870 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10872 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10873 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10874 deciding which .md file patterns to use and when deciding whether
10875 something is a legitimate address or constant. */
10876 if (value == SVE_SCALABLE || value == SVE_128)
10877 return poly_uint16 (2, 2);
10878 else
10879 return (int) value / 64;
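/* For example, -msve-vector-bits=256 gives a VG of 256 / 64 = 4 and
   -msve-vector-bits=2048 gives 32, while both SVE_SCALABLE and SVE_128
   yield the runtime-variable poly_uint16 (2, 2), i.e. 2 + 2n granules.  */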
10882 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10883 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10884 tuning structs. In particular it must set selected_tune and
10885 aarch64_isa_flags that define the available ISA features and tuning
10886 decisions. It must also set selected_arch as this will be used to
10887 output the .arch asm tags for each function. */
10889 static void
10890 aarch64_override_options (void)
10892 unsigned long cpu_isa = 0;
10893 unsigned long arch_isa = 0;
10894 aarch64_isa_flags = 0;
10896 bool valid_cpu = true;
10897 bool valid_tune = true;
10898 bool valid_arch = true;
10900 selected_cpu = NULL;
10901 selected_arch = NULL;
10902 selected_tune = NULL;
10904 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10905 If either of -march or -mtune is given, they override their
10906 respective component of -mcpu. */
10907 if (aarch64_cpu_string)
10908 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10909 &cpu_isa);
10911 if (aarch64_arch_string)
10912 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10913 &arch_isa);
10915 if (aarch64_tune_string)
10916 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10918 /* If the user did not specify a processor, choose the default
10919 one for them. This will be the CPU set during configuration using
10920 --with-cpu, otherwise it is "generic". */
10921 if (!selected_cpu)
10923 if (selected_arch)
10925 selected_cpu = &all_cores[selected_arch->ident];
10926 aarch64_isa_flags = arch_isa;
10927 explicit_arch = selected_arch->arch;
10929 else
10931 /* Get default configure-time CPU. */
10932 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10933 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10936 if (selected_tune)
10937 explicit_tune_core = selected_tune->ident;
10939 /* If both -mcpu and -march are specified check that they are architecturally
10940 compatible, warn if they're not and prefer the -march ISA flags. */
10941 else if (selected_arch)
10943 if (selected_arch->arch != selected_cpu->arch)
10945 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10946 all_architectures[selected_cpu->arch].name,
10947 selected_arch->name);
10949 aarch64_isa_flags = arch_isa;
10950 explicit_arch = selected_arch->arch;
10951 explicit_tune_core = selected_tune ? selected_tune->ident
10952 : selected_cpu->ident;
10954 else
10956 /* -mcpu but no -march. */
10957 aarch64_isa_flags = cpu_isa;
10958 explicit_tune_core = selected_tune ? selected_tune->ident
10959 : selected_cpu->ident;
10960 gcc_assert (selected_cpu);
10961 selected_arch = &all_architectures[selected_cpu->arch];
10962 explicit_arch = selected_arch->arch;
10965 /* Set the arch as well, as we will need it when outputting
10966 the .arch directive in assembly. */
10967 if (!selected_arch)
10969 gcc_assert (selected_cpu);
10970 selected_arch = &all_architectures[selected_cpu->arch];
10973 if (!selected_tune)
10974 selected_tune = selected_cpu;
10976 #ifndef HAVE_AS_MABI_OPTION
10977 /* The compiler may have been configured with 2.23.* binutils, which does
10978 not have support for ILP32. */
10979 if (TARGET_ILP32)
10980 error ("assembler does not support -mabi=ilp32");
10981 #endif
10983 /* Convert -msve-vector-bits to a VG count. */
10984 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10986 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10987 sorry ("return address signing is only supported for -mabi=lp64");
10989 /* Make sure we properly set up the explicit options. */
10990 if ((aarch64_cpu_string && valid_cpu)
10991 || (aarch64_tune_string && valid_tune))
10992 gcc_assert (explicit_tune_core != aarch64_none);
10994 if ((aarch64_cpu_string && valid_cpu)
10995 || (aarch64_arch_string && valid_arch))
10996 gcc_assert (explicit_arch != aarch64_no_arch);
10998 aarch64_override_options_internal (&global_options);
11000 /* Save these options as the default ones in case we push and pop them later
11001 while processing functions with potential target attributes. */
11002 target_option_default_node = target_option_current_node
11003 = build_target_option_node (&global_options);
11006 /* Implement targetm.override_options_after_change. */
11008 static void
11009 aarch64_override_options_after_change (void)
11011 aarch64_override_options_after_change_1 (&global_options);
11014 static struct machine_function *
11015 aarch64_init_machine_status (void)
11017 struct machine_function *machine;
11018 machine = ggc_cleared_alloc<machine_function> ();
11019 return machine;
11022 void
11023 aarch64_init_expanders (void)
11025 init_machine_status = aarch64_init_machine_status;
11028 /* Set aarch64_cmodel based on the -mcmodel= setting and the PIC flags. */
11029 static void
11030 initialize_aarch64_code_model (struct gcc_options *opts)
11032 if (opts->x_flag_pic)
11034 switch (opts->x_aarch64_cmodel_var)
11036 case AARCH64_CMODEL_TINY:
11037 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11038 break;
11039 case AARCH64_CMODEL_SMALL:
11040 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11041 aarch64_cmodel = (flag_pic == 2
11042 ? AARCH64_CMODEL_SMALL_PIC
11043 : AARCH64_CMODEL_SMALL_SPIC);
11044 #else
11045 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11046 #endif
11047 break;
11048 case AARCH64_CMODEL_LARGE:
11049 sorry ("code model %qs with -f%s", "large",
11050 opts->x_flag_pic > 1 ? "PIC" : "pic");
11051 break;
11052 default:
11053 gcc_unreachable ();
11056 else
11057 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11060 /* Implement TARGET_OPTION_SAVE. */
11062 static void
11063 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11065 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11068 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11069 using the information saved in PTR. */
11071 static void
11072 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11074 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11075 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11076 opts->x_explicit_arch = ptr->x_explicit_arch;
11077 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11078 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11080 aarch64_override_options_internal (opts);
11083 /* Implement TARGET_OPTION_PRINT. */
11085 static void
11086 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11088 const struct processor *cpu
11089 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11090 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11091 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11092 std::string extension
11093 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11095 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11096 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11097 arch->name, extension.c_str ());
11100 static GTY(()) tree aarch64_previous_fndecl;
11102 void
11103 aarch64_reset_previous_fndecl (void)
11105 aarch64_previous_fndecl = NULL;
11108 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11109 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11110 make sure optab availability predicates are recomputed when necessary. */
11112 void
11113 aarch64_save_restore_target_globals (tree new_tree)
11115 if (TREE_TARGET_GLOBALS (new_tree))
11116 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11117 else if (new_tree == target_option_default_node)
11118 restore_target_globals (&default_target_globals);
11119 else
11120 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11123 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11124 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11125 of the function, if such exists. This function may be called multiple
11126 times on a single function so use aarch64_previous_fndecl to avoid
11127 setting up identical state. */
11129 static void
11130 aarch64_set_current_function (tree fndecl)
11132 if (!fndecl || fndecl == aarch64_previous_fndecl)
11133 return;
11135 tree old_tree = (aarch64_previous_fndecl
11136 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11137 : NULL_TREE);
11139 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11141 /* If current function has no attributes but the previous one did,
11142 use the default node. */
11143 if (!new_tree && old_tree)
11144 new_tree = target_option_default_node;
11146 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11147 the default have been handled by aarch64_save_restore_target_globals from
11148 aarch64_pragma_target_parse. */
11149 if (old_tree == new_tree)
11150 return;
11152 aarch64_previous_fndecl = fndecl;
11154 /* First set the target options. */
11155 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11157 aarch64_save_restore_target_globals (new_tree);
11160 /* Enum describing the various ways we can handle attributes.
11161 In many cases we can reuse the generic option handling machinery. */
11163 enum aarch64_attr_opt_type
11165 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11166 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11167 aarch64_attr_enum, /* Attribute sets an enum variable. */
11168 aarch64_attr_custom /* Attribute requires a custom handling function. */
11171 /* All the information needed to handle a target attribute.
11172 NAME is the name of the attribute.
11173 ATTR_TYPE specifies the type of behavior of the attribute as described
11174 in the definition of enum aarch64_attr_opt_type.
11175 ALLOW_NEG is true if the attribute supports a "no-" form.
11176 HANDLER is the function that takes the attribute string as an argument
11177 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11178 OPT_NUM is the enum specifying the option that the attribute modifies.
11179 This is needed for attributes that mirror the behavior of a command-line
11180 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11181 aarch64_attr_enum. */
11183 struct aarch64_attribute_info
11185 const char *name;
11186 enum aarch64_attr_opt_type attr_type;
11187 bool allow_neg;
11188 bool (*handler) (const char *);
11189 enum opt_code opt_num;
11192 /* Handle the ARCH_STR argument to the arch= target attribute. */
11194 static bool
11195 aarch64_handle_attr_arch (const char *str)
11197 const struct processor *tmp_arch = NULL;
11198 enum aarch64_parse_opt_result parse_res
11199 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11201 if (parse_res == AARCH64_PARSE_OK)
11203 gcc_assert (tmp_arch);
11204 selected_arch = tmp_arch;
11205 explicit_arch = selected_arch->arch;
11206 return true;
11209 switch (parse_res)
11211 case AARCH64_PARSE_MISSING_ARG:
11212 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11213 break;
11214 case AARCH64_PARSE_INVALID_ARG:
11215 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11216 aarch64_print_hint_for_arch (str);
11217 break;
11218 case AARCH64_PARSE_INVALID_FEATURE:
11219 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11220 break;
11221 default:
11222 gcc_unreachable ();
11225 return false;
11228 /* Handle the argument CPU_STR to the cpu= target attribute. */
11230 static bool
11231 aarch64_handle_attr_cpu (const char *str)
11233 const struct processor *tmp_cpu = NULL;
11234 enum aarch64_parse_opt_result parse_res
11235 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11237 if (parse_res == AARCH64_PARSE_OK)
11239 gcc_assert (tmp_cpu);
11240 selected_tune = tmp_cpu;
11241 explicit_tune_core = selected_tune->ident;
11243 selected_arch = &all_architectures[tmp_cpu->arch];
11244 explicit_arch = selected_arch->arch;
11245 return true;
11248 switch (parse_res)
11250 case AARCH64_PARSE_MISSING_ARG:
11251 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11252 break;
11253 case AARCH64_PARSE_INVALID_ARG:
11254 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11255 aarch64_print_hint_for_core (str);
11256 break;
11257 case AARCH64_PARSE_INVALID_FEATURE:
11258 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11259 break;
11260 default:
11261 gcc_unreachable ();
11264 return false;
11267 /* Handle the argument STR to the tune= target attribute. */
11269 static bool
11270 aarch64_handle_attr_tune (const char *str)
11272 const struct processor *tmp_tune = NULL;
11273 enum aarch64_parse_opt_result parse_res
11274 = aarch64_parse_tune (str, &tmp_tune);
11276 if (parse_res == AARCH64_PARSE_OK)
11278 gcc_assert (tmp_tune);
11279 selected_tune = tmp_tune;
11280 explicit_tune_core = selected_tune->ident;
11281 return true;
11284 switch (parse_res)
11286 case AARCH64_PARSE_INVALID_ARG:
11287 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11288 aarch64_print_hint_for_core (str);
11289 break;
11290 default:
11291 gcc_unreachable ();
11294 return false;
11297 /* Parse an architecture extensions target attribute string specified in STR.
11298 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11299 if successful. Update aarch64_isa_flags to reflect the ISA features
11300 modified. */
11302 static bool
11303 aarch64_handle_attr_isa_flags (char *str)
11305 enum aarch64_parse_opt_result parse_res;
11306 unsigned long isa_flags = aarch64_isa_flags;
11308 /* We allow "+nothing" in the beginning to clear out all architectural
11309 features if the user wants to handpick specific features. */
11310 if (strncmp ("+nothing", str, 8) == 0)
11312 isa_flags = 0;
11313 str += 8;
11316 parse_res = aarch64_parse_extension (str, &isa_flags);
11318 if (parse_res == AARCH64_PARSE_OK)
11320 aarch64_isa_flags = isa_flags;
11321 return true;
11324 switch (parse_res)
11326 case AARCH64_PARSE_MISSING_ARG:
11327 error ("missing value in %<target()%> pragma or attribute");
11328 break;
11330 case AARCH64_PARSE_INVALID_FEATURE:
11331 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11332 break;
11334 default:
11335 gcc_unreachable ();
11338 return false;
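/* For example, an attribute string of "+nothing+fp+nosimd" clears all
   feature bits and then applies "+fp+nosimd" on top, whereas a plain
   "+crc" modifies the current aarch64_isa_flags in place.  */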
11341 /* The target attributes that we support. On top of these we also support just
11342 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11343 handled explicitly in aarch64_process_one_target_attr. */
11345 static const struct aarch64_attribute_info aarch64_attributes[] =
11347 { "general-regs-only", aarch64_attr_mask, false, NULL,
11348 OPT_mgeneral_regs_only },
11349 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11350 OPT_mfix_cortex_a53_835769 },
11351 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11352 OPT_mfix_cortex_a53_843419 },
11353 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11354 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11355 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11356 OPT_momit_leaf_frame_pointer },
11357 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11358 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11359 OPT_march_ },
11360 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11361 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11362 OPT_mtune_ },
11363 { "sign-return-address", aarch64_attr_enum, false, NULL,
11364 OPT_msign_return_address_ },
11365 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11368 /* Parse ARG_STR which contains the definition of one target attribute.
11369 Show appropriate errors if any or return true if the attribute is valid. */
11371 static bool
11372 aarch64_process_one_target_attr (char *arg_str)
11374 bool invert = false;
11376 size_t len = strlen (arg_str);
11378 if (len == 0)
11380 error ("malformed %<target()%> pragma or attribute");
11381 return false;
11384 char *str_to_check = (char *) alloca (len + 1);
11385 strcpy (str_to_check, arg_str);
11387 /* Skip leading whitespace. */
11388 while (*str_to_check == ' ' || *str_to_check == '\t')
11389 str_to_check++;
11391 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11392 It is easier to detect and handle it explicitly here rather than going
11393 through the machinery for the rest of the target attributes in this
11394 function. */
11395 if (*str_to_check == '+')
11396 return aarch64_handle_attr_isa_flags (str_to_check);
11398 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11400 invert = true;
11401 str_to_check += 3;
11403 char *arg = strchr (str_to_check, '=');
11405 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11406 and point ARG to "foo". */
11407 if (arg)
11409 *arg = '\0';
11410 arg++;
11412 const struct aarch64_attribute_info *p_attr;
11413 bool found = false;
11414 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11416 /* If the names don't match up, or the user has given an argument
11417 to an attribute that doesn't accept one, or didn't give an argument
11418 to an attribute that expects one, fail to match. */
11419 if (strcmp (str_to_check, p_attr->name) != 0)
11420 continue;
11422 found = true;
11423 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11424 || p_attr->attr_type == aarch64_attr_enum;
11426 if (attr_need_arg_p ^ (arg != NULL))
11428 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11429 return false;
11432 /* If the name matches but the attribute does not allow "no-" versions
11433 then we can't match. */
11434 if (invert && !p_attr->allow_neg)
11436 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11437 return false;
11440 switch (p_attr->attr_type)
11442 /* Has a custom handler registered.
11443 For example, cpu=, arch=, tune=. */
11444 case aarch64_attr_custom:
11445 gcc_assert (p_attr->handler);
11446 if (!p_attr->handler (arg))
11447 return false;
11448 break;
11450 /* Either set or unset a boolean option. */
11451 case aarch64_attr_bool:
11453 struct cl_decoded_option decoded;
11455 generate_option (p_attr->opt_num, NULL, !invert,
11456 CL_TARGET, &decoded);
11457 aarch64_handle_option (&global_options, &global_options_set,
11458 &decoded, input_location);
11459 break;
11461 /* Set or unset a bit in the target_flags. aarch64_handle_option
11462 should know what mask to apply given the option number. */
11463 case aarch64_attr_mask:
11465 struct cl_decoded_option decoded;
11466 /* We only need to specify the option number.
11467 aarch64_handle_option will know which mask to apply. */
11468 decoded.opt_index = p_attr->opt_num;
11469 decoded.value = !invert;
11470 aarch64_handle_option (&global_options, &global_options_set,
11471 &decoded, input_location);
11472 break;
11474 /* Use the option setting machinery to set an option to an enum. */
11475 case aarch64_attr_enum:
11477 gcc_assert (arg);
11478 bool valid;
11479 int value;
11480 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11481 &value, CL_TARGET);
11482 if (valid)
11484 set_option (&global_options, NULL, p_attr->opt_num, value,
11485 NULL, DK_UNSPECIFIED, input_location,
11486 global_dc);
11488 else
11490 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11492 break;
11494 default:
11495 gcc_unreachable ();
11499 /* If we reached here we either have found an attribute and validated
11500 it or didn't match any. If we matched an attribute but its arguments
11501 were malformed we will have returned false already. */
11502 return found;
11505 /* Count how many times the character C appears in
11506 NULL-terminated string STR. */
11508 static unsigned int
11509 num_occurences_in_str (char c, char *str)
11511 unsigned int res = 0;
11512 while (*str != '\0')
11514 if (*str == c)
11515 res++;
11517 str++;
11520 return res;
11523 /* Parse the tree in ARGS that contains the target attribute information
11524 and update the global target options space. */
11526 bool
11527 aarch64_process_target_attr (tree args)
11529 if (TREE_CODE (args) == TREE_LIST)
11533 tree head = TREE_VALUE (args);
11534 if (head)
11536 if (!aarch64_process_target_attr (head))
11537 return false;
11539 args = TREE_CHAIN (args);
11540 } while (args);
11542 return true;
11545 if (TREE_CODE (args) != STRING_CST)
11547 error ("attribute %<target%> argument not a string");
11548 return false;
11551 size_t len = strlen (TREE_STRING_POINTER (args));
11552 char *str_to_check = (char *) alloca (len + 1);
11553 strcpy (str_to_check, TREE_STRING_POINTER (args));
11555 if (len == 0)
11557 error ("malformed %<target()%> pragma or attribute");
11558 return false;
11561 /* Used to catch empty strings between commas, i.e.
11562 attribute ((target ("attr1,,attr2"))). */
11563 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11565 /* Handle multiple target attributes separated by ','. */
11566 char *token = strtok (str_to_check, ",");
11568 unsigned int num_attrs = 0;
11569 while (token)
11571 num_attrs++;
11572 if (!aarch64_process_one_target_attr (token))
11574 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11575 return false;
11578 token = strtok (NULL, ",");
11581 if (num_attrs != num_commas + 1)
11583 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11584 return false;
11587 return true;
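/* Putting it together, an attribute such as

     __attribute__ ((target ("arch=armv8-a+crc,no-omit-leaf-frame-pointer")))

   (values illustrative) is split on ',' into two tokens, each handled by
   aarch64_process_one_target_attr: the first through the custom "arch"
   handler, the second as the negated form of a boolean option.  */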
11590 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11591 process attribute ((target ("..."))). */
11593 static bool
11594 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11596 struct cl_target_option cur_target;
11597 bool ret;
11598 tree old_optimize;
11599 tree new_target, new_optimize;
11600 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11602 /* If what we're processing is the current pragma string then the
11603 target option node is already stored in target_option_current_node
11604 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11605 having to re-parse the string. This is especially useful to keep
11606 arm_neon.h compile times down since that header contains a lot
11607 of intrinsics enclosed in pragmas. */
11608 if (!existing_target && args == current_target_pragma)
11610 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11611 return true;
11613 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11615 old_optimize = build_optimization_node (&global_options);
11616 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11618 /* If the function changed the optimization levels as well as setting
11619 target options, start with the optimizations specified. */
11620 if (func_optimize && func_optimize != old_optimize)
11621 cl_optimization_restore (&global_options,
11622 TREE_OPTIMIZATION (func_optimize));
11624 /* Save the current target options to restore at the end. */
11625 cl_target_option_save (&cur_target, &global_options);
11627 /* If fndecl already has some target attributes applied to it, unpack
11628 them so that we add this attribute on top of them, rather than
11629 overwriting them. */
11630 if (existing_target)
11632 struct cl_target_option *existing_options
11633 = TREE_TARGET_OPTION (existing_target);
11635 if (existing_options)
11636 cl_target_option_restore (&global_options, existing_options);
11638 else
11639 cl_target_option_restore (&global_options,
11640 TREE_TARGET_OPTION (target_option_current_node));
11642 ret = aarch64_process_target_attr (args);
11644 /* Set up any additional state. */
11645 if (ret)
11647 aarch64_override_options_internal (&global_options);
11648 /* Initialize SIMD builtins if we haven't already.
11649 Set current_target_pragma to NULL for the duration so that
11650 the builtin initialization code doesn't try to tag the functions
11651 being built with the attributes specified by any current pragma, thus
11652 going into an infinite recursion. */
11653 if (TARGET_SIMD)
11655 tree saved_current_target_pragma = current_target_pragma;
11656 current_target_pragma = NULL;
11657 aarch64_init_simd_builtins ();
11658 current_target_pragma = saved_current_target_pragma;
11660 new_target = build_target_option_node (&global_options);
11662 else
11663 new_target = NULL;
11665 new_optimize = build_optimization_node (&global_options);
11667 if (fndecl && ret)
11669 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11671 if (old_optimize != new_optimize)
11672 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11675 cl_target_option_restore (&global_options, &cur_target);
11677 if (old_optimize != new_optimize)
11678 cl_optimization_restore (&global_options,
11679 TREE_OPTIMIZATION (old_optimize));
11680 return ret;
11683 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11684 tri-bool options (yes, no, don't care) and the default value is
11685 DEF, determine whether to reject inlining. */
11687 static bool
11688 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11689 int dont_care, int def)
11691 /* If the callee doesn't care, always allow inlining. */
11692 if (callee == dont_care)
11693 return true;
11695 /* If the caller doesn't care, always allow inlining. */
11696 if (caller == dont_care)
11697 return true;
11699 /* Otherwise, allow inlining if the callee and caller values
11700 agree, or if the callee is using the default value. */
11701 return (callee == caller || callee == def);
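/* Editorial illustration (not part of the original source): with
   DONT_CARE == 2 and DEF == 1 (the values used for
   -momit-leaf-frame-pointer below), (caller 0, callee 2) and
   (caller 2, callee 0) allow inlining, (caller 0, callee 1) allows it
   because the callee uses the default, and (caller 1, callee 0) rejects
   it because the explicit values disagree and the callee's value is not
   the default.  */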
11704 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11705 to inline CALLEE into CALLER based on target-specific info.
11706 Make sure that the caller and callee have compatible architectural
11707 features. Then go through the other possible target attributes
11708 and see if they can block inlining. Try not to reject always_inline
11709 callees unless they are incompatible architecturally. */
11711 static bool
11712 aarch64_can_inline_p (tree caller, tree callee)
11714 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11715 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11717 struct cl_target_option *caller_opts
11718 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11719 : target_option_default_node);
11721 struct cl_target_option *callee_opts
11722 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11723 : target_option_default_node);
11725 /* Callee's ISA flags should be a subset of the caller's. */
11726 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11727 != callee_opts->x_aarch64_isa_flags)
11728 return false;
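/* Editorial illustration (not part of the original source): a callee
   built with just +simd can be inlined into a caller built with
   +simd+sve, since the caller's ISA flags are a superset, but a +sve
   callee cannot be inlined into a caller without SVE enabled.  */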
11730 /* Allow non-strict aligned functions inlining into strict
11731 aligned ones. */
11732 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11733 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11734 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11735 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11736 return false;
11738 bool always_inline = lookup_attribute ("always_inline",
11739 DECL_ATTRIBUTES (callee));
11741 /* If the architectural features match up and the callee is always_inline
11742 then the other attributes don't matter. */
11743 if (always_inline)
11744 return true;
11746 if (caller_opts->x_aarch64_cmodel_var
11747 != callee_opts->x_aarch64_cmodel_var)
11748 return false;
11750 if (caller_opts->x_aarch64_tls_dialect
11751 != callee_opts->x_aarch64_tls_dialect)
11752 return false;
11754 /* Honour explicit requests to work around errata. */
11755 if (!aarch64_tribools_ok_for_inlining_p (
11756 caller_opts->x_aarch64_fix_a53_err835769,
11757 callee_opts->x_aarch64_fix_a53_err835769,
11758 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11759 return false;
11761 if (!aarch64_tribools_ok_for_inlining_p (
11762 caller_opts->x_aarch64_fix_a53_err843419,
11763 callee_opts->x_aarch64_fix_a53_err843419,
11764 2, TARGET_FIX_ERR_A53_843419))
11765 return false;
11767 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11768 caller and callee and they don't match up, reject inlining. */
11769 if (!aarch64_tribools_ok_for_inlining_p (
11770 caller_opts->x_flag_omit_leaf_frame_pointer,
11771 callee_opts->x_flag_omit_leaf_frame_pointer,
11772 2, 1))
11773 return false;
11775 /* If the callee has specific tuning overrides, respect them. */
11776 if (callee_opts->x_aarch64_override_tune_string != NULL
11777 && caller_opts->x_aarch64_override_tune_string == NULL)
11778 return false;
11780 /* If the user specified tuning override strings for the
11781 caller and callee and they don't match up, reject inlining.
11782 We just do a string compare here; we don't analyze the meaning
11783 of the string, as that would be too costly for little gain. */
11784 if (callee_opts->x_aarch64_override_tune_string
11785 && caller_opts->x_aarch64_override_tune_string
11786 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11787 caller_opts->x_aarch64_override_tune_string) != 0))
11788 return false;
11790 return true;
11793 /* Return true if SYMBOL_REF X binds locally. */
11795 static bool
11796 aarch64_symbol_binds_local_p (const_rtx x)
11798 return (SYMBOL_REF_DECL (x)
11799 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11800 : SYMBOL_REF_LOCAL_P (x));
11803 /* Return true if SYMBOL_REF X is thread local */
11804 static bool
11805 aarch64_tls_symbol_p (rtx x)
11807 if (! TARGET_HAVE_TLS)
11808 return false;
11810 if (GET_CODE (x) != SYMBOL_REF)
11811 return false;
11813 return SYMBOL_REF_TLS_MODEL (x) != 0;
11816 /* Classify a TLS symbol into one of the TLS kinds. */
11817 enum aarch64_symbol_type
11818 aarch64_classify_tls_symbol (rtx x)
11820 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11822 switch (tls_kind)
11824 case TLS_MODEL_GLOBAL_DYNAMIC:
11825 case TLS_MODEL_LOCAL_DYNAMIC:
11826 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11828 case TLS_MODEL_INITIAL_EXEC:
11829 switch (aarch64_cmodel)
11831 case AARCH64_CMODEL_TINY:
11832 case AARCH64_CMODEL_TINY_PIC:
11833 return SYMBOL_TINY_TLSIE;
11834 default:
11835 return SYMBOL_SMALL_TLSIE;
11838 case TLS_MODEL_LOCAL_EXEC:
11839 if (aarch64_tls_size == 12)
11840 return SYMBOL_TLSLE12;
11841 else if (aarch64_tls_size == 24)
11842 return SYMBOL_TLSLE24;
11843 else if (aarch64_tls_size == 32)
11844 return SYMBOL_TLSLE32;
11845 else if (aarch64_tls_size == 48)
11846 return SYMBOL_TLSLE48;
11847 else
11848 gcc_unreachable ();
11850 case TLS_MODEL_EMULATED:
11851 case TLS_MODEL_NONE:
11852 return SYMBOL_FORCE_TO_MEM;
11854 default:
11855 gcc_unreachable ();
11859 /* Return the correct method for accessing X + OFFSET, where X is either
11860 a SYMBOL_REF or LABEL_REF. */
11862 enum aarch64_symbol_type
11863 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11865 if (GET_CODE (x) == LABEL_REF)
11867 switch (aarch64_cmodel)
11869 case AARCH64_CMODEL_LARGE:
11870 return SYMBOL_FORCE_TO_MEM;
11872 case AARCH64_CMODEL_TINY_PIC:
11873 case AARCH64_CMODEL_TINY:
11874 return SYMBOL_TINY_ABSOLUTE;
11876 case AARCH64_CMODEL_SMALL_SPIC:
11877 case AARCH64_CMODEL_SMALL_PIC:
11878 case AARCH64_CMODEL_SMALL:
11879 return SYMBOL_SMALL_ABSOLUTE;
11881 default:
11882 gcc_unreachable ();
11886 if (GET_CODE (x) == SYMBOL_REF)
11888 if (aarch64_tls_symbol_p (x))
11889 return aarch64_classify_tls_symbol (x);
11891 switch (aarch64_cmodel)
11893 case AARCH64_CMODEL_TINY:
11894 /* When we retrieve symbol + offset address, we have to make sure
11895 the offset does not cause overflow of the final address. But
11896 we have no way of knowing the address of symbol at compile time
11897 so we can't accurately say if the distance between the PC and
11898 symbol + offset is outside the addressable range of +/-1M in the
11899 TINY code model. So we rely on images not being larger than 1M,
11900 cap the offset at 1M, and require anything beyond that to be
11901 loaded using an alternative mechanism. Furthermore, if the
11902 symbol is a weak reference to something that isn't known to
11903 resolve to a symbol in this module, then force to memory. */
11904 if ((SYMBOL_REF_WEAK (x)
11905 && !aarch64_symbol_binds_local_p (x))
11906 || !IN_RANGE (offset, -1048575, 1048575))
11907 return SYMBOL_FORCE_TO_MEM;
11908 return SYMBOL_TINY_ABSOLUTE;
11910 case AARCH64_CMODEL_SMALL:
11911 /* Same reasoning as the tiny code model, but the offset cap here is
11912 4G. */
11913 if ((SYMBOL_REF_WEAK (x)
11914 && !aarch64_symbol_binds_local_p (x))
11915 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11916 HOST_WIDE_INT_C (4294967264)))
11917 return SYMBOL_FORCE_TO_MEM;
11918 return SYMBOL_SMALL_ABSOLUTE;
11920 case AARCH64_CMODEL_TINY_PIC:
11921 if (!aarch64_symbol_binds_local_p (x))
11922 return SYMBOL_TINY_GOT;
11923 return SYMBOL_TINY_ABSOLUTE;
11925 case AARCH64_CMODEL_SMALL_SPIC:
11926 case AARCH64_CMODEL_SMALL_PIC:
11927 if (!aarch64_symbol_binds_local_p (x))
11928 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11929 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11930 return SYMBOL_SMALL_ABSOLUTE;
11932 case AARCH64_CMODEL_LARGE:
11933 /* This is alright even in PIC code as the constant
11934 pool reference is always PC relative and within
11935 the same translation unit. */
11936 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11937 return SYMBOL_SMALL_ABSOLUTE;
11938 else
11939 return SYMBOL_FORCE_TO_MEM;
11941 default:
11942 gcc_unreachable ();
11946 /* By default push everything into the constant pool. */
11947 return SYMBOL_FORCE_TO_MEM;
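/* Editorial illustration (not part of the original source): under the
   default small code model, a reference to an ordinary global such as
   "extern int x;" is classified as SYMBOL_SMALL_ABSOLUTE, whereas a weak
   symbol that is not known to bind locally, or an offset outside the
   roughly 4G window above, is forced to the constant pool via
   SYMBOL_FORCE_TO_MEM.  */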
11950 bool
11951 aarch64_constant_address_p (rtx x)
11953 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11956 bool
11957 aarch64_legitimate_pic_operand_p (rtx x)
11959 if (GET_CODE (x) == SYMBOL_REF
11960 || (GET_CODE (x) == CONST
11961 && GET_CODE (XEXP (x, 0)) == PLUS
11962 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11963 return false;
11965 return true;
11968 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11969 that should be rematerialized rather than spilled. */
11971 static bool
11972 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11974 /* Support CSE and rematerialization of common constants. */
11975 if (CONST_INT_P (x)
11976 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11977 || GET_CODE (x) == CONST_VECTOR)
11978 return true;
11980 /* Do not allow vector struct mode constants for Advanced SIMD.
11981 We could support 0 and -1 easily, but they need support in
11982 aarch64-simd.md. */
11983 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11984 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11985 return false;
11987 /* Only accept variable-length vector constants if they can be
11988 handled directly.
11990 ??? It would be possible to handle rematerialization of other
11991 constants via secondary reloads. */
11992 if (vec_flags & VEC_ANY_SVE)
11993 return aarch64_simd_valid_immediate (x, NULL);
11995 if (GET_CODE (x) == HIGH)
11996 x = XEXP (x, 0);
11998 /* Accept polynomial constants that can be calculated by using the
11999 destination of a move as the sole temporary. Constants that
12000 require a second temporary cannot be rematerialized (they can't be
12001 forced to memory and also aren't legitimate constants). */
12002 poly_int64 offset;
12003 if (poly_int_rtx_p (x, &offset))
12004 return aarch64_offset_temporaries (false, offset) <= 1;
12006 /* If an offset is being added to something else, we need to allow the
12007 base to be moved into the destination register, meaning that there
12008 are no free temporaries for the offset. */
12009 x = strip_offset (x, &offset);
12010 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12011 return false;
12013 /* Do not allow const (plus (anchor_symbol, const_int)). */
12014 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12015 return false;
12017 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12018 so spilling them is better than rematerialization. */
12019 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12020 return true;
12022 /* Label references are always constant. */
12023 if (GET_CODE (x) == LABEL_REF)
12024 return true;
12026 return false;
12030 aarch64_load_tp (rtx target)
12032 if (!target
12033 || GET_MODE (target) != Pmode
12034 || !register_operand (target, Pmode))
12035 target = gen_reg_rtx (Pmode);
12037 /* Can return in any reg. */
12038 emit_insn (gen_aarch64_load_tp_hard (target));
12039 return target;
12042 /* On AAPCS systems, this is the "struct __va_list". */
12043 static GTY(()) tree va_list_type;
12045 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12046 Return the type to use as __builtin_va_list.
12048 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12050 struct __va_list
12052 void *__stack;
12053 void *__gr_top;
12054 void *__vr_top;
12055 int __gr_offs;
12056 int __vr_offs;
12057 }; */
12059 static tree
12060 aarch64_build_builtin_va_list (void)
12062 tree va_list_name;
12063 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12065 /* Create the type. */
12066 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12067 /* Give it the required name. */
12068 va_list_name = build_decl (BUILTINS_LOCATION,
12069 TYPE_DECL,
12070 get_identifier ("__va_list"),
12071 va_list_type);
12072 DECL_ARTIFICIAL (va_list_name) = 1;
12073 TYPE_NAME (va_list_type) = va_list_name;
12074 TYPE_STUB_DECL (va_list_type) = va_list_name;
12076 /* Create the fields. */
12077 f_stack = build_decl (BUILTINS_LOCATION,
12078 FIELD_DECL, get_identifier ("__stack"),
12079 ptr_type_node);
12080 f_grtop = build_decl (BUILTINS_LOCATION,
12081 FIELD_DECL, get_identifier ("__gr_top"),
12082 ptr_type_node);
12083 f_vrtop = build_decl (BUILTINS_LOCATION,
12084 FIELD_DECL, get_identifier ("__vr_top"),
12085 ptr_type_node);
12086 f_groff = build_decl (BUILTINS_LOCATION,
12087 FIELD_DECL, get_identifier ("__gr_offs"),
12088 integer_type_node);
12089 f_vroff = build_decl (BUILTINS_LOCATION,
12090 FIELD_DECL, get_identifier ("__vr_offs"),
12091 integer_type_node);
12093 /* Tell tree-stdarg pass about our internal offset fields.
12094 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12095 purposes, to identify whether the code is updating the va_list internal
12096 offset fields in an irregular way. */
12097 va_list_gpr_counter_field = f_groff;
12098 va_list_fpr_counter_field = f_vroff;
12100 DECL_ARTIFICIAL (f_stack) = 1;
12101 DECL_ARTIFICIAL (f_grtop) = 1;
12102 DECL_ARTIFICIAL (f_vrtop) = 1;
12103 DECL_ARTIFICIAL (f_groff) = 1;
12104 DECL_ARTIFICIAL (f_vroff) = 1;
12106 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12107 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12108 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12109 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12110 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12112 TYPE_FIELDS (va_list_type) = f_stack;
12113 DECL_CHAIN (f_stack) = f_grtop;
12114 DECL_CHAIN (f_grtop) = f_vrtop;
12115 DECL_CHAIN (f_vrtop) = f_groff;
12116 DECL_CHAIN (f_groff) = f_vroff;
12118 /* Compute its layout. */
12119 layout_type (va_list_type);
12121 return va_list_type;
12124 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12125 static void
12126 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12128 const CUMULATIVE_ARGS *cum;
12129 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12130 tree stack, grtop, vrtop, groff, vroff;
12131 tree t;
12132 int gr_save_area_size = cfun->va_list_gpr_size;
12133 int vr_save_area_size = cfun->va_list_fpr_size;
12134 int vr_offset;
12136 cum = &crtl->args.info;
12137 if (cfun->va_list_gpr_size)
12138 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12139 cfun->va_list_gpr_size);
12140 if (cfun->va_list_fpr_size)
12141 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12142 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12144 if (!TARGET_FLOAT)
12146 gcc_assert (cum->aapcs_nvrn == 0);
12147 vr_save_area_size = 0;
12150 f_stack = TYPE_FIELDS (va_list_type_node);
12151 f_grtop = DECL_CHAIN (f_stack);
12152 f_vrtop = DECL_CHAIN (f_grtop);
12153 f_groff = DECL_CHAIN (f_vrtop);
12154 f_vroff = DECL_CHAIN (f_groff);
12156 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12157 NULL_TREE);
12158 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12159 NULL_TREE);
12160 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12161 NULL_TREE);
12162 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12163 NULL_TREE);
12164 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12165 NULL_TREE);
12167 /* Emit code to initialize STACK, which points to the next varargs stack
12168 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12169 by named arguments. STACK is 8-byte aligned. */
12170 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12171 if (cum->aapcs_stack_size > 0)
12172 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12173 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12174 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12176 /* Emit code to initialize GRTOP, the top of the GR save area.
12177 virtual_incoming_args_rtx should have been 16-byte aligned. */
12178 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12179 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12180 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12182 /* Emit code to initialize VRTOP, the top of the VR save area.
12183 This address is gr_save_area_bytes below GRTOP, rounded
12184 down to the next 16-byte boundary. */
12185 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12186 vr_offset = ROUND_UP (gr_save_area_size,
12187 STACK_BOUNDARY / BITS_PER_UNIT);
12189 if (vr_offset)
12190 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12191 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12192 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12194 /* Emit code to initialize GROFF, the offset from GRTOP of the
12195 next GPR argument. */
12196 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12197 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12198 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12200 /* Likewise emit code to initialize VROFF, the offset from FTOP
12201 of the next VR argument. */
12202 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12203 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12204 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
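/* Editorial sketch (not part of the original source) of the frame layout
   that the va_start expansion above assumes, higher addresses first:

       incoming stack arguments      <- __stack
       -----------------------       <- virtual_incoming_args_rtx, __gr_top
       GP register save area         (gr_save_area_size bytes)
       -----------------------       <- __vr_top (16-byte aligned)
       FP/SIMD register save area    (vr_save_area_size bytes)

   __gr_offs and __vr_offs start at -gr_save_area_size and
   -vr_save_area_size respectively and grow towards zero as anonymous
   arguments are consumed.  */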
12207 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12209 static tree
12210 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12211 gimple_seq *post_p ATTRIBUTE_UNUSED)
12213 tree addr;
12214 bool indirect_p;
12215 bool is_ha; /* is HFA or HVA. */
12216 bool dw_align; /* double-word align. */
12217 machine_mode ag_mode = VOIDmode;
12218 int nregs;
12219 machine_mode mode;
12221 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12222 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12223 HOST_WIDE_INT size, rsize, adjust, align;
12224 tree t, u, cond1, cond2;
12226 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12227 if (indirect_p)
12228 type = build_pointer_type (type);
12230 mode = TYPE_MODE (type);
12232 f_stack = TYPE_FIELDS (va_list_type_node);
12233 f_grtop = DECL_CHAIN (f_stack);
12234 f_vrtop = DECL_CHAIN (f_grtop);
12235 f_groff = DECL_CHAIN (f_vrtop);
12236 f_vroff = DECL_CHAIN (f_groff);
12238 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12239 f_stack, NULL_TREE);
12240 size = int_size_in_bytes (type);
12241 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12243 dw_align = false;
12244 adjust = 0;
12245 if (aarch64_vfp_is_call_or_return_candidate (mode,
12246 type,
12247 &ag_mode,
12248 &nregs,
12249 &is_ha))
12251 /* No frontends can create types with variable-sized modes, so we
12252 shouldn't be asked to pass or return them. */
12253 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12255 /* TYPE passed in fp/simd registers. */
12256 if (!TARGET_FLOAT)
12257 aarch64_err_no_fpadvsimd (mode, "varargs");
12259 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12260 unshare_expr (valist), f_vrtop, NULL_TREE);
12261 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12262 unshare_expr (valist), f_vroff, NULL_TREE);
12264 rsize = nregs * UNITS_PER_VREG;
12266 if (is_ha)
12268 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12269 adjust = UNITS_PER_VREG - ag_size;
12271 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12272 && size < UNITS_PER_VREG)
12274 adjust = UNITS_PER_VREG - size;
12277 else
12279 /* TYPE passed in general registers. */
12280 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12281 unshare_expr (valist), f_grtop, NULL_TREE);
12282 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12283 unshare_expr (valist), f_groff, NULL_TREE);
12284 rsize = ROUND_UP (size, UNITS_PER_WORD);
12285 nregs = rsize / UNITS_PER_WORD;
12287 if (align > 8)
12288 dw_align = true;
12290 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12291 && size < UNITS_PER_WORD)
12293 adjust = UNITS_PER_WORD - size;
12297 /* Get a local temporary for the field value. */
12298 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12300 /* Emit code to branch if off >= 0. */
12301 t = build2 (GE_EXPR, boolean_type_node, off,
12302 build_int_cst (TREE_TYPE (off), 0));
12303 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12305 if (dw_align)
12307 /* Emit: offs = (offs + 15) & -16. */
12308 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12309 build_int_cst (TREE_TYPE (off), 15));
12310 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12311 build_int_cst (TREE_TYPE (off), -16));
12312 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12314 else
12315 roundup = NULL;
12317 /* Update ap.__[g|v]r_offs */
12318 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12319 build_int_cst (TREE_TYPE (off), rsize));
12320 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12322 /* String up. */
12323 if (roundup)
12324 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12326 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12327 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12328 build_int_cst (TREE_TYPE (f_off), 0));
12329 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12331 /* String up: make sure the assignment happens before the use. */
12332 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12333 COND_EXPR_ELSE (cond1) = t;
12335 /* Prepare the trees handling the argument that is passed on the stack;
12336 the top-level node is stored in ON_STACK. */
12337 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12338 if (align > 8)
12340 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12341 t = fold_build_pointer_plus_hwi (arg, 15);
12342 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12343 build_int_cst (TREE_TYPE (t), -16));
12344 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12346 else
12347 roundup = NULL;
12348 /* Advance ap.__stack */
12349 t = fold_build_pointer_plus_hwi (arg, size + 7);
12350 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12351 build_int_cst (TREE_TYPE (t), -8));
12352 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12353 /* String up roundup and advance. */
12354 if (roundup)
12355 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12356 /* String up with arg */
12357 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12358 /* Big-endianness related address adjustment. */
12359 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12360 && size < UNITS_PER_WORD)
12362 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12363 size_int (UNITS_PER_WORD - size));
12364 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12367 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12368 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12370 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12371 t = off;
12372 if (adjust)
12373 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12374 build_int_cst (TREE_TYPE (off), adjust));
12376 t = fold_convert (sizetype, t);
12377 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12379 if (is_ha)
12381 /* type ha; // treat as "struct {ftype field[n];}"
12382 ... [computing offs]
12383 for (i = 0; i <nregs; ++i, offs += 16)
12384 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12385 return ha; */
12386 int i;
12387 tree tmp_ha, field_t, field_ptr_t;
12389 /* Declare a local variable. */
12390 tmp_ha = create_tmp_var_raw (type, "ha");
12391 gimple_add_tmp_var (tmp_ha);
12393 /* Establish the base type. */
12394 switch (ag_mode)
12396 case E_SFmode:
12397 field_t = float_type_node;
12398 field_ptr_t = float_ptr_type_node;
12399 break;
12400 case E_DFmode:
12401 field_t = double_type_node;
12402 field_ptr_t = double_ptr_type_node;
12403 break;
12404 case E_TFmode:
12405 field_t = long_double_type_node;
12406 field_ptr_t = long_double_ptr_type_node;
12407 break;
12408 case E_HFmode:
12409 field_t = aarch64_fp16_type_node;
12410 field_ptr_t = aarch64_fp16_ptr_type_node;
12411 break;
12412 case E_V2SImode:
12413 case E_V4SImode:
12415 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12416 field_t = build_vector_type_for_mode (innertype, ag_mode);
12417 field_ptr_t = build_pointer_type (field_t);
12419 break;
12420 default:
12421 gcc_assert (0);
12424 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12425 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12426 addr = t;
12427 t = fold_convert (field_ptr_t, addr);
12428 t = build2 (MODIFY_EXPR, field_t,
12429 build1 (INDIRECT_REF, field_t, tmp_ha),
12430 build1 (INDIRECT_REF, field_t, t));
12432 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12433 for (i = 1; i < nregs; ++i)
12435 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12436 u = fold_convert (field_ptr_t, addr);
12437 u = build2 (MODIFY_EXPR, field_t,
12438 build2 (MEM_REF, field_t, tmp_ha,
12439 build_int_cst (field_ptr_t,
12440 (i *
12441 int_size_in_bytes (field_t)))),
12442 build1 (INDIRECT_REF, field_t, u));
12443 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12446 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12447 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12450 COND_EXPR_ELSE (cond2) = t;
12451 addr = fold_convert (build_pointer_type (type), cond1);
12452 addr = build_va_arg_indirect_ref (addr);
12454 if (indirect_p)
12455 addr = build_va_arg_indirect_ref (addr);
12457 return addr;
12460 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12462 static void
12463 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12464 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12465 int no_rtl)
12467 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12468 CUMULATIVE_ARGS local_cum;
12469 int gr_saved = cfun->va_list_gpr_size;
12470 int vr_saved = cfun->va_list_fpr_size;
12472 /* The caller has advanced CUM up to, but not beyond, the last named
12473 argument. Advance a local copy of CUM past the last "real" named
12474 argument, to find out how many registers are left over. */
12475 local_cum = *cum;
12476 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12478 /* Find out how many registers we need to save.
12479 Honour the tree-stdarg analysis results. */
12480 if (cfun->va_list_gpr_size)
12481 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12482 cfun->va_list_gpr_size / UNITS_PER_WORD);
12483 if (cfun->va_list_fpr_size)
12484 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12485 cfun->va_list_fpr_size / UNITS_PER_VREG);
12487 if (!TARGET_FLOAT)
12489 gcc_assert (local_cum.aapcs_nvrn == 0);
12490 vr_saved = 0;
12493 if (!no_rtl)
12495 if (gr_saved > 0)
12497 rtx ptr, mem;
12499 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12500 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12501 - gr_saved * UNITS_PER_WORD);
12502 mem = gen_frame_mem (BLKmode, ptr);
12503 set_mem_alias_set (mem, get_varargs_alias_set ());
12505 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12506 mem, gr_saved);
12508 if (vr_saved > 0)
12510 /* We can't use move_block_from_reg, because it will use
12511 the wrong mode, storing D regs only. */
12512 machine_mode mode = TImode;
12513 int off, i, vr_start;
12515 /* Set OFF to the offset from virtual_incoming_args_rtx of
12516 the first vector register. The VR save area lies below
12517 the GR one, and is aligned to 16 bytes. */
12518 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12519 STACK_BOUNDARY / BITS_PER_UNIT);
12520 off -= vr_saved * UNITS_PER_VREG;
12522 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12523 for (i = 0; i < vr_saved; ++i)
12525 rtx ptr, mem;
12527 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12528 mem = gen_frame_mem (mode, ptr);
12529 set_mem_alias_set (mem, get_varargs_alias_set ());
12530 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12531 off += UNITS_PER_VREG;
12536 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12537 any complication of having crtl->args.pretend_args_size changed. */
12538 cfun->machine->frame.saved_varargs_size
12539 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12540 STACK_BOUNDARY / BITS_PER_UNIT)
12541 + vr_saved * UNITS_PER_VREG);
12544 static void
12545 aarch64_conditional_register_usage (void)
12547 int i;
12548 if (!TARGET_FLOAT)
12550 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12552 fixed_regs[i] = 1;
12553 call_used_regs[i] = 1;
12556 if (!TARGET_SVE)
12557 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12559 fixed_regs[i] = 1;
12560 call_used_regs[i] = 1;
12564 /* Walk down the type tree of TYPE counting consecutive base elements.
12565 If *MODEP is VOIDmode, then set it to the first valid floating point
12566 type. If a non-floating point type is found, or if a floating point
12567 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12568 otherwise return the count in the sub-tree. */
12569 static int
12570 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12572 machine_mode mode;
12573 HOST_WIDE_INT size;
12575 switch (TREE_CODE (type))
12577 case REAL_TYPE:
12578 mode = TYPE_MODE (type);
12579 if (mode != DFmode && mode != SFmode
12580 && mode != TFmode && mode != HFmode)
12581 return -1;
12583 if (*modep == VOIDmode)
12584 *modep = mode;
12586 if (*modep == mode)
12587 return 1;
12589 break;
12591 case COMPLEX_TYPE:
12592 mode = TYPE_MODE (TREE_TYPE (type));
12593 if (mode != DFmode && mode != SFmode
12594 && mode != TFmode && mode != HFmode)
12595 return -1;
12597 if (*modep == VOIDmode)
12598 *modep = mode;
12600 if (*modep == mode)
12601 return 2;
12603 break;
12605 case VECTOR_TYPE:
12606 /* Use V2SImode and V4SImode as representatives of all 64-bit
12607 and 128-bit vector types. */
12608 size = int_size_in_bytes (type);
12609 switch (size)
12611 case 8:
12612 mode = V2SImode;
12613 break;
12614 case 16:
12615 mode = V4SImode;
12616 break;
12617 default:
12618 return -1;
12621 if (*modep == VOIDmode)
12622 *modep = mode;
12624 /* Vector modes are considered to be opaque: two vectors are
12625 equivalent for the purposes of being homogeneous aggregates
12626 if they are the same size. */
12627 if (*modep == mode)
12628 return 1;
12630 break;
12632 case ARRAY_TYPE:
12634 int count;
12635 tree index = TYPE_DOMAIN (type);
12637 /* Can't handle incomplete types nor sizes that are not
12638 fixed. */
12639 if (!COMPLETE_TYPE_P (type)
12640 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12641 return -1;
12643 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12644 if (count == -1
12645 || !index
12646 || !TYPE_MAX_VALUE (index)
12647 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12648 || !TYPE_MIN_VALUE (index)
12649 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12650 || count < 0)
12651 return -1;
12653 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12654 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12656 /* There must be no padding. */
12657 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12658 count * GET_MODE_BITSIZE (*modep)))
12659 return -1;
12661 return count;
12664 case RECORD_TYPE:
12666 int count = 0;
12667 int sub_count;
12668 tree field;
12670 /* Can't handle incomplete types nor sizes that are not
12671 fixed. */
12672 if (!COMPLETE_TYPE_P (type)
12673 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12674 return -1;
12676 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12678 if (TREE_CODE (field) != FIELD_DECL)
12679 continue;
12681 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12682 if (sub_count < 0)
12683 return -1;
12684 count += sub_count;
12687 /* There must be no padding. */
12688 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12689 count * GET_MODE_BITSIZE (*modep)))
12690 return -1;
12692 return count;
12695 case UNION_TYPE:
12696 case QUAL_UNION_TYPE:
12698 /* These aren't very interesting except in a degenerate case. */
12699 int count = 0;
12700 int sub_count;
12701 tree field;
12703 /* Can't handle incomplete types nor sizes that are not
12704 fixed. */
12705 if (!COMPLETE_TYPE_P (type)
12706 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12707 return -1;
12709 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12711 if (TREE_CODE (field) != FIELD_DECL)
12712 continue;
12714 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12715 if (sub_count < 0)
12716 return -1;
12717 count = count > sub_count ? count : sub_count;
12720 /* There must be no padding. */
12721 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12722 count * GET_MODE_BITSIZE (*modep)))
12723 return -1;
12725 return count;
12728 default:
12729 break;
12732 return -1;
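/* Editorial examples (not part of the original source) for the
   classification above: "struct { float x, y, z; }" yields 3 with
   *MODEP == SFmode (a homogeneous floating-point aggregate);
   "struct { float f; double d; }" yields -1 because the element modes
   differ; an array of two 16-byte vectors yields 2, since vectors are
   compared by size only (V2SImode/V4SImode stand in for all 64-bit and
   128-bit vector types).  */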
12735 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12736 type as described in AAPCS64 \S 4.1.2.
12738 See the comment above aarch64_composite_type_p for the notes on MODE. */
12740 static bool
12741 aarch64_short_vector_p (const_tree type,
12742 machine_mode mode)
12744 poly_int64 size = -1;
12746 if (type && TREE_CODE (type) == VECTOR_TYPE)
12747 size = int_size_in_bytes (type);
12748 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12749 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12750 size = GET_MODE_SIZE (mode);
12752 return known_eq (size, 8) || known_eq (size, 16);
12755 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12756 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12757 array types. The C99 floating-point complex types are also considered
12758 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12759 types, which are GCC extensions and out of the scope of AAPCS64, are
12760 treated as composite types here as well.
12762 Note that MODE itself is not sufficient in determining whether a type
12763 is such a composite type or not. This is because
12764 stor-layout.c:compute_record_mode may have already changed the MODE
12765 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12766 structure with only one field may have its MODE set to the mode of the
12767 field. Also an integer mode whose size matches the size of the
12768 RECORD_TYPE type may be used to substitute the original mode
12769 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12770 solely relied on. */
12772 static bool
12773 aarch64_composite_type_p (const_tree type,
12774 machine_mode mode)
12776 if (aarch64_short_vector_p (type, mode))
12777 return false;
12779 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12780 return true;
12782 if (mode == BLKmode
12783 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12784 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12785 return true;
12787 return false;
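/* Editorial examples (not part of the original source): "_Complex double"
   and "struct { double a, b; }" are composite types here, while a single
   16-byte Advanced SIMD vector such as int32x4_t is not, because it is
   already excluded as a short vector above.  */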
12790 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12791 shall be passed or returned in simd/fp register(s) (providing these
12792 parameter passing registers are available).
12794 Upon successful return, *COUNT returns the number of needed registers,
12795 *BASE_MODE returns the mode of the individual register and, when IS_HA
12796 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12797 floating-point aggregate or a homogeneous short-vector aggregate. */
12799 static bool
12800 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12801 const_tree type,
12802 machine_mode *base_mode,
12803 int *count,
12804 bool *is_ha)
12806 machine_mode new_mode = VOIDmode;
12807 bool composite_p = aarch64_composite_type_p (type, mode);
12809 if (is_ha != NULL) *is_ha = false;
12811 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12812 || aarch64_short_vector_p (type, mode))
12814 *count = 1;
12815 new_mode = mode;
12817 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12819 if (is_ha != NULL) *is_ha = true;
12820 *count = 2;
12821 new_mode = GET_MODE_INNER (mode);
12823 else if (type && composite_p)
12825 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12827 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12829 if (is_ha != NULL) *is_ha = true;
12830 *count = ag_count;
12832 else
12833 return false;
12835 else
12836 return false;
12838 *base_mode = new_mode;
12839 return true;
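/* Editorial examples (not part of the original source):
   "struct { double d[3]; }" gives *COUNT == 3, *BASE_MODE == DFmode and
   *IS_HA true; "_Complex float" gives *COUNT == 2 and *BASE_MODE == SFmode;
   "struct { double d[5]; }" fails because it exceeds HA_MAX_NUM_FLDS.  */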
12842 /* Implement TARGET_STRUCT_VALUE_RTX. */
12844 static rtx
12845 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12846 int incoming ATTRIBUTE_UNUSED)
12848 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12851 /* Implements target hook vector_mode_supported_p. */
12852 static bool
12853 aarch64_vector_mode_supported_p (machine_mode mode)
12855 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12856 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12859 /* Return appropriate SIMD container
12860 for MODE within a vector of WIDTH bits. */
12861 static machine_mode
12862 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12864 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12865 switch (mode)
12867 case E_DFmode:
12868 return VNx2DFmode;
12869 case E_SFmode:
12870 return VNx4SFmode;
12871 case E_HFmode:
12872 return VNx8HFmode;
12873 case E_DImode:
12874 return VNx2DImode;
12875 case E_SImode:
12876 return VNx4SImode;
12877 case E_HImode:
12878 return VNx8HImode;
12879 case E_QImode:
12880 return VNx16QImode;
12881 default:
12882 return word_mode;
12885 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12886 if (TARGET_SIMD)
12888 if (known_eq (width, 128))
12889 switch (mode)
12891 case E_DFmode:
12892 return V2DFmode;
12893 case E_SFmode:
12894 return V4SFmode;
12895 case E_HFmode:
12896 return V8HFmode;
12897 case E_SImode:
12898 return V4SImode;
12899 case E_HImode:
12900 return V8HImode;
12901 case E_QImode:
12902 return V16QImode;
12903 case E_DImode:
12904 return V2DImode;
12905 default:
12906 break;
12908 else
12909 switch (mode)
12911 case E_SFmode:
12912 return V2SFmode;
12913 case E_HFmode:
12914 return V4HFmode;
12915 case E_SImode:
12916 return V2SImode;
12917 case E_HImode:
12918 return V4HImode;
12919 case E_QImode:
12920 return V8QImode;
12921 default:
12922 break;
12925 return word_mode;
12928 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12929 static machine_mode
12930 aarch64_preferred_simd_mode (scalar_mode mode)
12932 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12933 return aarch64_simd_container_mode (mode, bits);
12936 /* Return a list of possible vector sizes for the vectorizer
12937 to iterate over. */
12938 static void
12939 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12941 if (TARGET_SVE)
12942 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12943 sizes->safe_push (16);
12944 sizes->safe_push (8);
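/* Editorial illustration (not part of the original source): without SVE,
   SImode elements get V4SImode as the preferred SIMD mode and the
   vectorizer iterates over vector sizes { 16, 8 }; with SVE enabled the
   preferred mode is VNx4SImode and the size list becomes
   { BYTES_PER_SVE_VECTOR, 16, 8 }.  */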
12947 /* Implement TARGET_MANGLE_TYPE. */
12949 static const char *
12950 aarch64_mangle_type (const_tree type)
12952 /* The AArch64 ABI documents say that "__va_list" has to be
12953 mangled as if it were in the "std" namespace. */
12954 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12955 return "St9__va_list";
12957 /* Half-precision float. */
12958 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12959 return "Dh";
12961 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12962 builtin types. */
12963 if (TYPE_NAME (type) != NULL)
12964 return aarch64_mangle_builtin_type (type);
12966 /* Use the default mangling. */
12967 return NULL;
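/* Editorial illustration (not part of the original source): with this
   hook, "void f (__fp16);" mangles as _Z1fDh, and a function taking
   __builtin_va_list uses the St9__va_list component, matching the
   AAPCS64 name mangling rules.  */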
12970 /* Find the first rtx_insn before INSN that will generate an assembly
12971 instruction. */
12973 static rtx_insn *
12974 aarch64_prev_real_insn (rtx_insn *insn)
12976 if (!insn)
12977 return NULL;
12981 insn = prev_real_insn (insn);
12983 while (insn && recog_memoized (insn) < 0);
12985 return insn;
12988 static bool
12989 is_madd_op (enum attr_type t1)
12991 unsigned int i;
12992 /* A number of these may be AArch32 only. */
12993 enum attr_type mlatypes[] = {
12994 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12995 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12996 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12999 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13001 if (t1 == mlatypes[i])
13002 return true;
13005 return false;
13008 /* Check if there is a register dependency between a load and the insn
13009 for which we hold recog_data. */
13011 static bool
13012 dep_between_memop_and_curr (rtx memop)
13014 rtx load_reg;
13015 int opno;
13017 gcc_assert (GET_CODE (memop) == SET);
13019 if (!REG_P (SET_DEST (memop)))
13020 return false;
13022 load_reg = SET_DEST (memop);
13023 for (opno = 1; opno < recog_data.n_operands; opno++)
13025 rtx operand = recog_data.operand[opno];
13026 if (REG_P (operand)
13027 && reg_overlap_mentioned_p (load_reg, operand))
13028 return true;
13031 return false;
13035 /* When working around the Cortex-A53 erratum 835769,
13036 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13037 instruction and has a preceding memory instruction such that a NOP
13038 should be inserted between them. */
13040 bool
13041 aarch64_madd_needs_nop (rtx_insn* insn)
13043 enum attr_type attr_type;
13044 rtx_insn *prev;
13045 rtx body;
13047 if (!TARGET_FIX_ERR_A53_835769)
13048 return false;
13050 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13051 return false;
13053 attr_type = get_attr_type (insn);
13054 if (!is_madd_op (attr_type))
13055 return false;
13057 prev = aarch64_prev_real_insn (insn);
13058 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13059 Restore recog state to INSN to avoid state corruption. */
13060 extract_constrain_insn_cached (insn);
13062 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13063 return false;
13065 body = single_set (prev);
13067 /* If the previous insn is a memory op and there is no dependency between
13068 it and the DImode madd, emit a NOP between them. If body is NULL then we
13069 have a complex memory operation, probably a load/store pair.
13070 Be conservative for now and emit a NOP. */
13071 if (GET_MODE (recog_data.operand[0]) == DImode
13072 && (!body || !dep_between_memop_and_curr (body)))
13073 return true;
13075 return false;
13080 /* Implement FINAL_PRESCAN_INSN. */
13082 void
13083 aarch64_final_prescan_insn (rtx_insn *insn)
13085 if (aarch64_madd_needs_nop (insn))
13086 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
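/* Editorial illustration (not part of the original source): with
   -mfix-cortex-a53-835769, a 64-bit multiply-accumulate that directly
   follows a memory operation is separated by the NOP emitted above,
   giving assembly roughly of the form

       ldr     x2, [x10]
       nop     // between mem op and mult-accumulate
       madd    x0, x1, x3, x4
*/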
13090 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13091 instruction. */
13093 bool
13094 aarch64_sve_index_immediate_p (rtx base_or_step)
13096 return (CONST_INT_P (base_or_step)
13097 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13100 /* Return true if X is a valid immediate for the SVE ADD and SUB
13101 instructions. Negate X first if NEGATE_P is true. */
13103 bool
13104 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13106 rtx elt;
13108 if (!const_vec_duplicate_p (x, &elt)
13109 || !CONST_INT_P (elt))
13110 return false;
13112 HOST_WIDE_INT val = INTVAL (elt);
13113 if (negate_p)
13114 val = -val;
13115 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13117 if (val & 0xff)
13118 return IN_RANGE (val, 0, 0xff);
13119 return IN_RANGE (val, 0, 0xff00);
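/* Editorial examples (not part of the original source): a vector of
   SImode elements all equal to 3, or all equal to 768 (0x300, a multiple
   of 256 no larger than 0xff00), is accepted, while a duplicate of 0x101
   is rejected because it needs both the low and the high byte.  */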
13122 /* Return true if X is a valid immediate operand for an SVE logical
13123 instruction such as AND. */
13125 bool
13126 aarch64_sve_bitmask_immediate_p (rtx x)
13128 rtx elt;
13130 return (const_vec_duplicate_p (x, &elt)
13131 && CONST_INT_P (elt)
13132 && aarch64_bitmask_imm (INTVAL (elt),
13133 GET_MODE_INNER (GET_MODE (x))));
13136 /* Return true if X is a valid immediate for the SVE DUP and CPY
13137 instructions. */
13139 bool
13140 aarch64_sve_dup_immediate_p (rtx x)
13142 rtx elt;
13144 if (!const_vec_duplicate_p (x, &elt)
13145 || !CONST_INT_P (elt))
13146 return false;
13148 HOST_WIDE_INT val = INTVAL (elt);
13149 if (val & 0xff)
13150 return IN_RANGE (val, -0x80, 0x7f);
13151 return IN_RANGE (val, -0x8000, 0x7f00);
13154 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13155 SIGNED_P says whether the operand is signed rather than unsigned. */
13157 bool
13158 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13160 rtx elt;
13162 return (const_vec_duplicate_p (x, &elt)
13163 && CONST_INT_P (elt)
13164 && (signed_p
13165 ? IN_RANGE (INTVAL (elt), -16, 15)
13166 : IN_RANGE (INTVAL (elt), 0, 127)));
13169 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13170 instruction. Negate X first if NEGATE_P is true. */
13172 bool
13173 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13175 rtx elt;
13176 REAL_VALUE_TYPE r;
13178 if (!const_vec_duplicate_p (x, &elt)
13179 || GET_CODE (elt) != CONST_DOUBLE)
13180 return false;
13182 r = *CONST_DOUBLE_REAL_VALUE (elt);
13184 if (negate_p)
13185 r = real_value_negate (&r);
13187 if (real_equal (&r, &dconst1))
13188 return true;
13189 if (real_equal (&r, &dconsthalf))
13190 return true;
13191 return false;
13194 /* Return true if X is a valid immediate operand for an SVE FMUL
13195 instruction. */
13197 bool
13198 aarch64_sve_float_mul_immediate_p (rtx x)
13200 rtx elt;
13202 /* GCC will never generate a multiply with an immediate of 2, so there is no
13203 point testing for it (even though it is a valid constant). */
13204 return (const_vec_duplicate_p (x, &elt)
13205 && GET_CODE (elt) == CONST_DOUBLE
13206 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13209 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13210 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13211 is nonnull, use it to describe valid immediates. */
13212 static bool
13213 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13214 simd_immediate_info *info,
13215 enum simd_immediate_check which,
13216 simd_immediate_info::insn_type insn)
13218 /* Try a 4-byte immediate with LSL. */
13219 for (unsigned int shift = 0; shift < 32; shift += 8)
13220 if ((val32 & (0xff << shift)) == val32)
13222 if (info)
13223 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13224 simd_immediate_info::LSL, shift);
13225 return true;
13228 /* Try a 2-byte immediate with LSL. */
13229 unsigned int imm16 = val32 & 0xffff;
13230 if (imm16 == (val32 >> 16))
13231 for (unsigned int shift = 0; shift < 16; shift += 8)
13232 if ((imm16 & (0xff << shift)) == imm16)
13234 if (info)
13235 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13236 simd_immediate_info::LSL, shift);
13237 return true;
13240 /* Try a 4-byte immediate with MSL, except for cases that MVN
13241 can handle. */
13242 if (which == AARCH64_CHECK_MOV)
13243 for (unsigned int shift = 8; shift < 24; shift += 8)
13245 unsigned int low = (1 << shift) - 1;
13246 if (((val32 & (0xff << shift)) | low) == val32)
13248 if (info)
13249 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13250 simd_immediate_info::MSL, shift);
13251 return true;
13255 return false;
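/* Editorial examples (not part of the original source): VAL32 ==
   0x00ab0000 matches the 4-byte LSL case (0xab shifted left by 16);
   VAL32 == 0x00ab00ab matches the 2-byte LSL case; VAL32 == 0x0003ffff
   matches the MSL ("shifting ones") case with shift 16, which is only
   tried for AARCH64_CHECK_MOV.  */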
13258 /* Return true if replicating VAL64 is a valid immediate for the
13259 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13260 use it to describe valid immediates. */
13261 static bool
13262 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13263 simd_immediate_info *info,
13264 enum simd_immediate_check which)
13266 unsigned int val32 = val64 & 0xffffffff;
13267 unsigned int val16 = val64 & 0xffff;
13268 unsigned int val8 = val64 & 0xff;
13270 if (val32 == (val64 >> 32))
13272 if ((which & AARCH64_CHECK_ORR) != 0
13273 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13274 simd_immediate_info::MOV))
13275 return true;
13277 if ((which & AARCH64_CHECK_BIC) != 0
13278 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13279 simd_immediate_info::MVN))
13280 return true;
13282 /* Try using a replicated byte. */
13283 if (which == AARCH64_CHECK_MOV
13284 && val16 == (val32 >> 16)
13285 && val8 == (val16 >> 8))
13287 if (info)
13288 *info = simd_immediate_info (QImode, val8);
13289 return true;
13293 /* Try using a bit-to-bytemask. */
13294 if (which == AARCH64_CHECK_MOV)
13296 unsigned int i;
13297 for (i = 0; i < 64; i += 8)
13299 unsigned char byte = (val64 >> i) & 0xff;
13300 if (byte != 0 && byte != 0xff)
13301 break;
13303 if (i == 64)
13305 if (info)
13306 *info = simd_immediate_info (DImode, val64);
13307 return true;
13310 return false;
13313 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13314 instruction. If INFO is nonnull, use it to describe valid immediates. */
13316 static bool
13317 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13318 simd_immediate_info *info)
13320 scalar_int_mode mode = DImode;
13321 unsigned int val32 = val64 & 0xffffffff;
13322 if (val32 == (val64 >> 32))
13324 mode = SImode;
13325 unsigned int val16 = val32 & 0xffff;
13326 if (val16 == (val32 >> 16))
13328 mode = HImode;
13329 unsigned int val8 = val16 & 0xff;
13330 if (val8 == (val16 >> 8))
13331 mode = QImode;
13334 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13335 if (IN_RANGE (val, -0x80, 0x7f))
13337 /* DUP with no shift. */
13338 if (info)
13339 *info = simd_immediate_info (mode, val);
13340 return true;
13342 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13344 /* DUP with LSL #8. */
13345 if (info)
13346 *info = simd_immediate_info (mode, val);
13347 return true;
13349 if (aarch64_bitmask_imm (val64, mode))
13351 /* DUPM. */
13352 if (info)
13353 *info = simd_immediate_info (mode, val);
13354 return true;
13356 return false;
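/* Editorial examples (not part of the original source): VAL64 ==
   0x2525252525252525 reduces to a QImode DUP of 0x25; VAL64 ==
   0xff00ff00ff00ff00 reduces to an HImode DUP with LSL #8 (the value
   truncates to -256); a replicated bitmask value such as
   0x00ff00ff00ff00ff falls through to the DUPM case.  */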
13359 /* Return true if OP is a valid SIMD immediate for the operation
13360 described by WHICH. If INFO is nonnull, use it to describe valid
13361 immediates. */
13362 bool
13363 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13364 enum simd_immediate_check which)
13366 machine_mode mode = GET_MODE (op);
13367 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13368 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13369 return false;
13371 scalar_mode elt_mode = GET_MODE_INNER (mode);
13372 rtx base, step;
13373 unsigned int n_elts;
13374 if (GET_CODE (op) == CONST_VECTOR
13375 && CONST_VECTOR_DUPLICATE_P (op))
13376 n_elts = CONST_VECTOR_NPATTERNS (op);
13377 else if ((vec_flags & VEC_SVE_DATA)
13378 && const_vec_series_p (op, &base, &step))
13380 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13381 if (!aarch64_sve_index_immediate_p (base)
13382 || !aarch64_sve_index_immediate_p (step))
13383 return false;
13385 if (info)
13386 *info = simd_immediate_info (elt_mode, base, step);
13387 return true;
13389 else if (GET_CODE (op) == CONST_VECTOR
13390 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13391 /* N_ELTS set above. */;
13392 else
13393 return false;
13395 /* Handle PFALSE and PTRUE. */
13396 if (vec_flags & VEC_SVE_PRED)
13397 return (op == CONST0_RTX (mode)
13398 || op == CONSTM1_RTX (mode));
13400 scalar_float_mode elt_float_mode;
13401 if (n_elts == 1
13402 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13404 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13405 if (aarch64_float_const_zero_rtx_p (elt)
13406 || aarch64_float_const_representable_p (elt))
13408 if (info)
13409 *info = simd_immediate_info (elt_float_mode, elt);
13410 return true;
13414 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13415 if (elt_size > 8)
13416 return false;
13418 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13420 /* Expand the vector constant out into a byte vector, with the least
13421 significant byte of the register first. */
13422 auto_vec<unsigned char, 16> bytes;
13423 bytes.reserve (n_elts * elt_size);
13424 for (unsigned int i = 0; i < n_elts; i++)
13426 /* The vector is provided in GCC's endian-neutral fashion.
13427 For aarch64_be Advanced SIMD, it must be laid out in the vector
13428 register in reverse order. */
13429 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13430 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13432 if (elt_mode != elt_int_mode)
13433 elt = gen_lowpart (elt_int_mode, elt);
13435 if (!CONST_INT_P (elt))
13436 return false;
13438 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13439 for (unsigned int byte = 0; byte < elt_size; byte++)
13441 bytes.quick_push (elt_val & 0xff);
13442 elt_val >>= BITS_PER_UNIT;
13446 /* The immediate must repeat every eight bytes. */
13447 unsigned int nbytes = bytes.length ();
13448 for (unsigned i = 8; i < nbytes; ++i)
13449 if (bytes[i] != bytes[i - 8])
13450 return false;
13452 /* Get the repeating 8-byte value as an integer. No endian correction
13453 is needed here because bytes is already in lsb-first order. */
13454 unsigned HOST_WIDE_INT val64 = 0;
13455 for (unsigned int i = 0; i < 8; i++)
13456 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13457 << (i * BITS_PER_UNIT));
13459 if (vec_flags & VEC_SVE_DATA)
13460 return aarch64_sve_valid_immediate (val64, info);
13461 else
13462 return aarch64_advsimd_valid_immediate (val64, info, which);
13465 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13466 has a step in the range of INDEX. Return the index expression if so,
13467 otherwise return null. */
13469 aarch64_check_zero_based_sve_index_immediate (rtx x)
13471 rtx base, step;
13472 if (const_vec_series_p (x, &base, &step)
13473 && base == const0_rtx
13474 && aarch64_sve_index_immediate_p (step))
13475 return step;
13476 return NULL_RTX;
13479 /* Check if immediate shift constants are within range. */
13480 bool
13481 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13483 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13484 if (left)
13485 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13486 else
13487 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13490 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13491 operation of width WIDTH at bit position POS. */
13494 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13496 gcc_assert (CONST_INT_P (width));
13497 gcc_assert (CONST_INT_P (pos));
13499 unsigned HOST_WIDE_INT mask
13500 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13501 return GEN_INT (mask << UINTVAL (pos));
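/* Editorial illustration (not part of the original source): for a zero
   extract of WIDTH 8 at POS 16 this yields ((1 << 8) - 1) << 16,
   i.e. the CONST_INT 0x00ff0000.  */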
13504 bool
13505 aarch64_mov_operand_p (rtx x, machine_mode mode)
13507 if (GET_CODE (x) == HIGH
13508 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13509 return true;
13511 if (CONST_INT_P (x))
13512 return true;
13514 if (VECTOR_MODE_P (GET_MODE (x)))
13515 return aarch64_simd_valid_immediate (x, NULL);
13517 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13518 return true;
13520 if (aarch64_sve_cnt_immediate_p (x))
13521 return true;
13523 return aarch64_classify_symbolic_expression (x)
13524 == SYMBOL_TINY_ABSOLUTE;
13527 /* Return a const_int vector of VAL. */
13529 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13531 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13532 return gen_const_vec_duplicate (mode, c);
13535 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13537 bool
13538 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13540 machine_mode vmode;
13542 vmode = aarch64_simd_container_mode (mode, 64);
13543 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13544 return aarch64_simd_valid_immediate (op_v, NULL);
13547 /* Construct and return a PARALLEL RTX vector with elements numbering the
13548 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13549 the vector - from the perspective of the architecture. This does not
13550 line up with GCC's perspective on lane numbers, so we end up with
13551 different masks depending on our target endianness. The diagram
13552 below may help. We must draw the distinction when building masks
13553 which select one half of the vector. An instruction selecting
13554 architectural low-lanes for a big-endian target must be described using
13555 a mask selecting GCC high-lanes.
13557 Big-Endian Little-Endian
13559 GCC 0 1 2 3 3 2 1 0
13560 | x | x | x | x | | x | x | x | x |
13561 Architecture 3 2 1 0 3 2 1 0
13563 Low Mask: { 2, 3 } { 0, 1 }
13564 High Mask: { 0, 1 } { 2, 3 }
13566 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13569 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13571 rtvec v = rtvec_alloc (nunits / 2);
13572 int high_base = nunits / 2;
13573 int low_base = 0;
13574 int base;
13575 rtx t1;
13576 int i;
13578 if (BYTES_BIG_ENDIAN)
13579 base = high ? low_base : high_base;
13580 else
13581 base = high ? high_base : low_base;
13583 for (i = 0; i < nunits / 2; i++)
13584 RTVEC_ELT (v, i) = GEN_INT (base + i);
13586 t1 = gen_rtx_PARALLEL (mode, v);
13587 return t1;
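/* As an example of the diagram above: for V4SImode with NUNITS == 4 and
   HIGH == true this returns the PARALLEL { 2, 3 } on little-endian but
   { 0, 1 } on big-endian, matching the "High Mask" row.  */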
13590 /* Check OP for validity as a PARALLEL RTX vector with elements
13591 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13592 from the perspective of the architecture. See the diagram above
13593 aarch64_simd_vect_par_cnst_half for more details. */
13595 bool
13596 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13597 bool high)
13599 int nelts;
13600 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13601 return false;
13603 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13604 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13605 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13606 int i = 0;
13608 if (count_op != count_ideal)
13609 return false;
13611 for (i = 0; i < count_ideal; i++)
13613 rtx elt_op = XVECEXP (op, 0, i);
13614 rtx elt_ideal = XVECEXP (ideal, 0, i);
13616 if (!CONST_INT_P (elt_op)
13617 || INTVAL (elt_ideal) != INTVAL (elt_op))
13618 return false;
13620 return true;
13623 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13624 HIGH (exclusive). */
13625 void
13626 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13627 const_tree exp)
13629 HOST_WIDE_INT lane;
13630 gcc_assert (CONST_INT_P (operand));
13631 lane = INTVAL (operand);
13633 if (lane < low || lane >= high)
13635 if (exp)
13636 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13637 else
13638 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13642 /* Perform endian correction on lane number N, which indexes a vector
13643 of mode MODE, and return the result as an SImode rtx. */
13646 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13648 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
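/* For example, for V4SImode this yields lane N unchanged on little-endian
   but lane 3 - N on big-endian (assuming the usual ENDIAN_LANE_N
   definition in aarch64.h, which reverses lane numbers on big-endian
   targets).  */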
13651 /* Return TRUE if OP is a valid vector addressing mode. */
13653 bool
13654 aarch64_simd_mem_operand_p (rtx op)
13656 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13657 || REG_P (XEXP (op, 0)));
13660 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13662 bool
13663 aarch64_sve_ld1r_operand_p (rtx op)
13665 struct aarch64_address_info addr;
13666 scalar_mode mode;
13668 return (MEM_P (op)
13669 && is_a <scalar_mode> (GET_MODE (op), &mode)
13670 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13671 && addr.type == ADDRESS_REG_IMM
13672 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
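/* In other words, the address must be a plain base register plus an
   unsigned immediate that is a multiple of the element size, i.e. the
   6-bit scaled offset range that LD1R itself accepts.  */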
13675 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13676 The conditions for STR are the same. */
13677 bool
13678 aarch64_sve_ldr_operand_p (rtx op)
13680 struct aarch64_address_info addr;
13682 return (MEM_P (op)
13683 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13684 false, ADDR_QUERY_ANY)
13685 && addr.type == ADDRESS_REG_IMM);
13688 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13689 We need to be able to access the individual pieces, so the range
13690 is different from LD[234] and ST[234]. */
13691 bool
13692 aarch64_sve_struct_memory_operand_p (rtx op)
13694 if (!MEM_P (op))
13695 return false;
13697 machine_mode mode = GET_MODE (op);
13698 struct aarch64_address_info addr;
13699 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13700 ADDR_QUERY_ANY)
13701 || addr.type != ADDRESS_REG_IMM)
13702 return false;
13704 poly_int64 first = addr.const_offset;
13705 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13706 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13707 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13710 /* Emit a register copy from operand to operand, taking care not to
13711 early-clobber source registers in the process.
13713 COUNT is the number of components into which the copy needs to be
13714 decomposed. */
13715 void
13716 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13717 unsigned int count)
13719 unsigned int i;
13720 int rdest = REGNO (operands[0]);
13721 int rsrc = REGNO (operands[1]);
13723 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13724 || rdest < rsrc)
13725 for (i = 0; i < count; i++)
13726 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13727 gen_rtx_REG (mode, rsrc + i));
13728 else
13729 for (i = 0; i < count; i++)
13730 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13731 gen_rtx_REG (mode, rsrc + count - i - 1));
13734 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13735 one of VSTRUCT modes: OI, CI, or XI. */
13737 aarch64_simd_attr_length_rglist (machine_mode mode)
13739 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13740 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13743 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13744 alignment of a vector to 128 bits. SVE predicates have an alignment of
13745 16 bits. */
13746 static HOST_WIDE_INT
13747 aarch64_simd_vector_alignment (const_tree type)
13749 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13750 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13751 be set for non-predicate vectors of booleans. Modes are the most
13752 direct way we have of identifying real SVE predicate types. */
13753 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13754 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13755 return MIN (align, 128);
13758 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13759 static HOST_WIDE_INT
13760 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13762 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13764 /* If the length of the vector is fixed, try to align to that length,
13765 otherwise don't try to align at all. */
13766 HOST_WIDE_INT result;
13767 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13768 result = TYPE_ALIGN (TREE_TYPE (type));
13769 return result;
13771 return TYPE_ALIGN (type);
13774 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13775 static bool
13776 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13778 if (is_packed)
13779 return false;
13781 /* For fixed-length vectors, check that the vectorizer will aim for
13782 full-vector alignment. This isn't true for generic GCC vectors
13783 that are wider than the ABI maximum of 128 bits. */
13784 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13785 && (wi::to_widest (TYPE_SIZE (type))
13786 != aarch64_vectorize_preferred_vector_alignment (type)))
13787 return false;
13789 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13790 return true;
13793 /* Return true if the vector misalignment factor is supported by the
13794 target. */
13795 static bool
13796 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13797 const_tree type, int misalignment,
13798 bool is_packed)
13800 if (TARGET_SIMD && STRICT_ALIGNMENT)
13802 /* Return false if the movmisalign pattern is not supported for this mode. */
13803 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13804 return false;
13806 /* Misalignment factor is unknown at compile time. */
13807 if (misalignment == -1)
13808 return false;
13810 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13811 is_packed);
13814 /* If VALS is a vector constant that can be loaded into a register
13815 using DUP, generate instructions to do so and return an RTX to
13816 assign to the register. Otherwise return NULL_RTX. */
13817 static rtx
13818 aarch64_simd_dup_constant (rtx vals)
13820 machine_mode mode = GET_MODE (vals);
13821 machine_mode inner_mode = GET_MODE_INNER (mode);
13822 rtx x;
13824 if (!const_vec_duplicate_p (vals, &x))
13825 return NULL_RTX;
13827 /* We can load this constant by using DUP and a constant in a
13828 single ARM register. This will be cheaper than a vector
13829 load. */
13830 x = copy_to_mode_reg (inner_mode, x);
13831 return gen_vec_duplicate (mode, x);
13835 /* Generate code to load VALS, which is a PARALLEL containing only
13836 constants (for vec_init) or CONST_VECTOR, efficiently into a
13837 register. Returns an RTX to copy into the register, or NULL_RTX
13838 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13839 static rtx
13840 aarch64_simd_make_constant (rtx vals)
13842 machine_mode mode = GET_MODE (vals);
13843 rtx const_dup;
13844 rtx const_vec = NULL_RTX;
13845 int n_const = 0;
13846 int i;
13848 if (GET_CODE (vals) == CONST_VECTOR)
13849 const_vec = vals;
13850 else if (GET_CODE (vals) == PARALLEL)
13852 /* A CONST_VECTOR must contain only CONST_INTs and
13853 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13854 Only store valid constants in a CONST_VECTOR. */
13855 int n_elts = XVECLEN (vals, 0);
13856 for (i = 0; i < n_elts; ++i)
13858 rtx x = XVECEXP (vals, 0, i);
13859 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13860 n_const++;
13862 if (n_const == n_elts)
13863 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13865 else
13866 gcc_unreachable ();
13868 if (const_vec != NULL_RTX
13869 && aarch64_simd_valid_immediate (const_vec, NULL))
13870 /* Load using MOVI/MVNI. */
13871 return const_vec;
13872 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13873 /* Loaded using DUP. */
13874 return const_dup;
13875 else if (const_vec != NULL_RTX)
13876 /* Load from constant pool. We cannot take advantage of single-cycle
13877 LD1 because we need a PC-relative addressing mode. */
13878 return const_vec;
13879 else
13880 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13881 We cannot construct an initializer. */
13882 return NULL_RTX;
13885 /* Expand a vector initialisation sequence, such that TARGET is
13886 initialised to contain VALS. */
13888 void
13889 aarch64_expand_vector_init (rtx target, rtx vals)
13891 machine_mode mode = GET_MODE (target);
13892 scalar_mode inner_mode = GET_MODE_INNER (mode);
13893 /* The number of vector elements. */
13894 int n_elts = XVECLEN (vals, 0);
13895 /* The number of vector elements which are not constant. */
13896 int n_var = 0;
13897 rtx any_const = NULL_RTX;
13898 /* The first element of vals. */
13899 rtx v0 = XVECEXP (vals, 0, 0);
13900 bool all_same = true;
13902 /* Count the number of variable elements to initialise. */
13903 for (int i = 0; i < n_elts; ++i)
13905 rtx x = XVECEXP (vals, 0, i);
13906 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13907 ++n_var;
13908 else
13909 any_const = x;
13911 all_same &= rtx_equal_p (x, v0);
13914 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13915 how best to handle this. */
13916 if (n_var == 0)
13918 rtx constant = aarch64_simd_make_constant (vals);
13919 if (constant != NULL_RTX)
13921 emit_move_insn (target, constant);
13922 return;
13926 /* Splat a single non-constant element if we can. */
13927 if (all_same)
13929 rtx x = copy_to_mode_reg (inner_mode, v0);
13930 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13931 return;
13934 enum insn_code icode = optab_handler (vec_set_optab, mode);
13935 gcc_assert (icode != CODE_FOR_nothing);
13937 /* If there are only variable elements, try to optimize
13938 the insertion using dup for the most common element
13939 followed by insertions. */
13941 /* The algorithm will fill matches[*][0] with the earliest matching element,
13942 and matches[X][1] with the count of duplicate elements (if X is the
13943 earliest element which has duplicates). */
13945 if (n_var == n_elts && n_elts <= 16)
13947 int matches[16][2] = {0};
13948 for (int i = 0; i < n_elts; i++)
13950 for (int j = 0; j <= i; j++)
13952 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13954 matches[i][0] = j;
13955 matches[j][1]++;
13956 break;
13960 int maxelement = 0;
13961 int maxv = 0;
13962 for (int i = 0; i < n_elts; i++)
13963 if (matches[i][1] > maxv)
13965 maxelement = i;
13966 maxv = matches[i][1];
13969 /* Create a duplicate of the most common element, unless all elements
13970 are equally useless to us, in which case just immediately set the
13971 vector register using the first element. */
13973 if (maxv == 1)
13975 /* For vectors of two 64-bit elements, we can do even better. */
13976 if (n_elts == 2
13977 && (inner_mode == E_DImode
13978 || inner_mode == E_DFmode))
13981 rtx x0 = XVECEXP (vals, 0, 0);
13982 rtx x1 = XVECEXP (vals, 0, 1);
13983 /* Combine can pick up this case, but handling it directly
13984 here leaves clearer RTL.
13986 This is load_pair_lanes<mode>, and also gives us a clean-up
13987 for store_pair_lanes<mode>. */
13988 if (memory_operand (x0, inner_mode)
13989 && memory_operand (x1, inner_mode)
13990 && !STRICT_ALIGNMENT
13991 && rtx_equal_p (XEXP (x1, 0),
13992 plus_constant (Pmode,
13993 XEXP (x0, 0),
13994 GET_MODE_SIZE (inner_mode))))
13996 rtx t;
13997 if (inner_mode == DFmode)
13998 t = gen_load_pair_lanesdf (target, x0, x1);
13999 else
14000 t = gen_load_pair_lanesdi (target, x0, x1);
14001 emit_insn (t);
14002 return;
14005 /* The subreg-move sequence below will move into lane zero of the
14006 vector register. For big-endian we want that position to hold
14007 the last element of VALS. */
14008 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14009 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14010 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14012 else
14014 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14015 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14018 /* Insert the rest. */
14019 for (int i = 0; i < n_elts; i++)
14021 rtx x = XVECEXP (vals, 0, i);
14022 if (matches[i][0] == maxelement)
14023 continue;
14024 x = copy_to_mode_reg (inner_mode, x);
14025 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14027 return;
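/* A worked example of the matching loop above: for the all-variable
   V4SImode initialiser { x, y, x, x }, matches[0][1] ends up as 3, so x is
   broadcast into every lane first and only lane 1 then needs an insert of
   y; lanes 0, 2 and 3 are skipped because their matches[i][0] equals
   maxelement.  */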
14030 /* Initialise a vector which is part-variable. We want to first try
14031 to build those lanes which are constant in the most efficient way we
14032 can. */
14033 if (n_var != n_elts)
14035 rtx copy = copy_rtx (vals);
14037 /* Load constant part of vector. We really don't care what goes into the
14038 parts we will overwrite, but we're more likely to be able to load the
14039 constant efficiently if it has fewer, larger, repeating parts
14040 (see aarch64_simd_valid_immediate). */
14041 for (int i = 0; i < n_elts; i++)
14043 rtx x = XVECEXP (vals, 0, i);
14044 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14045 continue;
14046 rtx subst = any_const;
14047 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14049 /* Look in the copied vector, as more elements are const. */
14050 rtx test = XVECEXP (copy, 0, i ^ bit);
14051 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14053 subst = test;
14054 break;
14057 XVECEXP (copy, 0, i) = subst;
14059 aarch64_expand_vector_init (target, copy);
14062 /* Insert the variable lanes directly. */
14063 for (int i = 0; i < n_elts; i++)
14065 rtx x = XVECEXP (vals, 0, i);
14066 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14067 continue;
14068 x = copy_to_mode_reg (inner_mode, x);
14069 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14073 static unsigned HOST_WIDE_INT
14074 aarch64_shift_truncation_mask (machine_mode mode)
14076 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14077 return 0;
14078 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14081 /* Select a format to encode pointers in exception handling data. */
14083 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14085 int type;
14086 switch (aarch64_cmodel)
14088 case AARCH64_CMODEL_TINY:
14089 case AARCH64_CMODEL_TINY_PIC:
14090 case AARCH64_CMODEL_SMALL:
14091 case AARCH64_CMODEL_SMALL_PIC:
14092 case AARCH64_CMODEL_SMALL_SPIC:
14093 /* text+got+data < 4GB. 4-byte signed relocs are sufficient
14094 for everything. */
14095 type = DW_EH_PE_sdata4;
14096 break;
14097 default:
14098 /* No assumptions here. 8-byte relocs required. */
14099 type = DW_EH_PE_sdata8;
14100 break;
14102 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14105 /* The last .arch and .tune assembly strings that we printed. */
14106 static std::string aarch64_last_printed_arch_string;
14107 static std::string aarch64_last_printed_tune_string;
14109 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14110 by the function fndecl. */
14112 void
14113 aarch64_declare_function_name (FILE *stream, const char* name,
14114 tree fndecl)
14116 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14118 struct cl_target_option *targ_options;
14119 if (target_parts)
14120 targ_options = TREE_TARGET_OPTION (target_parts);
14121 else
14122 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14123 gcc_assert (targ_options);
14125 const struct processor *this_arch
14126 = aarch64_get_arch (targ_options->x_explicit_arch);
14128 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14129 std::string extension
14130 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14131 this_arch->flags);
14132 /* Only update the assembler .arch string if it is distinct from the last
14133 such string we printed. */
14134 std::string to_print = this_arch->name + extension;
14135 if (to_print != aarch64_last_printed_arch_string)
14137 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14138 aarch64_last_printed_arch_string = to_print;
14141 /* Print the CPU name we're tuning for in the comments; it might be
14142 useful to readers of the generated asm. Do it only when it changes
14143 from function to function and verbose assembly is requested. */
14144 const struct processor *this_tune
14145 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14147 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14149 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14150 this_tune->name);
14151 aarch64_last_printed_tune_string = this_tune->name;
14154 /* Don't forget the type directive for ELF. */
14155 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14156 ASM_OUTPUT_LABEL (stream, name);
14159 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14161 static void
14162 aarch64_start_file (void)
14164 struct cl_target_option *default_options
14165 = TREE_TARGET_OPTION (target_option_default_node);
14167 const struct processor *default_arch
14168 = aarch64_get_arch (default_options->x_explicit_arch);
14169 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14170 std::string extension
14171 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14172 default_arch->flags);
14174 aarch64_last_printed_arch_string = default_arch->name + extension;
14175 aarch64_last_printed_tune_string = "";
14176 asm_fprintf (asm_out_file, "\t.arch %s\n",
14177 aarch64_last_printed_arch_string.c_str ());
14179 default_file_start ();
14182 /* Emit load exclusive. */
14184 static void
14185 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14186 rtx mem, rtx model_rtx)
14188 rtx (*gen) (rtx, rtx, rtx);
14190 switch (mode)
14192 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14193 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14194 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14195 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14196 default:
14197 gcc_unreachable ();
14200 emit_insn (gen (rval, mem, model_rtx));
14203 /* Emit store exclusive. */
14205 static void
14206 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14207 rtx rval, rtx mem, rtx model_rtx)
14209 rtx (*gen) (rtx, rtx, rtx, rtx);
14211 switch (mode)
14213 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14214 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14215 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14216 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14217 default:
14218 gcc_unreachable ();
14221 emit_insn (gen (bval, rval, mem, model_rtx));
14224 /* Mark the previous jump instruction as unlikely. */
14226 static void
14227 aarch64_emit_unlikely_jump (rtx insn)
14229 rtx_insn *jump = emit_jump_insn (insn);
14230 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14233 /* Expand a compare and swap pattern. */
14235 void
14236 aarch64_expand_compare_and_swap (rtx operands[])
14238 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14239 machine_mode mode, cmp_mode;
14240 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14241 int idx;
14242 gen_cas_fn gen;
14243 const gen_cas_fn split_cas[] =
14245 gen_aarch64_compare_and_swapqi,
14246 gen_aarch64_compare_and_swaphi,
14247 gen_aarch64_compare_and_swapsi,
14248 gen_aarch64_compare_and_swapdi
14250 const gen_cas_fn atomic_cas[] =
14252 gen_aarch64_compare_and_swapqi_lse,
14253 gen_aarch64_compare_and_swaphi_lse,
14254 gen_aarch64_compare_and_swapsi_lse,
14255 gen_aarch64_compare_and_swapdi_lse
14258 bval = operands[0];
14259 rval = operands[1];
14260 mem = operands[2];
14261 oldval = operands[3];
14262 newval = operands[4];
14263 is_weak = operands[5];
14264 mod_s = operands[6];
14265 mod_f = operands[7];
14266 mode = GET_MODE (mem);
14267 cmp_mode = mode;
14269 /* Normally the succ memory model must be stronger than fail, but in the
14270 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14271 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14273 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14274 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14275 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14277 switch (mode)
14279 case E_QImode:
14280 case E_HImode:
14281 /* For short modes, we're going to perform the comparison in SImode,
14282 so do the zero-extension now. */
14283 cmp_mode = SImode;
14284 rval = gen_reg_rtx (SImode);
14285 oldval = convert_modes (SImode, mode, oldval, true);
14286 /* Fall through. */
14288 case E_SImode:
14289 case E_DImode:
14290 /* Force the value into a register if needed. */
14291 if (!aarch64_plus_operand (oldval, mode))
14292 oldval = force_reg (cmp_mode, oldval);
14293 break;
14295 default:
14296 gcc_unreachable ();
14299 switch (mode)
14301 case E_QImode: idx = 0; break;
14302 case E_HImode: idx = 1; break;
14303 case E_SImode: idx = 2; break;
14304 case E_DImode: idx = 3; break;
14305 default:
14306 gcc_unreachable ();
14308 if (TARGET_LSE)
14309 gen = atomic_cas[idx];
14310 else
14311 gen = split_cas[idx];
14313 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14315 if (mode == QImode || mode == HImode)
14316 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14318 x = gen_rtx_REG (CCmode, CC_REGNUM);
14319 x = gen_rtx_EQ (SImode, x, const0_rtx);
14320 emit_insn (gen_rtx_SET (bval, x));
14323 /* Test whether the target supports using an atomic load-operate instruction.
14324 CODE is the operation. Returns FALSE if the operation isn't supported by
14325 the architecture. */
14329 bool
14330 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14332 if (!TARGET_LSE)
14333 return false;
14335 switch (code)
14337 case SET:
14338 case AND:
14339 case IOR:
14340 case XOR:
14341 case MINUS:
14342 case PLUS:
14343 return true;
14344 default:
14345 return false;
14349 /* Emit a barrier appropriate for memory model MODEL at the end of a
14350 sequence implementing an atomic operation. */
14352 static void
14353 aarch64_emit_post_barrier (enum memmodel model)
14355 const enum memmodel base_model = memmodel_base (model);
14357 if (is_mm_sync (model)
14358 && (base_model == MEMMODEL_ACQUIRE
14359 || base_model == MEMMODEL_ACQ_REL
14360 || base_model == MEMMODEL_SEQ_CST))
14362 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14366 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14367 for the data in memory. EXPECTED is the value expected to be in memory.
14368 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14369 is the memory ordering to use. */
14371 void
14372 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14373 rtx expected, rtx desired,
14374 rtx model)
14376 rtx (*gen) (rtx, rtx, rtx, rtx);
14377 machine_mode mode;
14379 mode = GET_MODE (mem);
14381 switch (mode)
14383 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14384 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14385 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14386 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14387 default:
14388 gcc_unreachable ();
14391 /* Move the expected value into the CAS destination register. */
14392 emit_insn (gen_rtx_SET (rval, expected));
14394 /* Emit the CAS. */
14395 emit_insn (gen (rval, mem, desired, model));
14397 /* Compare the expected value with the value loaded by the CAS, to establish
14398 whether the swap was made. */
14399 aarch64_gen_compare_reg (EQ, rval, expected);
14402 /* Split a compare and swap pattern. */
14404 void
14405 aarch64_split_compare_and_swap (rtx operands[])
14407 rtx rval, mem, oldval, newval, scratch;
14408 machine_mode mode;
14409 bool is_weak;
14410 rtx_code_label *label1, *label2;
14411 rtx x, cond;
14412 enum memmodel model;
14413 rtx model_rtx;
14415 rval = operands[0];
14416 mem = operands[1];
14417 oldval = operands[2];
14418 newval = operands[3];
14419 is_weak = (operands[4] != const0_rtx);
14420 model_rtx = operands[5];
14421 scratch = operands[7];
14422 mode = GET_MODE (mem);
14423 model = memmodel_from_int (INTVAL (model_rtx));
14425 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14426 loop:
14427 .label1:
14428 LD[A]XR rval, [mem]
14429 CBNZ rval, .label2
14430 ST[L]XR scratch, newval, [mem]
14431 CBNZ scratch, .label1
14432 .label2:
14433 CMP rval, 0. */
14434 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14436 label1 = NULL;
14437 if (!is_weak)
14439 label1 = gen_label_rtx ();
14440 emit_label (label1);
14442 label2 = gen_label_rtx ();
14444 /* The initial load can be relaxed for a __sync operation since a final
14445 barrier will be emitted to stop code hoisting. */
14446 if (is_mm_sync (model))
14447 aarch64_emit_load_exclusive (mode, rval, mem,
14448 GEN_INT (MEMMODEL_RELAXED));
14449 else
14450 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14452 if (strong_zero_p)
14454 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14455 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14456 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14457 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14459 else
14461 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14462 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14463 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14464 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14465 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14468 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14470 if (!is_weak)
14472 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14473 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14474 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14475 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14477 else
14479 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14480 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14481 emit_insn (gen_rtx_SET (cond, x));
14484 emit_label (label2);
14485 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
14486 to set the condition flags. If this is not used it will be removed by
14487 later passes. */
14488 if (strong_zero_p)
14490 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14491 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14492 emit_insn (gen_rtx_SET (cond, x));
14494 /* Emit any final barrier needed for a __sync operation. */
14495 if (is_mm_sync (model))
14496 aarch64_emit_post_barrier (model);
14499 /* Emit a BIC instruction. */
14501 static void
14502 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14504 rtx shift_rtx = GEN_INT (shift);
14505 rtx (*gen) (rtx, rtx, rtx, rtx);
14507 switch (mode)
14509 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14510 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14511 default:
14512 gcc_unreachable ();
14515 emit_insn (gen (dst, s2, shift_rtx, s1));
14518 /* Emit an atomic swap. */
14520 static void
14521 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14522 rtx mem, rtx model)
14524 rtx (*gen) (rtx, rtx, rtx, rtx);
14526 switch (mode)
14528 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14529 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14530 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14531 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14532 default:
14533 gcc_unreachable ();
14536 emit_insn (gen (dst, mem, value, model));
14539 /* Operations supported by aarch64_emit_atomic_load_op. */
14541 enum aarch64_atomic_load_op_code
14543 AARCH64_LDOP_PLUS, /* A + B */
14544 AARCH64_LDOP_XOR, /* A ^ B */
14545 AARCH64_LDOP_OR, /* A | B */
14546 AARCH64_LDOP_BIC /* A & ~B */
14549 /* Emit an atomic load-operate. */
14551 static void
14552 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14553 machine_mode mode, rtx dst, rtx src,
14554 rtx mem, rtx model)
14556 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14557 const aarch64_atomic_load_op_fn plus[] =
14559 gen_aarch64_atomic_loadaddqi,
14560 gen_aarch64_atomic_loadaddhi,
14561 gen_aarch64_atomic_loadaddsi,
14562 gen_aarch64_atomic_loadadddi
14564 const aarch64_atomic_load_op_fn eor[] =
14566 gen_aarch64_atomic_loadeorqi,
14567 gen_aarch64_atomic_loadeorhi,
14568 gen_aarch64_atomic_loadeorsi,
14569 gen_aarch64_atomic_loadeordi
14571 const aarch64_atomic_load_op_fn ior[] =
14573 gen_aarch64_atomic_loadsetqi,
14574 gen_aarch64_atomic_loadsethi,
14575 gen_aarch64_atomic_loadsetsi,
14576 gen_aarch64_atomic_loadsetdi
14578 const aarch64_atomic_load_op_fn bic[] =
14580 gen_aarch64_atomic_loadclrqi,
14581 gen_aarch64_atomic_loadclrhi,
14582 gen_aarch64_atomic_loadclrsi,
14583 gen_aarch64_atomic_loadclrdi
14585 aarch64_atomic_load_op_fn gen;
14586 int idx = 0;
14588 switch (mode)
14590 case E_QImode: idx = 0; break;
14591 case E_HImode: idx = 1; break;
14592 case E_SImode: idx = 2; break;
14593 case E_DImode: idx = 3; break;
14594 default:
14595 gcc_unreachable ();
14598 switch (code)
14600 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14601 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14602 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14603 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14604 default:
14605 gcc_unreachable ();
14608 emit_insn (gen (dst, mem, src, model));
14611 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14612 location to store the data read from memory. OUT_RESULT is the location to
14613 store the result of the operation. MEM is the memory location to read and
14614 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14615 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14616 be NULL. */
14618 void
14619 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14620 rtx mem, rtx value, rtx model_rtx)
14622 machine_mode mode = GET_MODE (mem);
14623 machine_mode wmode = (mode == DImode ? DImode : SImode);
14624 const bool short_mode = (mode < SImode);
14625 aarch64_atomic_load_op_code ldop_code;
14626 rtx src;
14627 rtx x;
14629 if (out_data)
14630 out_data = gen_lowpart (mode, out_data);
14632 if (out_result)
14633 out_result = gen_lowpart (mode, out_result);
14635 /* Make sure the value is in a register, putting it into a destination
14636 register if it needs to be manipulated. */
14637 if (!register_operand (value, mode)
14638 || code == AND || code == MINUS)
14640 src = out_result ? out_result : out_data;
14641 emit_move_insn (src, gen_lowpart (mode, value));
14643 else
14644 src = value;
14645 gcc_assert (register_operand (src, mode));
14647 /* Preprocess the data for the operation as necessary. If the operation is
14648 a SET then emit a swap instruction and finish. */
14649 switch (code)
14651 case SET:
14652 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14653 return;
14655 case MINUS:
14656 /* Negate the value and treat it as a PLUS. */
14658 rtx neg_src;
14660 /* Resize the value if necessary. */
14661 if (short_mode)
14662 src = gen_lowpart (wmode, src);
14664 neg_src = gen_rtx_NEG (wmode, src);
14665 emit_insn (gen_rtx_SET (src, neg_src));
14667 if (short_mode)
14668 src = gen_lowpart (mode, src);
14670 /* Fall-through. */
14671 case PLUS:
14672 ldop_code = AARCH64_LDOP_PLUS;
14673 break;
14675 case IOR:
14676 ldop_code = AARCH64_LDOP_OR;
14677 break;
14679 case XOR:
14680 ldop_code = AARCH64_LDOP_XOR;
14681 break;
14683 case AND:
14685 rtx not_src;
14687 /* Resize the value if necessary. */
14688 if (short_mode)
14689 src = gen_lowpart (wmode, src);
14691 not_src = gen_rtx_NOT (wmode, src);
14692 emit_insn (gen_rtx_SET (src, not_src));
14694 if (short_mode)
14695 src = gen_lowpart (mode, src);
14697 ldop_code = AARCH64_LDOP_BIC;
14698 break;
14700 default:
14701 /* The operation can't be done with atomic instructions. */
14702 gcc_unreachable ();
14705 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14707 /* If necessary, calculate the data in memory after the update by redoing the
14708 operation from values in registers. */
14709 if (!out_result)
14710 return;
14712 if (short_mode)
14714 src = gen_lowpart (wmode, src);
14715 out_data = gen_lowpart (wmode, out_data);
14716 out_result = gen_lowpart (wmode, out_result);
14719 x = NULL_RTX;
14721 switch (code)
14723 case MINUS:
14724 case PLUS:
14725 x = gen_rtx_PLUS (wmode, out_data, src);
14726 break;
14727 case IOR:
14728 x = gen_rtx_IOR (wmode, out_data, src);
14729 break;
14730 case XOR:
14731 x = gen_rtx_XOR (wmode, out_data, src);
14732 break;
14733 case AND:
14734 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14735 return;
14736 default:
14737 gcc_unreachable ();
14740 emit_set_insn (out_result, x);
14742 return;
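/* Note on the AND case above: LSE provides no load-AND instruction, so SRC
   is inverted and the LDCLR (atomic BIC) form is used, relying on
   M & V == M & ~(~V).  When OUT_RESULT is requested it is then recomputed
   as out_data & ~SRC, and since SRC already holds ~VALUE this again equals
   out_data & VALUE.  */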
14745 /* Split an atomic operation. */
14747 void
14748 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14749 rtx value, rtx model_rtx, rtx cond)
14751 machine_mode mode = GET_MODE (mem);
14752 machine_mode wmode = (mode == DImode ? DImode : SImode);
14753 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14754 const bool is_sync = is_mm_sync (model);
14755 rtx_code_label *label;
14756 rtx x;
14758 /* Split the atomic operation into a sequence. */
14759 label = gen_label_rtx ();
14760 emit_label (label);
14762 if (new_out)
14763 new_out = gen_lowpart (wmode, new_out);
14764 if (old_out)
14765 old_out = gen_lowpart (wmode, old_out);
14766 else
14767 old_out = new_out;
14768 value = simplify_gen_subreg (wmode, value, mode, 0);
14770 /* The initial load can be relaxed for a __sync operation since a final
14771 barrier will be emitted to stop code hoisting. */
14772 if (is_sync)
14773 aarch64_emit_load_exclusive (mode, old_out, mem,
14774 GEN_INT (MEMMODEL_RELAXED));
14775 else
14776 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14778 switch (code)
14780 case SET:
14781 new_out = value;
14782 break;
14784 case NOT:
14785 x = gen_rtx_AND (wmode, old_out, value);
14786 emit_insn (gen_rtx_SET (new_out, x));
14787 x = gen_rtx_NOT (wmode, new_out);
14788 emit_insn (gen_rtx_SET (new_out, x));
14789 break;
14791 case MINUS:
14792 if (CONST_INT_P (value))
14794 value = GEN_INT (-INTVAL (value));
14795 code = PLUS;
14797 /* Fall through. */
14799 default:
14800 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14801 emit_insn (gen_rtx_SET (new_out, x));
14802 break;
14805 aarch64_emit_store_exclusive (mode, cond, mem,
14806 gen_lowpart (mode, new_out), model_rtx);
14808 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14809 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14810 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14811 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14813 /* Emit any final barrier needed for a __sync operation. */
14814 if (is_sync)
14815 aarch64_emit_post_barrier (model);
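/* The split above produces the usual load-exclusive/store-exclusive retry
   loop, roughly (a sketch, with the acquire/release forms chosen from
   MODEL_RTX):

     .Lretry:
       ld[a]xr   old, [mem]
       <op>      new, old, value
       st[l]xr   w_status, new, [mem]
       cbnz      w_status, .Lretry  */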
14818 static void
14819 aarch64_init_libfuncs (void)
14821 /* Half-precision float operations. The compiler handles all operations
14822 with NULL libfuncs by converting to SFmode. */
14824 /* Conversions. */
14825 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14826 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14828 /* Arithmetic. */
14829 set_optab_libfunc (add_optab, HFmode, NULL);
14830 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14831 set_optab_libfunc (smul_optab, HFmode, NULL);
14832 set_optab_libfunc (neg_optab, HFmode, NULL);
14833 set_optab_libfunc (sub_optab, HFmode, NULL);
14835 /* Comparisons. */
14836 set_optab_libfunc (eq_optab, HFmode, NULL);
14837 set_optab_libfunc (ne_optab, HFmode, NULL);
14838 set_optab_libfunc (lt_optab, HFmode, NULL);
14839 set_optab_libfunc (le_optab, HFmode, NULL);
14840 set_optab_libfunc (ge_optab, HFmode, NULL);
14841 set_optab_libfunc (gt_optab, HFmode, NULL);
14842 set_optab_libfunc (unord_optab, HFmode, NULL);
14845 /* Target hook for c_mode_for_suffix. */
14846 static machine_mode
14847 aarch64_c_mode_for_suffix (char suffix)
14849 if (suffix == 'q')
14850 return TFmode;
14852 return VOIDmode;
14855 /* We can only represent floating point constants which will fit in
14856 "quarter-precision" values. These values are characterised by
14857 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14860 (-1)^s * (n/16) * 2^r
14862 Where:
14863 's' is the sign bit.
14864 'n' is an integer in the range 16 <= n <= 31.
14865 'r' is an integer in the range -3 <= r <= 4. */
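/* For example, 1.0 is (16/16) * 2^0, 0.5 is (16/16) * 2^-1 and 10.0 is
   (20/16) * 2^3, so all three are representable; 0.0 is not, and the
   largest representable magnitude is (31/16) * 2^4 == 31.0.  */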
14867 /* Return true iff X can be represented by a quarter-precision
14868 floating point immediate operand. Note, we cannot represent 0.0. */
14869 bool
14870 aarch64_float_const_representable_p (rtx x)
14872 /* This represents our current view of how many bits
14873 make up the mantissa. */
14874 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14875 int exponent;
14876 unsigned HOST_WIDE_INT mantissa, mask;
14877 REAL_VALUE_TYPE r, m;
14878 bool fail;
14880 if (!CONST_DOUBLE_P (x))
14881 return false;
14883 /* We don't support HFmode constants yet. */
14884 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14885 return false;
14887 r = *CONST_DOUBLE_REAL_VALUE (x);
14889 /* We cannot represent infinities, NaNs or +/-zero. We won't
14890 know if we have +zero until we analyse the mantissa, but we
14891 can reject the other invalid values. */
14892 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14893 || REAL_VALUE_MINUS_ZERO (r))
14894 return false;
14896 /* Extract exponent. */
14897 r = real_value_abs (&r);
14898 exponent = REAL_EXP (&r);
14900 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14901 highest (sign) bit, with a fixed binary point at bit point_pos.
14902 m1 holds the low part of the mantissa, m2 the high part.
14903 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14904 bits for the mantissa, this can fail (low bits will be lost). */
14905 real_ldexp (&m, &r, point_pos - exponent);
14906 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14908 /* If the low part of the mantissa has bits set we cannot represent
14909 the value. */
14910 if (w.ulow () != 0)
14911 return false;
14912 /* We have rejected the lower HOST_WIDE_INT, so update our
14913 understanding of how many bits lie in the mantissa and
14914 look only at the high HOST_WIDE_INT. */
14915 mantissa = w.elt (1);
14916 point_pos -= HOST_BITS_PER_WIDE_INT;
14918 /* We can only represent values with a mantissa of the form 1.xxxx. */
14919 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14920 if ((mantissa & mask) != 0)
14921 return false;
14923 /* Having filtered unrepresentable values, we may now remove all
14924 but the highest 5 bits. */
14925 mantissa >>= point_pos - 5;
14927 /* We cannot represent the value 0.0, so reject it. This is handled
14928 elsewhere. */
14929 if (mantissa == 0)
14930 return false;
14932 /* Then, as bit 4 is always set, we can mask it off, leaving
14933 the mantissa in the range [0, 15]. */
14934 mantissa &= ~(1 << 4);
14935 gcc_assert (mantissa <= 15);
14937 /* GCC internally does not use IEEE754-like encoding (where normalized
14938 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14939 Our mantissa values are shifted 4 places to the left relative to
14940 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14941 by 5 places to correct for GCC's representation. */
14942 exponent = 5 - exponent;
14944 return (exponent >= 0 && exponent <= 7);
14947 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14948 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14949 output MOVI/MVNI, ORR or BIC immediate. */
14950 char*
14951 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14952 enum simd_immediate_check which)
14954 bool is_valid;
14955 static char templ[40];
14956 const char *mnemonic;
14957 const char *shift_op;
14958 unsigned int lane_count = 0;
14959 char element_char;
14961 struct simd_immediate_info info;
14963 /* This will return true to show const_vector is legal for use as either
14964 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14965 It will also update INFO to show how the immediate should be generated.
14966 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14967 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14968 gcc_assert (is_valid);
14970 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14971 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14973 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14975 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14976 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14977 move immediate path. */
14978 if (aarch64_float_const_zero_rtx_p (info.value))
14979 info.value = GEN_INT (0);
14980 else
14982 const unsigned int buf_size = 20;
14983 char float_buf[buf_size] = {'\0'};
14984 real_to_decimal_for_mode (float_buf,
14985 CONST_DOUBLE_REAL_VALUE (info.value),
14986 buf_size, buf_size, 1, info.elt_mode);
14988 if (lane_count == 1)
14989 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14990 else
14991 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14992 lane_count, element_char, float_buf);
14993 return templ;
14997 gcc_assert (CONST_INT_P (info.value));
14999 if (which == AARCH64_CHECK_MOV)
15001 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15002 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15003 if (lane_count == 1)
15004 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15005 mnemonic, UINTVAL (info.value));
15006 else if (info.shift)
15007 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15008 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15009 element_char, UINTVAL (info.value), shift_op, info.shift);
15010 else
15011 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15012 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15013 element_char, UINTVAL (info.value));
15015 else
15017 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15018 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15019 if (info.shift)
15020 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15021 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15022 element_char, UINTVAL (info.value), "lsl", info.shift);
15023 else
15024 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15025 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15026 element_char, UINTVAL (info.value));
15028 return templ;
15031 char*
15032 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15035 /* If a floating-point number was passed and we want to use it in an
15036 integer mode, do the conversion to integer. */
15037 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15039 unsigned HOST_WIDE_INT ival;
15040 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15041 gcc_unreachable ();
15042 immediate = gen_int_mode (ival, mode);
15045 machine_mode vmode;
15046 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
15047 a 128-bit vector mode. */
15048 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15050 vmode = aarch64_simd_container_mode (mode, width);
15051 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15052 return aarch64_output_simd_mov_immediate (v_op, width);
15055 /* Return the output string to use for moving immediate CONST_VECTOR
15056 into an SVE register. */
15058 char *
15059 aarch64_output_sve_mov_immediate (rtx const_vector)
15061 static char templ[40];
15062 struct simd_immediate_info info;
15063 char element_char;
15065 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15066 gcc_assert (is_valid);
15068 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15070 if (info.step)
15072 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15073 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15074 element_char, INTVAL (info.value), INTVAL (info.step));
15075 return templ;
15078 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15080 if (aarch64_float_const_zero_rtx_p (info.value))
15081 info.value = GEN_INT (0);
15082 else
15084 const int buf_size = 20;
15085 char float_buf[buf_size] = {};
15086 real_to_decimal_for_mode (float_buf,
15087 CONST_DOUBLE_REAL_VALUE (info.value),
15088 buf_size, buf_size, 1, info.elt_mode);
15090 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15091 element_char, float_buf);
15092 return templ;
15096 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15097 element_char, INTVAL (info.value));
15098 return templ;
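/* For instance, a byte-element series starting at 0 with step 1 comes out
   as "index\t%0.b, #0, #1", while an integer splat of 7 would use the
   "mov" form above.  */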
15101 /* Return the asm format for a PTRUE instruction whose destination has
15102 mode MODE. SUFFIX is the element size suffix. */
15104 char *
15105 aarch64_output_ptrue (machine_mode mode, char suffix)
15107 unsigned int nunits;
15108 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15109 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15110 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15111 else
15112 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15113 return buf;
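/* So, for example, with SUFFIX == 'b' a fixed-length 16-element predicate
   produces "ptrue\t%0.b, vl16", while a variable-length one produces
   "ptrue\t%0.b, all".  */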
15116 /* Split operands into moves from op[1] + op[2] into op[0]. */
15118 void
15119 aarch64_split_combinev16qi (rtx operands[3])
15121 unsigned int dest = REGNO (operands[0]);
15122 unsigned int src1 = REGNO (operands[1]);
15123 unsigned int src2 = REGNO (operands[2]);
15124 machine_mode halfmode = GET_MODE (operands[1]);
15125 unsigned int halfregs = REG_NREGS (operands[1]);
15126 rtx destlo, desthi;
15128 gcc_assert (halfmode == V16QImode);
15130 if (src1 == dest && src2 == dest + halfregs)
15132 /* No-op move. Can't split to nothing; emit something. */
15133 emit_note (NOTE_INSN_DELETED);
15134 return;
15137 /* Preserve register attributes for variable tracking. */
15138 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15139 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15140 GET_MODE_SIZE (halfmode));
15142 /* Special case of reversed high/low parts. */
15143 if (reg_overlap_mentioned_p (operands[2], destlo)
15144 && reg_overlap_mentioned_p (operands[1], desthi))
15146 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15147 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15148 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15150 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15152 /* Try to avoid unnecessary moves if part of the result
15153 is in the right place already. */
15154 if (src1 != dest)
15155 emit_move_insn (destlo, operands[1]);
15156 if (src2 != dest + halfregs)
15157 emit_move_insn (desthi, operands[2]);
15159 else
15161 if (src2 != dest + halfregs)
15162 emit_move_insn (desthi, operands[2]);
15163 if (src1 != dest)
15164 emit_move_insn (destlo, operands[1]);
15168 /* vec_perm support. */
15170 struct expand_vec_perm_d
15172 rtx target, op0, op1;
15173 vec_perm_indices perm;
15174 machine_mode vmode;
15175 unsigned int vec_flags;
15176 bool one_vector_p;
15177 bool testing_p;
15180 /* Generate a variable permutation. */
15182 static void
15183 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15185 machine_mode vmode = GET_MODE (target);
15186 bool one_vector_p = rtx_equal_p (op0, op1);
15188 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15189 gcc_checking_assert (GET_MODE (op0) == vmode);
15190 gcc_checking_assert (GET_MODE (op1) == vmode);
15191 gcc_checking_assert (GET_MODE (sel) == vmode);
15192 gcc_checking_assert (TARGET_SIMD);
15194 if (one_vector_p)
15196 if (vmode == V8QImode)
15198 /* Expand the argument to a V16QI mode by duplicating it. */
15199 rtx pair = gen_reg_rtx (V16QImode);
15200 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15201 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15203 else
15205 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15208 else
15210 rtx pair;
15212 if (vmode == V8QImode)
15214 pair = gen_reg_rtx (V16QImode);
15215 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15216 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15218 else
15220 pair = gen_reg_rtx (OImode);
15221 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15222 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15227 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15228 NELT is the number of elements in the vector. */
15230 void
15231 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15232 unsigned int nelt)
15234 machine_mode vmode = GET_MODE (target);
15235 bool one_vector_p = rtx_equal_p (op0, op1);
15236 rtx mask;
15238 /* The TBL instruction does not use a modulo index, so we must take care
15239 of that ourselves. */
15240 mask = aarch64_simd_gen_const_vector_dup (vmode,
15241 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15242 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15244 /* For big-endian, we also need to reverse the index within the vector
15245 (but not which vector). */
15246 if (BYTES_BIG_ENDIAN)
15248 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15249 if (!one_vector_p)
15250 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15251 sel = expand_simple_binop (vmode, XOR, sel, mask,
15252 NULL, 0, OPTAB_LIB_WIDEN);
15254 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
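/* As an illustration: for a two-input V16QImode permute the indices are
   first ANDed with 31 to recover TBL's expected modulo behaviour, and on
   big-endian they are additionally XORed with 15, which reverses the lane
   number within each input vector while leaving the "which vector" bit
   untouched.  */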
15257 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15259 static void
15260 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15262 emit_insn (gen_rtx_SET (target,
15263 gen_rtx_UNSPEC (GET_MODE (target),
15264 gen_rtvec (2, op0, op1), code)));
15267 /* Expand an SVE vec_perm with the given operands. */
15269 void
15270 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15272 machine_mode data_mode = GET_MODE (target);
15273 machine_mode sel_mode = GET_MODE (sel);
15274 /* Enforced by the pattern condition. */
15275 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15277 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15278 size of the two value vectors, i.e. the upper bits of the indices
15279 are effectively ignored. SVE TBL instead produces 0 for any
15280 out-of-range indices, so we need to modulo all the vec_perm indices
15281 to ensure they are all in range. */
15282 rtx sel_reg = force_reg (sel_mode, sel);
15284 /* Check if the sel only references the first values vector. */
15285 if (GET_CODE (sel) == CONST_VECTOR
15286 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15288 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15289 return;
15292 /* Check if the two values vectors are the same. */
15293 if (rtx_equal_p (op0, op1))
15295 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15296 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15297 NULL, 0, OPTAB_DIRECT);
15298 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15299 return;
15302 /* Run TBL on each value vector and combine the results. */
15304 rtx res0 = gen_reg_rtx (data_mode);
15305 rtx res1 = gen_reg_rtx (data_mode);
15306 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15307 if (GET_CODE (sel) != CONST_VECTOR
15308 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15310 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15311 2 * nunits - 1);
15312 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15313 NULL, 0, OPTAB_DIRECT);
15315 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15316 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15317 NULL, 0, OPTAB_DIRECT);
15318 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15319 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15320 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15321 else
15322 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15325 /* Recognize patterns suitable for the TRN instructions. */
15326 static bool
15327 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15329 HOST_WIDE_INT odd;
15330 poly_uint64 nelt = d->perm.length ();
15331 rtx out, in0, in1, x;
15332 machine_mode vmode = d->vmode;
15334 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15335 return false;
15337 /* Note that these are little-endian tests.
15338 We correct for big-endian later. */
15339 if (!d->perm[0].is_constant (&odd)
15340 || (odd != 0 && odd != 1)
15341 || !d->perm.series_p (0, 2, odd, 2)
15342 || !d->perm.series_p (1, 2, nelt + odd, 2))
15343 return false;
15345 /* Success! */
15346 if (d->testing_p)
15347 return true;
15349 in0 = d->op0;
15350 in1 = d->op1;
15351 /* We don't need a big-endian lane correction for SVE; see the comment
15352 at the head of aarch64-sve.md for details. */
15353 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15355 x = in0, in0 = in1, in1 = x;
15356 odd = !odd;
15358 out = d->target;
15360 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15361 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15362 return true;
15365 /* Recognize patterns suitable for the UZP instructions. */
15366 static bool
15367 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15369 HOST_WIDE_INT odd;
15370 rtx out, in0, in1, x;
15371 machine_mode vmode = d->vmode;
15373 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15374 return false;
15376 /* Note that these are little-endian tests.
15377 We correct for big-endian later. */
15378 if (!d->perm[0].is_constant (&odd)
15379 || (odd != 0 && odd != 1)
15380 || !d->perm.series_p (0, 1, odd, 2))
15381 return false;
15383 /* Success! */
15384 if (d->testing_p)
15385 return true;
15387 in0 = d->op0;
15388 in1 = d->op1;
15389 /* We don't need a big-endian lane correction for SVE; see the comment
15390 at the head of aarch64-sve.md for details. */
15391 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15393 x = in0, in0 = in1, in1 = x;
15394 odd = !odd;
15396 out = d->target;
15398 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15399 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15400 return true;
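/* For example, with V4SI chosen purely for illustration:

     UZP1: { 0, 2, 4, 6 }   (odd == 0)
     UZP2: { 1, 3, 5, 7 }   (odd == 1)  */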
15403 /* Recognize patterns suitable for the ZIP instructions. */
15404 static bool
15405 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15407 unsigned int high;
15408 poly_uint64 nelt = d->perm.length ();
15409 rtx out, in0, in1, x;
15410 machine_mode vmode = d->vmode;
15412 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15413 return false;
15415 /* Note that these are little-endian tests.
15416 We correct for big-endian later. */
15417 poly_uint64 first = d->perm[0];
15418 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15419 || !d->perm.series_p (0, 2, first, 1)
15420 || !d->perm.series_p (1, 2, first + nelt, 1))
15421 return false;
15422 high = maybe_ne (first, 0U);
15424 /* Success! */
15425 if (d->testing_p)
15426 return true;
15428 in0 = d->op0;
15429 in1 = d->op1;
15430 /* We don't need a big-endian lane correction for SVE; see the comment
15431 at the head of aarch64-sve.md for details. */
15432 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15434 x = in0, in0 = in1, in1 = x;
15435 high = !high;
15437 out = d->target;
15439 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15440 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15441 return true;
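/* For example, with V4SI chosen purely for illustration:

     ZIP1: { 0, 4, 1, 5 }   (first == 0)
     ZIP2: { 2, 6, 3, 7 }   (first == nelt / 2)  */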
15444 /* Recognize patterns for the EXT insn. */
15446 static bool
15447 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15449 HOST_WIDE_INT location;
15450 rtx offset;
15452 /* The first element always refers to the first vector.
15453 Check if the extracted indices are increasing by one. */
15454 if (d->vec_flags == VEC_SVE_PRED
15455 || !d->perm[0].is_constant (&location)
15456 || !d->perm.series_p (0, 1, location, 1))
15457 return false;
15459 /* Success! */
15460 if (d->testing_p)
15461 return true;
15463 /* The case where (location == 0) is a no-op for both big- and little-endian,
15464 and is removed by the mid-end at optimization levels -O1 and higher.
15466 We don't need a big-endian lane correction for SVE; see the comment
15467 at the head of aarch64-sve.md for details. */
15468 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15470 /* After setup, we want the high elements of the first vector (stored
15471 at the LSB end of the register), and the low elements of the second
15472 vector (stored at the MSB end of the register). So swap. */
15473 std::swap (d->op0, d->op1);
15474 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15475 to_constant () is safe since this is restricted to Advanced SIMD
15476 vectors. */
15477 location = d->perm.length ().to_constant () - location;
15480 offset = GEN_INT (location);
15481 emit_set_insn (d->target,
15482 gen_rtx_UNSPEC (d->vmode,
15483 gen_rtvec (3, d->op0, d->op1, offset),
15484 UNSPEC_EXT));
15485 return true;
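/* For example, with V4SI chosen purely for illustration, the
   permutation { 1, 2, 3, 4 } takes the last three elements of the
   first vector followed by the first element of the second vector;
   it is matched above with location == 1.  */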
15488 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15489 within each 64-bit, 32-bit or 16-bit granule. */
15491 static bool
15492 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15494 HOST_WIDE_INT diff;
15495 unsigned int i, size, unspec;
15496 machine_mode pred_mode;
15498 if (d->vec_flags == VEC_SVE_PRED
15499 || !d->one_vector_p
15500 || !d->perm[0].is_constant (&diff))
15501 return false;
15503 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15504 if (size == 8)
15506 unspec = UNSPEC_REV64;
15507 pred_mode = VNx2BImode;
15509 else if (size == 4)
15511 unspec = UNSPEC_REV32;
15512 pred_mode = VNx4BImode;
15514 else if (size == 2)
15516 unspec = UNSPEC_REV16;
15517 pred_mode = VNx8BImode;
15519 else
15520 return false;
15522 unsigned int step = diff + 1;
15523 for (i = 0; i < step; ++i)
15524 if (!d->perm.series_p (i, step, diff - i, step))
15525 return false;
15527 /* Success! */
15528 if (d->testing_p)
15529 return true;
15531 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15532 if (d->vec_flags == VEC_SVE_DATA)
15534 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15535 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15536 UNSPEC_MERGE_PTRUE);
15538 emit_set_insn (d->target, src);
15539 return true;
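/* For example, with V8QI chosen purely for illustration, the
   permutation { 3, 2, 1, 0, 7, 6, 5, 4 } has diff == 3 and a unit
   size of one byte, so size == 4 and it is matched as REV32
   (reverse the bytes within each 32-bit granule).  */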
15542 /* Recognize patterns for the REV insn, which reverses elements within
15543 a full vector. */
15545 static bool
15546 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15548 poly_uint64 nelt = d->perm.length ();
15550 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15551 return false;
15553 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15554 return false;
15556 /* Success! */
15557 if (d->testing_p)
15558 return true;
15560 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15561 emit_set_insn (d->target, src);
15562 return true;
15565 static bool
15566 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15568 rtx out = d->target;
15569 rtx in0;
15570 HOST_WIDE_INT elt;
15571 machine_mode vmode = d->vmode;
15572 rtx lane;
15574 if (d->vec_flags == VEC_SVE_PRED
15575 || d->perm.encoding ().encoded_nelts () != 1
15576 || !d->perm[0].is_constant (&elt))
15577 return false;
15579 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15580 return false;
15582 /* Success! */
15583 if (d->testing_p)
15584 return true;
15586 /* The generic preparation in aarch64_expand_vec_perm_const_1
15587 swaps the operand order and the permute indices if it finds
15588 d->perm[0] to be in the second operand. Thus, we can always
15589 use d->op0 and need not do any extra arithmetic to get the
15590 correct lane number. */
15591 in0 = d->op0;
15592 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15594 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15595 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15596 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15597 return true;
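/* For example, a permutation whose encoding is the single repeated
   index { 2, 2, 2, ... } is accepted above and becomes a DUP of
   lane 2 of the first operand.  */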
15600 static bool
15601 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15603 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15604 machine_mode vmode = d->vmode;
15606 /* Make sure that the indices are constant. */
15607 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15608 for (unsigned int i = 0; i < encoded_nelts; ++i)
15609 if (!d->perm[i].is_constant ())
15610 return false;
15612 if (d->testing_p)
15613 return true;
15615 /* Generic code will try constant permutation twice: once with the
15616 original mode and again with the elements lowered to QImode.
15617 So wait and don't do the selector expansion ourselves. */
15618 if (vmode != V8QImode && vmode != V16QImode)
15619 return false;
15621 /* to_constant is safe since this routine is specific to Advanced SIMD
15622 vectors. */
15623 unsigned int nelt = d->perm.length ().to_constant ();
15624 for (unsigned int i = 0; i < nelt; ++i)
15625 /* If big-endian and two vectors we end up with a weird mixed-endian
15626 mode on NEON. Reverse the index within each word but not the word
15627 itself. to_constant is safe because we checked is_constant above. */
15628 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15629 ? d->perm[i].to_constant () ^ (nelt - 1)
15630 : d->perm[i].to_constant ());
15632 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15633 sel = force_reg (vmode, sel);
15635 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15636 return true;
15639 /* Try to implement D using an SVE TBL instruction. */
15641 static bool
15642 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15644 unsigned HOST_WIDE_INT nelt;
15646 /* Permuting two variable-length vectors could overflow the
15647 index range. */
15648 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15649 return false;
15651 if (d->testing_p)
15652 return true;
15654 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15655 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15656 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15657 return true;
15660 static bool
15661 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15663 /* The pattern matching functions above are written to look for a small
15664 number to begin the sequence (0, 1, N/2). If we begin with an index
15665 from the second operand, we can swap the operands. */
15666 poly_int64 nelt = d->perm.length ();
15667 if (known_ge (d->perm[0], nelt))
15669 d->perm.rotate_inputs (1);
15670 std::swap (d->op0, d->op1);
15673 if ((d->vec_flags == VEC_ADVSIMD
15674 || d->vec_flags == VEC_SVE_DATA
15675 || d->vec_flags == VEC_SVE_PRED)
15676 && known_gt (nelt, 1))
15678 if (aarch64_evpc_rev_local (d))
15679 return true;
15680 else if (aarch64_evpc_rev_global (d))
15681 return true;
15682 else if (aarch64_evpc_ext (d))
15683 return true;
15684 else if (aarch64_evpc_dup (d))
15685 return true;
15686 else if (aarch64_evpc_zip (d))
15687 return true;
15688 else if (aarch64_evpc_uzp (d))
15689 return true;
15690 else if (aarch64_evpc_trn (d))
15691 return true;
15692 if (d->vec_flags == VEC_SVE_DATA)
15693 return aarch64_evpc_sve_tbl (d);
15694 else if (d->vec_flags == VEC_ADVSIMD)
15695 return aarch64_evpc_tbl (d);
15697 return false;
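/* As an example of the input canonicalization above, with nelt == 4
   the permutation { 4, 1, 5, 2 } starts with an index into the second
   operand; rotating the inputs turns it into { 0, 5, 1, 6 } and
   swapping op0/op1 leaves the overall selection unchanged, so the
   matchers only ever see a first index that refers to op0.  */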
15700 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15702 static bool
15703 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15704 rtx op1, const vec_perm_indices &sel)
15706 struct expand_vec_perm_d d;
15708 /* Check whether the mask can be applied to a single vector. */
15709 if (op0 && rtx_equal_p (op0, op1))
15710 d.one_vector_p = true;
15711 else if (sel.all_from_input_p (0))
15713 d.one_vector_p = true;
15714 op1 = op0;
15716 else if (sel.all_from_input_p (1))
15718 d.one_vector_p = true;
15719 op0 = op1;
15721 else
15722 d.one_vector_p = false;
15724 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15725 sel.nelts_per_input ());
15726 d.vmode = vmode;
15727 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15728 d.target = target;
15729 d.op0 = op0;
15730 d.op1 = op1;
15731 d.testing_p = !target;
15733 if (!d.testing_p)
15734 return aarch64_expand_vec_perm_const_1 (&d);
15736 rtx_insn *last = get_last_insn ();
15737 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15738 gcc_assert (last == get_last_insn ());
15740 return ret;
15743 /* Generate a byte permute mask for a register of mode MODE,
15744 which has NUNITS units. */
15746 rtx
15747 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15749 /* We have to reverse each vector because we don't have
15750 a permuted load that can reverse-load according to ABI rules. */
15751 rtx mask;
15752 rtvec v = rtvec_alloc (16);
15753 unsigned int i, j;
15754 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15756 gcc_assert (BYTES_BIG_ENDIAN);
15757 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15759 for (i = 0; i < nunits; i++)
15760 for (j = 0; j < usize; j++)
15761 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15762 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15763 return force_reg (V16QImode, mask);
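/* For example, for V4SI (a unit size of four bytes) the mask built
   above is { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }:
   the bytes are reversed within each element while the elements
   themselves stay in place.  */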
15766 /* Return true if X is a valid second operand for the SVE instruction
15767 that implements integer comparison OP_CODE. */
15769 static bool
15770 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15772 if (register_operand (x, VOIDmode))
15773 return true;
15775 switch (op_code)
15777 case LTU:
15778 case LEU:
15779 case GEU:
15780 case GTU:
15781 return aarch64_sve_cmp_immediate_p (x, false);
15782 case LT:
15783 case LE:
15784 case GE:
15785 case GT:
15786 case NE:
15787 case EQ:
15788 return aarch64_sve_cmp_immediate_p (x, true);
15789 default:
15790 gcc_unreachable ();
15794 /* Use predicated SVE instructions to implement the equivalent of:
15796 (set TARGET OP)
15798 given that PTRUE is an all-true predicate of the appropriate mode. */
15800 static void
15801 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15803 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15804 gen_rtvec (2, ptrue, op),
15805 UNSPEC_MERGE_PTRUE);
15806 rtx_insn *insn = emit_set_insn (target, unspec);
15807 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15810 /* Likewise, but also clobber the condition codes. */
15812 static void
15813 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15815 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15816 gen_rtvec (2, ptrue, op),
15817 UNSPEC_MERGE_PTRUE);
15818 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15819 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15822 /* Return the UNSPEC_COND_* code for comparison CODE. */
15824 static unsigned int
15825 aarch64_unspec_cond_code (rtx_code code)
15827 switch (code)
15829 case NE:
15830 return UNSPEC_COND_NE;
15831 case EQ:
15832 return UNSPEC_COND_EQ;
15833 case LT:
15834 return UNSPEC_COND_LT;
15835 case GT:
15836 return UNSPEC_COND_GT;
15837 case LE:
15838 return UNSPEC_COND_LE;
15839 case GE:
15840 return UNSPEC_COND_GE;
15841 default:
15842 gcc_unreachable ();
15846 /* Emit:
15848 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15850 where <X> is the operation associated with comparison CODE. This form
15851 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15852 semantics, such as when PRED might not be all-true and when comparing
15853 inactive lanes could have side effects. */
15855 static void
15856 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15857 rtx pred, rtx op0, rtx op1)
15859 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15860 gen_rtvec (3, pred, op0, op1),
15861 aarch64_unspec_cond_code (code));
15862 emit_set_insn (target, unspec);
15865 /* Expand an SVE integer comparison using the SVE equivalent of:
15867 (set TARGET (CODE OP0 OP1)). */
15869 void
15870 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15872 machine_mode pred_mode = GET_MODE (target);
15873 machine_mode data_mode = GET_MODE (op0);
15875 if (!aarch64_sve_cmp_operand_p (code, op1))
15876 op1 = force_reg (data_mode, op1);
15878 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15879 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15880 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15883 /* Emit the SVE equivalent of:
15885 (set TMP1 (CODE1 OP0 OP1))
15886 (set TMP2 (CODE2 OP0 OP1))
15887 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15889 PTRUE is an all-true predicate with the same mode as TARGET. */
15891 static void
15892 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15893 rtx ptrue, rtx op0, rtx op1)
15895 machine_mode pred_mode = GET_MODE (ptrue);
15896 rtx tmp1 = gen_reg_rtx (pred_mode);
15897 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15898 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15899 rtx tmp2 = gen_reg_rtx (pred_mode);
15900 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15901 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15902 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15905 /* Emit the SVE equivalent of:
15907 (set TMP (CODE OP0 OP1))
15908 (set TARGET (not TMP))
15910 PTRUE is an all-true predicate with the same mode as TARGET. */
15912 static void
15913 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15914 rtx op0, rtx op1)
15916 machine_mode pred_mode = GET_MODE (ptrue);
15917 rtx tmp = gen_reg_rtx (pred_mode);
15918 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15919 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15920 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15923 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15925 (set TARGET (CODE OP0 OP1))
15927 If CAN_INVERT_P is true, the caller can also handle inverted results;
15928 return true if the result is in fact inverted. */
15930 bool
15931 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15932 rtx op0, rtx op1, bool can_invert_p)
15934 machine_mode pred_mode = GET_MODE (target);
15935 machine_mode data_mode = GET_MODE (op0);
15937 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15938 switch (code)
15940 case UNORDERED:
15941 /* UNORDERED has no immediate form. */
15942 op1 = force_reg (data_mode, op1);
15943 /* fall through */
15944 case LT:
15945 case LE:
15946 case GT:
15947 case GE:
15948 case EQ:
15949 case NE:
15951 /* There is native support for the comparison. */
15952 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15953 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15954 return false;
15957 case LTGT:
15958 /* This is a trapping operation (LT or GT). */
15959 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15960 return false;
15962 case UNEQ:
15963 if (!flag_trapping_math)
15965 /* This would trap for signaling NaNs. */
15966 op1 = force_reg (data_mode, op1);
15967 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15968 return false;
15970 /* fall through */
15971 case UNLT:
15972 case UNLE:
15973 case UNGT:
15974 case UNGE:
15975 if (flag_trapping_math)
15977 /* Work out which elements are ordered. */
15978 rtx ordered = gen_reg_rtx (pred_mode);
15979 op1 = force_reg (data_mode, op1);
15980 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15982 /* Test the opposite condition for the ordered elements,
15983 then invert the result. */
15984 if (code == UNEQ)
15985 code = NE;
15986 else
15987 code = reverse_condition_maybe_unordered (code);
15988 if (can_invert_p)
15990 aarch64_emit_sve_predicated_cond (target, code,
15991 ordered, op0, op1);
15992 return true;
15994 rtx tmp = gen_reg_rtx (pred_mode);
15995 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15996 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15997 return false;
15999 break;
16001 case ORDERED:
16002 /* ORDERED has no immediate form. */
16003 op1 = force_reg (data_mode, op1);
16004 break;
16006 default:
16007 gcc_unreachable ();
16010 /* There is native support for the inverse comparison. */
16011 code = reverse_condition_maybe_unordered (code);
16012 if (can_invert_p)
16014 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16015 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16016 return true;
16018 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16019 return false;
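/* As a concrete walkthrough of the trapping-math path above: for UNLT
   we first compute the ORDERED lanes by inverting an UNORDERED
   comparison, then perform a GE comparison (the reverse of UNLT)
   predicated on those lanes only, and finally either report the
   result as inverted (when CAN_INVERT_P) or invert it explicitly.  */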
16022 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16023 of the data being selected and CMP_MODE is the mode of the values being
16024 compared. */
16026 void
16027 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16028 rtx *ops)
16030 machine_mode pred_mode
16031 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16032 GET_MODE_SIZE (cmp_mode)).require ();
16033 rtx pred = gen_reg_rtx (pred_mode);
16034 if (FLOAT_MODE_P (cmp_mode))
16036 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16037 ops[4], ops[5], true))
16038 std::swap (ops[1], ops[2]);
16040 else
16041 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16043 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16044 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16047 /* Prepare a cond_<optab><mode> operation that has the operands
16048 given by OPERANDS, where:
16050 - operand 0 is the destination
16051 - operand 1 is a predicate
16052 - operands 2 to NOPS - 2 are the operands to an operation that is
16053 performed for active lanes
16054 - operand NOPS - 1 specifies the values to use for inactive lanes.
16056 COMMUTATIVE_P is true if operands 2 and 3 are commutative. In that case,
16057 no pattern is provided for a tie between operands 3 and NOPS - 1. */
16059 void
16060 aarch64_sve_prepare_conditional_op (rtx *operands, unsigned int nops,
16061 bool commutative_p)
16063 /* We can do the operation directly if the "else" value matches one
16064 of the other inputs. */
16065 for (unsigned int i = 2; i < nops - 1; ++i)
16066 if (rtx_equal_p (operands[i], operands[nops - 1]))
16068 if (i == 3 && commutative_p)
16069 std::swap (operands[2], operands[3]);
16070 return;
16073 /* If the "else" value is different from the other operands, we have
16074 the choice of doing a SEL on the output or a SEL on an input.
16075 Neither choice is better in all cases, but one advantage of
16076 selecting the input is that it can avoid a move when the output
16077 needs to be distinct from the inputs. E.g. if operand N maps to
16078 register N, selecting the output would give:
16080 MOVPRFX Z0.S, Z2.S
16081 ADD Z0.S, P1/M, Z0.S, Z3.S
16082 SEL Z0.S, P1, Z0.S, Z4.S
16084 whereas selecting the input avoids the MOVPRFX:
16086 SEL Z0.S, P1, Z2.S, Z4.S
16087 ADD Z0.S, P1/M, Z0.S, Z3.S. */
16088 machine_mode mode = GET_MODE (operands[0]);
16089 rtx temp = gen_reg_rtx (mode);
16090 rtvec vec = gen_rtvec (3, operands[1], operands[2], operands[nops - 1]);
16091 emit_set_insn (temp, gen_rtx_UNSPEC (mode, vec, UNSPEC_SEL));
16092 operands[2] = operands[nops - 1] = temp;
16095 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16096 true. However due to issues with register allocation it is preferable
16097 to avoid tying integer scalar and FP scalar modes. Executing integer
16098 operations in general registers is better than treating them as scalar
16099 vector operations. This reduces latency and avoids redundant int<->FP
16100 moves. So tie modes if they are either the same class, or vector modes
16101 with other vector modes, vector structs or any scalar mode. */
16103 static bool
16104 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16106 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16107 return true;
16109 /* We specifically want to allow elements of "structure" modes to
16110 be tieable to the structure. This more general condition allows
16111 other rarer situations too. The reason we don't extend this to
16112 predicate modes is that there are no predicate structure modes
16113 nor any specific instructions for extracting part of a predicate
16114 register. */
16115 if (aarch64_vector_data_mode_p (mode1)
16116 && aarch64_vector_data_mode_p (mode2))
16117 return true;
16119 /* Also allow any scalar modes with vectors. */
16120 if (aarch64_vector_mode_supported_p (mode1)
16121 || aarch64_vector_mode_supported_p (mode2))
16122 return true;
16124 return false;
16127 /* Return a new RTX holding the result of moving POINTER forward by
16128 AMOUNT bytes. */
16130 static rtx
16131 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16133 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16135 return adjust_automodify_address (pointer, GET_MODE (pointer),
16136 next, amount);
16139 /* Return a new RTX holding the result of moving POINTER forward by the
16140 size of the mode it points to. */
16142 static rtx
16143 aarch64_progress_pointer (rtx pointer)
16145 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16148 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16149 MODE bytes. */
16151 static void
16152 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16153 machine_mode mode)
16155 rtx reg = gen_reg_rtx (mode);
16157 /* "Cast" the pointers to the correct mode. */
16158 *src = adjust_address (*src, mode, 0);
16159 *dst = adjust_address (*dst, mode, 0);
16160 /* Emit the memcpy. */
16161 emit_move_insn (reg, *src);
16162 emit_move_insn (*dst, reg);
16163 /* Move the pointers forward. */
16164 *src = aarch64_progress_pointer (*src);
16165 *dst = aarch64_progress_pointer (*dst);
16168 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16169 we succeed, otherwise return false. */
16171 bool
16172 aarch64_expand_movmem (rtx *operands)
16174 unsigned int n;
16175 rtx dst = operands[0];
16176 rtx src = operands[1];
16177 rtx base;
16178 bool speed_p = !optimize_function_for_size_p (cfun);
16180 /* When optimizing for size, give a better estimate of the length of a
16181 memcpy call, but use the default otherwise. */
16182 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16184 /* We can't do anything smart if the amount to copy is not constant. */
16185 if (!CONST_INT_P (operands[2]))
16186 return false;
16188 n = UINTVAL (operands[2]);
16190 /* Try to keep the number of instructions low. For cases below 16 bytes we
16191 need to make at most two moves. For cases above 16 bytes it will be one
16192 move for each 16-byte chunk, then at most two additional moves.
16193 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16194 return false;
16196 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16197 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16199 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16200 src = adjust_automodify_address (src, VOIDmode, base, 0);
16202 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16203 1-byte chunk. */
16204 if (n < 4)
16206 if (n >= 2)
16208 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16209 n -= 2;
16212 if (n == 1)
16213 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16215 return true;
16218 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16219 4-byte chunk, partially overlapping with the previously copied chunk. */
16220 if (n < 8)
16222 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16223 n -= 4;
16224 if (n > 0)
16226 int move = n - 4;
16228 src = aarch64_move_pointer (src, move);
16229 dst = aarch64_move_pointer (dst, move);
16230 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16232 return true;
16235 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16236 them, then (if applicable) an 8-byte chunk. */
16237 while (n >= 8)
16239 if (n / 16)
16241 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16242 n -= 16;
16244 else
16246 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16247 n -= 8;
16251 /* Finish the final bytes of the copy. We can always do this in one
16252 instruction. We either copy the exact amount we need, or partially
16253 overlap with the previous chunk we copied and copy 8 bytes. */
16254 if (n == 0)
16255 return true;
16256 else if (n == 1)
16257 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16258 else if (n == 2)
16259 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16260 else if (n == 4)
16261 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16262 else
16264 if (n == 3)
16266 src = aarch64_move_pointer (src, -1);
16267 dst = aarch64_move_pointer (dst, -1);
16268 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16270 else
16272 int move = n - 8;
16274 src = aarch64_move_pointer (src, move);
16275 dst = aarch64_move_pointer (dst, move);
16276 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16280 return true;
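/* As a worked example, a 27-byte copy expands to one 16-byte (TImode)
   chunk and one 8-byte (DImode) chunk, leaving n == 3; the tail is
   then handled by stepping both pointers back one byte and copying a
   final 4-byte (SImode) chunk that overlaps the previous data by one
   byte.  */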
16283 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16284 SImode stores. Handle the case when the constant has identical
16285 bottom and top halves. This is beneficial when the two stores can be
16286 merged into an STP and we avoid synthesising potentially expensive
16287 immediates twice. Return true if such a split is possible. */
16289 bool
16290 aarch64_split_dimode_const_store (rtx dst, rtx src)
16292 rtx lo = gen_lowpart (SImode, src);
16293 rtx hi = gen_highpart_mode (SImode, DImode, src);
16295 bool size_p = optimize_function_for_size_p (cfun);
16297 if (!rtx_equal_p (lo, hi))
16298 return false;
16300 unsigned int orig_cost
16301 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16302 unsigned int lo_cost
16303 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16305 /* We want to transform:
16306 MOV x1, 49370
16307 MOVK x1, 0x140, lsl 16
16308 MOVK x1, 0xc0da, lsl 32
16309 MOVK x1, 0x140, lsl 48
16310 STR x1, [x0]
16311 into:
16312 MOV w1, 49370
16313 MOVK w1, 0x140, lsl 16
16314 STP w1, w1, [x0]
16315 So we want to perform this only when we save two instructions
16316 or more. When optimizing for size, however, accept any code size
16317 savings we can. */
16318 if (size_p && orig_cost <= lo_cost)
16319 return false;
16321 if (!size_p
16322 && (orig_cost <= lo_cost + 1))
16323 return false;
16325 rtx mem_lo = adjust_address (dst, SImode, 0);
16326 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16327 return false;
16329 rtx tmp_reg = gen_reg_rtx (SImode);
16330 aarch64_expand_mov_immediate (tmp_reg, lo);
16331 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16332 /* Don't emit an explicit store pair as this may not be always profitable.
16333 Let the sched-fusion logic decide whether to merge them. */
16334 emit_move_insn (mem_lo, tmp_reg);
16335 emit_move_insn (mem_hi, tmp_reg);
16337 return true;
16340 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16342 static unsigned HOST_WIDE_INT
16343 aarch64_asan_shadow_offset (void)
16345 return (HOST_WIDE_INT_1 << 36);
16348 static rtx
16349 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16350 int code, tree treeop0, tree treeop1)
16352 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16353 rtx op0, op1;
16354 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16355 insn_code icode;
16356 struct expand_operand ops[4];
16358 start_sequence ();
16359 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16361 op_mode = GET_MODE (op0);
16362 if (op_mode == VOIDmode)
16363 op_mode = GET_MODE (op1);
16365 switch (op_mode)
16367 case E_QImode:
16368 case E_HImode:
16369 case E_SImode:
16370 cmp_mode = SImode;
16371 icode = CODE_FOR_cmpsi;
16372 break;
16374 case E_DImode:
16375 cmp_mode = DImode;
16376 icode = CODE_FOR_cmpdi;
16377 break;
16379 case E_SFmode:
16380 cmp_mode = SFmode;
16381 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16382 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16383 break;
16385 case E_DFmode:
16386 cmp_mode = DFmode;
16387 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16388 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16389 break;
16391 default:
16392 end_sequence ();
16393 return NULL_RTX;
16396 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16397 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16398 if (!op0 || !op1)
16400 end_sequence ();
16401 return NULL_RTX;
16403 *prep_seq = get_insns ();
16404 end_sequence ();
16406 create_fixed_operand (&ops[0], op0);
16407 create_fixed_operand (&ops[1], op1);
16409 start_sequence ();
16410 if (!maybe_expand_insn (icode, 2, ops))
16412 end_sequence ();
16413 return NULL_RTX;
16415 *gen_seq = get_insns ();
16416 end_sequence ();
16418 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16419 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16422 static rtx
16423 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16424 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16426 rtx op0, op1, target;
16427 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16428 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16429 insn_code icode;
16430 struct expand_operand ops[6];
16431 int aarch64_cond;
16433 push_to_sequence (*prep_seq);
16434 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16436 op_mode = GET_MODE (op0);
16437 if (op_mode == VOIDmode)
16438 op_mode = GET_MODE (op1);
16440 switch (op_mode)
16442 case E_QImode:
16443 case E_HImode:
16444 case E_SImode:
16445 cmp_mode = SImode;
16446 icode = CODE_FOR_ccmpsi;
16447 break;
16449 case E_DImode:
16450 cmp_mode = DImode;
16451 icode = CODE_FOR_ccmpdi;
16452 break;
16454 case E_SFmode:
16455 cmp_mode = SFmode;
16456 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16457 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16458 break;
16460 case E_DFmode:
16461 cmp_mode = DFmode;
16462 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16463 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16464 break;
16466 default:
16467 end_sequence ();
16468 return NULL_RTX;
16471 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16472 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16473 if (!op0 || !op1)
16475 end_sequence ();
16476 return NULL_RTX;
16478 *prep_seq = get_insns ();
16479 end_sequence ();
16481 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16482 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16484 if (bit_code != AND)
16486 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16487 GET_MODE (XEXP (prev, 0))),
16488 VOIDmode, XEXP (prev, 0), const0_rtx);
16489 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16492 create_fixed_operand (&ops[0], XEXP (prev, 0));
16493 create_fixed_operand (&ops[1], target);
16494 create_fixed_operand (&ops[2], op0);
16495 create_fixed_operand (&ops[3], op1);
16496 create_fixed_operand (&ops[4], prev);
16497 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16499 push_to_sequence (*gen_seq);
16500 if (!maybe_expand_insn (icode, 6, ops))
16502 end_sequence ();
16503 return NULL_RTX;
16506 *gen_seq = get_insns ();
16507 end_sequence ();
16509 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16512 #undef TARGET_GEN_CCMP_FIRST
16513 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16515 #undef TARGET_GEN_CCMP_NEXT
16516 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16518 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16519 instruction fusion of some sort. */
16521 static bool
16522 aarch64_macro_fusion_p (void)
16524 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16528 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16529 should be kept together during scheduling. */
16531 static bool
16532 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16534 rtx set_dest;
16535 rtx prev_set = single_set (prev);
16536 rtx curr_set = single_set (curr);
16537 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
16538 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16540 if (!aarch64_macro_fusion_p ())
16541 return false;
16543 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16545 /* We are trying to match:
16546 prev (mov) == (set (reg r0) (const_int imm16))
16547 curr (movk) == (set (zero_extract (reg r0)
16548 (const_int 16)
16549 (const_int 16))
16550 (const_int imm16_1)) */
16552 set_dest = SET_DEST (curr_set);
16554 if (GET_CODE (set_dest) == ZERO_EXTRACT
16555 && CONST_INT_P (SET_SRC (curr_set))
16556 && CONST_INT_P (SET_SRC (prev_set))
16557 && CONST_INT_P (XEXP (set_dest, 2))
16558 && INTVAL (XEXP (set_dest, 2)) == 16
16559 && REG_P (XEXP (set_dest, 0))
16560 && REG_P (SET_DEST (prev_set))
16561 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16563 return true;
16567 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16570 /* We're trying to match:
16571 prev (adrp) == (set (reg r1)
16572 (high (symbol_ref ("SYM"))))
16573 curr (add) == (set (reg r0)
16574 (lo_sum (reg r1)
16575 (symbol_ref ("SYM"))))
16576 Note that r0 need not necessarily be the same as r1, especially
16577 during pre-regalloc scheduling. */
16579 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16580 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16582 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16583 && REG_P (XEXP (SET_SRC (curr_set), 0))
16584 && REGNO (XEXP (SET_SRC (curr_set), 0))
16585 == REGNO (SET_DEST (prev_set))
16586 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16587 XEXP (SET_SRC (curr_set), 1)))
16588 return true;
16592 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16595 /* We're trying to match:
16596 prev (movk) == (set (zero_extract (reg r0)
16597 (const_int 16)
16598 (const_int 32))
16599 (const_int imm16_1))
16600 curr (movk) == (set (zero_extract (reg r0)
16601 (const_int 16)
16602 (const_int 48))
16603 (const_int imm16_2)) */
16605 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16606 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16607 && REG_P (XEXP (SET_DEST (prev_set), 0))
16608 && REG_P (XEXP (SET_DEST (curr_set), 0))
16609 && REGNO (XEXP (SET_DEST (prev_set), 0))
16610 == REGNO (XEXP (SET_DEST (curr_set), 0))
16611 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16612 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16613 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16614 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16615 && CONST_INT_P (SET_SRC (prev_set))
16616 && CONST_INT_P (SET_SRC (curr_set)))
16617 return true;
16620 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16622 /* We're trying to match:
16623 prev (adrp) == (set (reg r0)
16624 (high (symbol_ref ("SYM"))))
16625 curr (ldr) == (set (reg r1)
16626 (mem (lo_sum (reg r0)
16627 (symbol_ref ("SYM")))))
16629 curr (ldr) == (set (reg r1)
16630 (zero_extend (mem
16631 (lo_sum (reg r0)
16632 (symbol_ref ("SYM")))))) */
16633 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16634 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16636 rtx curr_src = SET_SRC (curr_set);
16638 if (GET_CODE (curr_src) == ZERO_EXTEND)
16639 curr_src = XEXP (curr_src, 0);
16641 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16642 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16643 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16644 == REGNO (SET_DEST (prev_set))
16645 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16646 XEXP (SET_SRC (prev_set), 0)))
16647 return true;
16651 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16652 && aarch_crypto_can_dual_issue (prev, curr))
16653 return true;
16655 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16656 && any_condjump_p (curr))
16658 enum attr_type prev_type = get_attr_type (prev);
16660 unsigned int condreg1, condreg2;
16661 rtx cc_reg_1;
16662 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16663 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16665 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16666 && prev
16667 && modified_in_p (cc_reg_1, prev))
16669 /* FIXME: this misses some instructions that are considered simple
16670 arithmetic for ThunderX. Simple shifts are missed here. */
16671 if (prev_type == TYPE_ALUS_SREG
16672 || prev_type == TYPE_ALUS_IMM
16673 || prev_type == TYPE_LOGICS_REG
16674 || prev_type == TYPE_LOGICS_IMM)
16675 return true;
16679 if (prev_set
16680 && curr_set
16681 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16682 && any_condjump_p (curr))
16684 /* We're trying to match:
16685 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16686 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16687 (const_int 0))
16688 (label_ref ("SYM"))
16689 (pc)) */
16690 if (SET_DEST (curr_set) == (pc_rtx)
16691 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16692 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16693 && REG_P (SET_DEST (prev_set))
16694 && REGNO (SET_DEST (prev_set))
16695 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16697 /* Fuse ALU operations followed by a conditional branch instruction. */
16698 switch (get_attr_type (prev))
16700 case TYPE_ALU_IMM:
16701 case TYPE_ALU_SREG:
16702 case TYPE_ADC_REG:
16703 case TYPE_ADC_IMM:
16704 case TYPE_ADCS_REG:
16705 case TYPE_ADCS_IMM:
16706 case TYPE_LOGIC_REG:
16707 case TYPE_LOGIC_IMM:
16708 case TYPE_CSEL:
16709 case TYPE_ADR:
16710 case TYPE_MOV_IMM:
16711 case TYPE_SHIFT_REG:
16712 case TYPE_SHIFT_IMM:
16713 case TYPE_BFM:
16714 case TYPE_RBIT:
16715 case TYPE_REV:
16716 case TYPE_EXTEND:
16717 return true;
16719 default:;
16724 return false;
16727 /* Return true iff the instruction fusion described by OP is enabled. */
16729 bool
16730 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16732 return (aarch64_tune_params.fusible_ops & op) != 0;
16735 /* If MEM is in the form of [base+offset], extract the two parts
16736 of the address into BASE and OFFSET; otherwise return false
16737 after clearing BASE and OFFSET. */
16739 bool
16740 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16742 rtx addr;
16744 gcc_assert (MEM_P (mem));
16746 addr = XEXP (mem, 0);
16748 if (REG_P (addr))
16750 *base = addr;
16751 *offset = const0_rtx;
16752 return true;
16755 if (GET_CODE (addr) == PLUS
16756 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16758 *base = XEXP (addr, 0);
16759 *offset = XEXP (addr, 1);
16760 return true;
16763 *base = NULL_RTX;
16764 *offset = NULL_RTX;
16766 return false;
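/* For example, (mem (plus (reg x1) (const_int 16))) gives BASE == x1
   and OFFSET == (const_int 16), while a bare (mem (reg x1)) gives
   OFFSET == const0_rtx.  */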
16769 /* Types for scheduling fusion. */
16770 enum sched_fusion_type
16772 SCHED_FUSION_NONE = 0,
16773 SCHED_FUSION_LD_SIGN_EXTEND,
16774 SCHED_FUSION_LD_ZERO_EXTEND,
16775 SCHED_FUSION_LD,
16776 SCHED_FUSION_ST,
16777 SCHED_FUSION_NUM
16780 /* If INSN is a load or store whose address is in the form [base+offset],
16781 extract the two parts into BASE and OFFSET. Return the scheduling
16782 fusion type of this INSN. */
16784 static enum sched_fusion_type
16785 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16787 rtx x, dest, src;
16788 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16790 gcc_assert (INSN_P (insn));
16791 x = PATTERN (insn);
16792 if (GET_CODE (x) != SET)
16793 return SCHED_FUSION_NONE;
16795 src = SET_SRC (x);
16796 dest = SET_DEST (x);
16798 machine_mode dest_mode = GET_MODE (dest);
16800 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16801 return SCHED_FUSION_NONE;
16803 if (GET_CODE (src) == SIGN_EXTEND)
16805 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16806 src = XEXP (src, 0);
16807 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16808 return SCHED_FUSION_NONE;
16810 else if (GET_CODE (src) == ZERO_EXTEND)
16812 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16813 src = XEXP (src, 0);
16814 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16815 return SCHED_FUSION_NONE;
16818 if (GET_CODE (src) == MEM && REG_P (dest))
16819 extract_base_offset_in_addr (src, base, offset);
16820 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16822 fusion = SCHED_FUSION_ST;
16823 extract_base_offset_in_addr (dest, base, offset);
16825 else
16826 return SCHED_FUSION_NONE;
16828 if (*base == NULL_RTX || *offset == NULL_RTX)
16829 fusion = SCHED_FUSION_NONE;
16831 return fusion;
16834 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16836 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16837 and PRI are only calculated for these instructions. For other instructions,
16838 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16839 types of instruction fusion can be added by returning different priorities.
16841 It's important that irrelevant instructions get the largest FUSION_PRI. */
16843 static void
16844 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16845 int *fusion_pri, int *pri)
16847 int tmp, off_val;
16848 rtx base, offset;
16849 enum sched_fusion_type fusion;
16851 gcc_assert (INSN_P (insn));
16853 tmp = max_pri - 1;
16854 fusion = fusion_load_store (insn, &base, &offset);
16855 if (fusion == SCHED_FUSION_NONE)
16857 *pri = tmp;
16858 *fusion_pri = tmp;
16859 return;
16862 /* Set FUSION_PRI according to fusion type and base register. */
16863 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16865 /* Calculate PRI. */
16866 tmp /= 2;
16868 /* INSN with smaller offset goes first. */
16869 off_val = (int)(INTVAL (offset));
16870 if (off_val >= 0)
16871 tmp -= (off_val & 0xfffff);
16872 else
16873 tmp += ((- off_val) & 0xfffff);
16875 *pri = tmp;
16876 return;
16879 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16880 Adjust priority of sha1h instructions so they are scheduled before
16881 other SHA1 instructions. */
16883 static int
16884 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16886 rtx x = PATTERN (insn);
16888 if (GET_CODE (x) == SET)
16890 x = SET_SRC (x);
16892 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16893 return priority + 10;
16896 return priority;
16899 /* Given OPERANDS of consecutive load/store, check if we can merge
16900 them into ldp/stp. LOAD is true if they are load instructions.
16901 MODE is the mode of memory operands. */
16903 bool
16904 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16905 machine_mode mode)
16907 HOST_WIDE_INT offval_1, offval_2, msize;
16908 enum reg_class rclass_1, rclass_2;
16909 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16911 if (load)
16913 mem_1 = operands[1];
16914 mem_2 = operands[3];
16915 reg_1 = operands[0];
16916 reg_2 = operands[2];
16917 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16918 if (REGNO (reg_1) == REGNO (reg_2))
16919 return false;
16921 else
16923 mem_1 = operands[0];
16924 mem_2 = operands[2];
16925 reg_1 = operands[1];
16926 reg_2 = operands[3];
16929 /* The mems cannot be volatile. */
16930 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16931 return false;
16933 /* If we have SImode and slow unaligned ldp,
16934 check that the alignment is at least 8 bytes. */
16935 if (mode == SImode
16936 && (aarch64_tune_params.extra_tuning_flags
16937 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16938 && !optimize_size
16939 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16940 return false;
16942 /* Check if the addresses are in the form of [base+offset]. */
16943 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16944 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16945 return false;
16946 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16947 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16948 return false;
16950 /* Check if the bases are the same. */
16951 if (!rtx_equal_p (base_1, base_2))
16952 return false;
16954 /* The operands must be of the same size. */
16955 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16956 GET_MODE_SIZE (GET_MODE (mem_2))));
16958 offval_1 = INTVAL (offset_1);
16959 offval_2 = INTVAL (offset_2);
16960 /* We should only be trying this for fixed-sized modes. There is no
16961 SVE LDP/STP instruction. */
16962 msize = GET_MODE_SIZE (mode).to_constant ();
16963 /* Check if the offsets are consecutive. */
16964 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16965 return false;
16967 /* Check if the addresses are clobbered by load. */
16968 if (load)
16970 if (reg_mentioned_p (reg_1, mem_1))
16971 return false;
16973 /* In increasing order, the last load can clobber the address. */
16974 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16975 return false;
16978 /* One of the memory accesses must be a mempair operand.
16979 If it is not the first one, they need to be swapped by the
16980 peephole. */
16981 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16982 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16983 return false;
16985 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16986 rclass_1 = FP_REGS;
16987 else
16988 rclass_1 = GENERAL_REGS;
16990 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16991 rclass_2 = FP_REGS;
16992 else
16993 rclass_2 = GENERAL_REGS;
16995 /* Check if the registers are of the same class. */
16996 if (rclass_1 != rclass_2)
16997 return false;
16999 return true;
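/* For example (register numbers chosen purely for illustration), the
   consecutive loads

     ldr x2, [x0, 8]
     ldr x3, [x0, 16]

   pass all of the checks above: same base, offsets that differ by the
   access size, distinct destination registers of the same class, and
   the first address valid as a pair operand. They can therefore be
   merged into

     ldp x2, x3, [x0, 8]  */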
17002 /* Given OPERANDS of consecutive load/store that can be merged,
17003 swap them if they are not in ascending order. */
17004 void
17005 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17007 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17008 HOST_WIDE_INT offval_1, offval_2;
17010 if (load)
17012 mem_1 = operands[1];
17013 mem_2 = operands[3];
17015 else
17017 mem_1 = operands[0];
17018 mem_2 = operands[2];
17021 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17022 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17024 offval_1 = INTVAL (offset_1);
17025 offval_2 = INTVAL (offset_2);
17027 if (offval_1 > offval_2)
17029 /* Irrespective of whether this is a load or a store,
17030 we do the same swap. */
17031 std::swap (operands[0], operands[2]);
17032 std::swap (operands[1], operands[3]);
17036 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17037 comparison between the two. */
17038 int
17039 aarch64_host_wide_int_compare (const void *x, const void *y)
17041 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17042 * ((const HOST_WIDE_INT *) y));
17045 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17046 other pointing to a REG rtx containing an offset, compare the offsets
17047 of the two pairs.
17049 Return:
17051 1 iff offset (X) > offset (Y)
17052 0 iff offset (X) == offset (Y)
17053 -1 iff offset (X) < offset (Y) */
17054 int
17055 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17057 const rtx * operands_1 = (const rtx *) x;
17058 const rtx * operands_2 = (const rtx *) y;
17059 rtx mem_1, mem_2, base, offset_1, offset_2;
17061 if (MEM_P (operands_1[0]))
17062 mem_1 = operands_1[0];
17063 else
17064 mem_1 = operands_1[1];
17066 if (MEM_P (operands_2[0]))
17067 mem_2 = operands_2[0];
17068 else
17069 mem_2 = operands_2[1];
17071 /* Extract the offsets. */
17072 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17073 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17075 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17077 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17080 /* Given OPERANDS of consecutive load/store, check if we can merge
17081 them into ldp/stp by adjusting the offset. LOAD is true if they
17082 are load instructions. MODE is the mode of memory operands.
17084 Given below consecutive stores:
17086 str w1, [xb, 0x100]
17087 str w1, [xb, 0x104]
17088 str w1, [xb, 0x108]
17089 str w1, [xb, 0x10c]
17091 Though the offsets are out of the range supported by stp, we can
17092 still pair them after adjusting the offset, like:
17094 add scratch, xb, 0x100
17095 stp w1, w1, [scratch]
17096 stp w1, w1, [scratch, 0x8]
17098 The peephole patterns detecting this opportunity should guarantee
17099 the scratch register is available. */
17101 bool
17102 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17103 scalar_mode mode)
17105 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
17106 HOST_WIDE_INT offvals[4], msize;
17107 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
17108 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
17110 if (load)
17112 reg_1 = operands[0];
17113 mem_1 = operands[1];
17114 reg_2 = operands[2];
17115 mem_2 = operands[3];
17116 reg_3 = operands[4];
17117 mem_3 = operands[5];
17118 reg_4 = operands[6];
17119 mem_4 = operands[7];
17120 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
17121 && REG_P (reg_3) && REG_P (reg_4));
17123 /* Do not attempt to merge the loads if the loads clobber each other. */
17124 for (int i = 0; i < 8; i += 2)
17125 for (int j = i + 2; j < 8; j += 2)
17126 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17127 return false;
17129 else
17131 mem_1 = operands[0];
17132 reg_1 = operands[1];
17133 mem_2 = operands[2];
17134 reg_2 = operands[3];
17135 mem_3 = operands[4];
17136 reg_3 = operands[5];
17137 mem_4 = operands[6];
17138 reg_4 = operands[7];
17140 /* Skip if the memory operand is by itself valid for ldp/stp. */
17141 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
17142 return false;
17144 /* The mems cannot be volatile. */
17145 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
17146 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
17147 return false;
17149 /* Check if the addresses are in the form of [base+offset]. */
17150 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17151 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17152 return false;
17153 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17154 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17155 return false;
17156 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17157 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17158 return false;
17159 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17160 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17161 return false;
17163 /* Check if the bases are the same. */
17164 if (!rtx_equal_p (base_1, base_2)
17165 || !rtx_equal_p (base_2, base_3)
17166 || !rtx_equal_p (base_3, base_4))
17167 return false;
17169 offvals[0] = INTVAL (offset_1);
17170 offvals[1] = INTVAL (offset_2);
17171 offvals[2] = INTVAL (offset_3);
17172 offvals[3] = INTVAL (offset_4);
17173 msize = GET_MODE_SIZE (mode);
17175 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17176 qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
17178 if (!(offvals[1] == offvals[0] + msize
17179 && offvals[3] == offvals[2] + msize))
17180 return false;
17182 /* Check that offsets are within range of each other. The ldp/stp
17183 instructions have 7-bit immediate offsets, so use 0x80. */
17184 if (offvals[2] - offvals[0] >= msize * 0x80)
17185 return false;
17187 /* The offsets must be aligned with respect to each other. */
17188 if (offvals[0] % msize != offvals[2] % msize)
17189 return false;
17191 /* Check if the addresses are clobbered by load. */
17192 if (load && (reg_mentioned_p (reg_1, mem_1)
17193 || reg_mentioned_p (reg_2, mem_2)
17194 || reg_mentioned_p (reg_3, mem_3)
17195 || reg_mentioned_p (reg_4, mem_4)))
17196 return false;
17198 /* If we have SImode and slow unaligned ldp,
17199 check that the alignment is at least 8 bytes. */
17200 if (mode == SImode
17201 && (aarch64_tune_params.extra_tuning_flags
17202 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17203 && !optimize_size
17204 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17205 return false;
17207 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17208 rclass_1 = FP_REGS;
17209 else
17210 rclass_1 = GENERAL_REGS;
17212 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17213 rclass_2 = FP_REGS;
17214 else
17215 rclass_2 = GENERAL_REGS;
17217 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17218 rclass_3 = FP_REGS;
17219 else
17220 rclass_3 = GENERAL_REGS;
17222 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17223 rclass_4 = FP_REGS;
17224 else
17225 rclass_4 = GENERAL_REGS;
17227 /* Check if the registers are of the same class. */
17228 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17229 return false;
17231 return true;
17234 /* Given OPERANDS of consecutive load/store, this function pairs them
17235 into LDP/STP after adjusting the offset. It depends on the fact
17236 that the operands can be sorted so the offsets are correct for STP.
17237 MODE is the mode of memory operands. CODE is the rtl operator
17238 which should be applied to all memory operands; it is SIGN_EXTEND,
17239 ZERO_EXTEND or UNKNOWN. */
17241 bool
17242 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17243 scalar_mode mode, RTX_CODE code)
17245 rtx base, offset_1, offset_3, t1, t2;
17246 rtx mem_1, mem_2, mem_3, mem_4;
17247 rtx temp_operands[8];
17248 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17249 stp_off_upper_limit, stp_off_lower_limit, msize;
17251 /* We make changes on a copy as we may still bail out. */
17252 for (int i = 0; i < 8; i ++)
17253 temp_operands[i] = operands[i];
17255 /* Sort the operands. */
17256 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17258 if (load)
17260 mem_1 = temp_operands[1];
17261 mem_2 = temp_operands[3];
17262 mem_3 = temp_operands[5];
17263 mem_4 = temp_operands[7];
17265 else
17267 mem_1 = temp_operands[0];
17268 mem_2 = temp_operands[2];
17269 mem_3 = temp_operands[4];
17270 mem_4 = temp_operands[6];
17271 gcc_assert (code == UNKNOWN);
17274 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17275 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17276 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17277 && offset_3 != NULL_RTX);
17279 /* Adjust offset so it can fit in LDP/STP instruction. */
17280 msize = GET_MODE_SIZE (mode);
17281 stp_off_upper_limit = msize * (0x40 - 1);
17282 stp_off_lower_limit = - msize * 0x40;
17284 off_val_1 = INTVAL (offset_1);
17285 off_val_3 = INTVAL (offset_3);
17287 /* The base offset is optimally half way between the two STP/LDP offsets. */
17288 if (msize <= 4)
17289 base_off = (off_val_1 + off_val_3) / 2;
17290 else
17291 /* However, due to issues with negative LDP/STP offset generation for
17292 larger modes (DF, DI and vector modes), we must not use negative
17293 addresses smaller than 9 signed unadjusted bits can store. This
17294 provides the most range in this case. */
17295 base_off = off_val_1;
17297 /* Adjust the base so that it is aligned with the addresses but still
17298 optimal. */
17299 if (base_off % msize != off_val_1 % msize)
17300 /* Fix the offset, bearing in mind we want to make it bigger not
17301 smaller. */
17302 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17303 else if (msize <= 4)
17304 /* The negative range of LDP/STP is one larger than the positive range. */
17305 base_off += msize;
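/* For example, with msize == 4, off_val_1 == 100 and off_val_3 == 108,
base_off starts as (100 + 108) / 2 == 104; the remainders match, so the
msize <= 4 branch above bumps it to 108, giving final LDP/STP offsets of
-8 and 0.  */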
17307 /* Check if base offset is too big or too small. We can attempt to resolve
17308 this issue by setting it to the maximum value and seeing if the offsets
17309 still fit. */
17310 if (base_off >= 0x1000)
17312 base_off = 0x1000 - 1;
17313 /* We must still make sure that the base offset is aligned with respect
17314 to the address. But it may not be made any bigger. */
17315 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
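/* For example, with msize == 8 and off_val_1 % 8 == 4, clamping to 4095
leaves a remainder of 7, so the line above subtracts ((7 - 4) + 8) % 8 == 3
and the base becomes 4092, again congruent to off_val_1 modulo 8.  */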
17318 /* Likewise for the case where the base is too small. */
17319 if (base_off <= -0x1000)
17321 base_off = -0x1000 + 1;
17322 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17325 /* Offset of the first STP/LDP. */
17326 new_off_1 = off_val_1 - base_off;
17328 /* Offset of the second STP/LDP. */
17329 new_off_3 = off_val_3 - base_off;
17331 /* The offsets must be within the range of the LDP/STP instructions. */
17332 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17333 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17334 return false;
17336 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17337 new_off_1), true);
17338 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17339 new_off_1 + msize), true);
17340 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17341 new_off_3), true);
17342 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17343 new_off_3 + msize), true);
17345 if (!aarch64_mem_pair_operand (mem_1, mode)
17346 || !aarch64_mem_pair_operand (mem_3, mode))
17347 return false;
17349 if (code == ZERO_EXTEND)
17351 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17352 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17353 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17354 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17356 else if (code == SIGN_EXTEND)
17358 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17359 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17360 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17361 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17364 if (load)
17366 operands[0] = temp_operands[0];
17367 operands[1] = mem_1;
17368 operands[2] = temp_operands[2];
17369 operands[3] = mem_2;
17370 operands[4] = temp_operands[4];
17371 operands[5] = mem_3;
17372 operands[6] = temp_operands[6];
17373 operands[7] = mem_4;
17375 else
17377 operands[0] = mem_1;
17378 operands[1] = temp_operands[1];
17379 operands[2] = mem_2;
17380 operands[3] = temp_operands[3];
17381 operands[4] = mem_3;
17382 operands[5] = temp_operands[5];
17383 operands[6] = mem_4;
17384 operands[7] = temp_operands[7];
17387 /* Emit adjusting instruction. */
17388 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17389 /* Emit ldp/stp instructions. */
17390 t1 = gen_rtx_SET (operands[0], operands[1]);
17391 t2 = gen_rtx_SET (operands[2], operands[3]);
17392 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17393 t1 = gen_rtx_SET (operands[4], operands[5]);
17394 t2 = gen_rtx_SET (operands[6], operands[7]);
17395 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17396 return true;
17399 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17400 it isn't worth branching around empty masked ops (including masked
17401 stores). */
17403 static bool
17404 aarch64_empty_mask_is_expensive (unsigned)
17406 return false;
17409 /* Return true if a pseudo register should be created and used to hold
17410 the GOT address for PIC code. */
17412 bool
17413 aarch64_use_pseudo_pic_reg (void)
17415 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17418 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17420 static int
17421 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17423 switch (XINT (x, 1))
17425 case UNSPEC_GOTSMALLPIC:
17426 case UNSPEC_GOTSMALLPIC28K:
17427 case UNSPEC_GOTTINYPIC:
17428 return 0;
17429 default:
17430 break;
17433 return default_unspec_may_trap_p (x, flags);
17437 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
17438 return the log2 of that value. Otherwise return -1. */
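/* For example, 8.0 yields 3, while 6.0, 0.5 and -4.0 all yield -1.  */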
17440 int
17441 aarch64_fpconst_pow_of_2 (rtx x)
17443 const REAL_VALUE_TYPE *r;
17445 if (!CONST_DOUBLE_P (x))
17446 return -1;
17448 r = CONST_DOUBLE_REAL_VALUE (x);
17450 if (REAL_VALUE_NEGATIVE (*r)
17451 || REAL_VALUE_ISNAN (*r)
17452 || REAL_VALUE_ISINF (*r)
17453 || !real_isinteger (r, DFmode))
17454 return -1;
17456 return exact_log2 (real_to_integer (r));
17459 /* If X is a vector of equal CONST_DOUBLE values and that value is
17460 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17462 int
17463 aarch64_vec_fpconst_pow_of_2 (rtx x)
17465 int nelts;
17466 if (GET_CODE (x) != CONST_VECTOR
17467 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17468 return -1;
17470 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17471 return -1;
17473 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17474 if (firstval <= 0)
17475 return -1;
17477 for (int i = 1; i < nelts; i++)
17478 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17479 return -1;
17481 return firstval;
17484 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17485 to float.
17487 __fp16 always promotes through this hook.
17488 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17489 through the generic excess precision logic rather than here. */
17491 static tree
17492 aarch64_promoted_type (const_tree t)
17494 if (SCALAR_FLOAT_TYPE_P (t)
17495 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17496 return float_type_node;
17498 return NULL_TREE;
17501 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17503 static bool
17504 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17505 optimization_type opt_type)
17507 switch (op)
17509 case rsqrt_optab:
17510 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17512 default:
17513 return true;
17517 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17519 static unsigned int
17520 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17521 int *offset)
17523 /* Polynomial invariant 1 == (VG / 2) - 1. */
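/* For example, at the minimum 128-bit vector length VG is 2, so the
indeterminate is 0; at 256 bits VG is 4 and the indeterminate is 1.  */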
17524 gcc_assert (i == 1);
17525 *factor = 2;
17526 *offset = 1;
17527 return AARCH64_DWARF_VG;
17530 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17531 if MODE is HFmode, and punt to the generic implementation otherwise. */
17533 static bool
17534 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17536 return (mode == HFmode
17537 ? true
17538 : default_libgcc_floating_mode_supported_p (mode));
17541 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17542 if MODE is HFmode, and punt to the generic implementation otherwise. */
17544 static bool
17545 aarch64_scalar_mode_supported_p (scalar_mode mode)
17547 return (mode == HFmode
17548 ? true
17549 : default_scalar_mode_supported_p (mode));
17552 /* Set the value of FLT_EVAL_METHOD.
17553 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17555 0: evaluate all operations and constants, whose semantic type has at
17556 most the range and precision of type float, to the range and
17557 precision of float; evaluate all other operations and constants to
17558 the range and precision of the semantic type;
17560 N, where _FloatN is a supported interchange floating type:
17561 evaluate all operations and constants, whose semantic type has at
17562 most the range and precision of _FloatN type, to the range and
17563 precision of the _FloatN type; evaluate all other operations and
17564 constants to the range and precision of the semantic type;
17566 If we have the ARMv8.2-A extensions then we support _Float16 in native
17567 precision, so we should set this to 16. Otherwise, we support the type,
17568 but want to evaluate expressions in float precision, so set this to
17569 0. */
17571 static enum flt_eval_method
17572 aarch64_excess_precision (enum excess_precision_type type)
17574 switch (type)
17576 case EXCESS_PRECISION_TYPE_FAST:
17577 case EXCESS_PRECISION_TYPE_STANDARD:
17578 /* We can calculate either in 16-bit range and precision or
17579 32-bit range and precision. Make that decision based on whether
17580 we have native support for the ARMv8.2-A 16-bit floating-point
17581 instructions or not. */
17582 return (TARGET_FP_F16INST
17583 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17584 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17585 case EXCESS_PRECISION_TYPE_IMPLICIT:
17586 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17587 default:
17588 gcc_unreachable ();
17590 return FLT_EVAL_METHOD_UNPREDICTABLE;
17593 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17594 scheduled for speculative execution. Reject the long-running division
17595 and square-root instructions. */
17597 static bool
17598 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17600 switch (get_attr_type (insn))
17602 case TYPE_SDIV:
17603 case TYPE_UDIV:
17604 case TYPE_FDIVS:
17605 case TYPE_FDIVD:
17606 case TYPE_FSQRTS:
17607 case TYPE_FSQRTD:
17608 case TYPE_NEON_FP_SQRT_S:
17609 case TYPE_NEON_FP_SQRT_D:
17610 case TYPE_NEON_FP_SQRT_S_Q:
17611 case TYPE_NEON_FP_SQRT_D_Q:
17612 case TYPE_NEON_FP_DIV_S:
17613 case TYPE_NEON_FP_DIV_D:
17614 case TYPE_NEON_FP_DIV_S_Q:
17615 case TYPE_NEON_FP_DIV_D_Q:
17616 return false;
17617 default:
17618 return true;
17622 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17624 static int
17625 aarch64_compute_pressure_classes (reg_class *classes)
17627 int i = 0;
17628 classes[i++] = GENERAL_REGS;
17629 classes[i++] = FP_REGS;
17630 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17631 registers need to go in PR_LO_REGS at some point during their
17632 lifetime. Splitting it into two halves has the effect of making
17633 all predicates count against PR_LO_REGS, so that we try whenever
17634 possible to restrict the number of live predicates to 8. This
17635 greatly reduces the amount of spilling in certain loops. */
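/* (PR_LO_REGS is P0-P7, the only predicate registers that most SVE
instructions can use as a governing predicate; PR_HI_REGS is P8-P15.)  */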
17636 classes[i++] = PR_LO_REGS;
17637 classes[i++] = PR_HI_REGS;
17638 return i;
17641 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17643 static bool
17644 aarch64_can_change_mode_class (machine_mode from,
17645 machine_mode to, reg_class_t)
17647 if (BYTES_BIG_ENDIAN)
17649 bool from_sve_p = aarch64_sve_data_mode_p (from);
17650 bool to_sve_p = aarch64_sve_data_mode_p (to);
17652 /* Don't allow changes between SVE data modes and non-SVE modes.
17653 See the comment at the head of aarch64-sve.md for details. */
17654 if (from_sve_p != to_sve_p)
17655 return false;
17657 /* Don't allow changes in element size: lane 0 of the new vector
17658 would not then be lane 0 of the old vector. See the comment
17659 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17660 description.
17662 In the worst case, this forces a register to be spilled in
17663 one mode and reloaded in the other, which handles the
17664 endianness correctly. */
17665 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17666 return false;
17668 return true;
17671 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
17673 static void
17674 aarch64_select_early_remat_modes (sbitmap modes)
17676 /* SVE values are not normally live across a call, so it should be
17677 worth doing early rematerialization even in VL-specific mode. */
17678 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17680 machine_mode mode = (machine_mode) i;
17681 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17682 if (vec_flags & VEC_ANY_SVE)
17683 bitmap_set_bit (modes, i);
17687 /* Target-specific selftests. */
17689 #if CHECKING_P
17691 namespace selftest {
17693 /* Selftest for the RTL loader.
17694 Verify that the RTL loader copes with a dump from
17695 print_rtx_function. This is essentially just a test that class
17696 function_reader can handle a real dump, but it also verifies
17697 that lookup_reg_by_dump_name correctly handles hard regs.
17698 The presence of hard reg names in the dump means that the test is
17699 target-specific, hence it is in this file. */
17701 static void
17702 aarch64_test_loading_full_dump ()
17704 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17706 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17708 rtx_insn *insn_1 = get_insn_by_uid (1);
17709 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17711 rtx_insn *insn_15 = get_insn_by_uid (15);
17712 ASSERT_EQ (INSN, GET_CODE (insn_15));
17713 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17715 /* Verify crtl->return_rtx. */
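/* Hard register 0 is the integer result register, so times_two's return
value is expected in w0 as an SImode REG.  */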
17716 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17717 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17718 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17721 /* Run all target-specific selftests. */
17723 static void
17724 aarch64_run_selftests (void)
17726 aarch64_test_loading_full_dump ();
17729 } // namespace selftest
17731 #endif /* #if CHECKING_P */
17733 #undef TARGET_ADDRESS_COST
17734 #define TARGET_ADDRESS_COST aarch64_address_cost
17736 /* This hook determines whether unnamed bitfields affect the alignment
17737 of the containing structure. The hook returns true if the structure
17738 should inherit the alignment requirements of an unnamed bitfield's
17739 type. */
17740 #undef TARGET_ALIGN_ANON_BITFIELD
17741 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17743 #undef TARGET_ASM_ALIGNED_DI_OP
17744 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17746 #undef TARGET_ASM_ALIGNED_HI_OP
17747 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17749 #undef TARGET_ASM_ALIGNED_SI_OP
17750 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17752 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17753 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17754 hook_bool_const_tree_hwi_hwi_const_tree_true
17756 #undef TARGET_ASM_FILE_START
17757 #define TARGET_ASM_FILE_START aarch64_start_file
17759 #undef TARGET_ASM_OUTPUT_MI_THUNK
17760 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17762 #undef TARGET_ASM_SELECT_RTX_SECTION
17763 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17765 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17766 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17768 #undef TARGET_BUILD_BUILTIN_VA_LIST
17769 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17771 #undef TARGET_CALLEE_COPIES
17772 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17774 #undef TARGET_CAN_ELIMINATE
17775 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17777 #undef TARGET_CAN_INLINE_P
17778 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17780 #undef TARGET_CANNOT_FORCE_CONST_MEM
17781 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17783 #undef TARGET_CASE_VALUES_THRESHOLD
17784 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17786 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17787 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17789 /* Only the least significant bit is used for initialization guard
17790 variables. */
17791 #undef TARGET_CXX_GUARD_MASK_BIT
17792 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17794 #undef TARGET_C_MODE_FOR_SUFFIX
17795 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17797 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17798 #undef TARGET_DEFAULT_TARGET_FLAGS
17799 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17800 #endif
17802 #undef TARGET_CLASS_MAX_NREGS
17803 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17805 #undef TARGET_BUILTIN_DECL
17806 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17808 #undef TARGET_BUILTIN_RECIPROCAL
17809 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17811 #undef TARGET_C_EXCESS_PRECISION
17812 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17814 #undef TARGET_EXPAND_BUILTIN
17815 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17817 #undef TARGET_EXPAND_BUILTIN_VA_START
17818 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17820 #undef TARGET_FOLD_BUILTIN
17821 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17823 #undef TARGET_FUNCTION_ARG
17824 #define TARGET_FUNCTION_ARG aarch64_function_arg
17826 #undef TARGET_FUNCTION_ARG_ADVANCE
17827 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17829 #undef TARGET_FUNCTION_ARG_BOUNDARY
17830 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17832 #undef TARGET_FUNCTION_ARG_PADDING
17833 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17835 #undef TARGET_GET_RAW_RESULT_MODE
17836 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17837 #undef TARGET_GET_RAW_ARG_MODE
17838 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17840 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17841 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17843 #undef TARGET_FUNCTION_VALUE
17844 #define TARGET_FUNCTION_VALUE aarch64_function_value
17846 #undef TARGET_FUNCTION_VALUE_REGNO_P
17847 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17849 #undef TARGET_GIMPLE_FOLD_BUILTIN
17850 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17852 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17853 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17855 #undef TARGET_INIT_BUILTINS
17856 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17858 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17859 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17860 aarch64_ira_change_pseudo_allocno_class
17862 #undef TARGET_LEGITIMATE_ADDRESS_P
17863 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17865 #undef TARGET_LEGITIMATE_CONSTANT_P
17866 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17868 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17869 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17870 aarch64_legitimize_address_displacement
17872 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17873 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17875 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17876 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17877 aarch64_libgcc_floating_mode_supported_p
17879 #undef TARGET_MANGLE_TYPE
17880 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17882 #undef TARGET_MEMORY_MOVE_COST
17883 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17885 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17886 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17888 #undef TARGET_MUST_PASS_IN_STACK
17889 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17891 /* This target hook should return true if accesses to volatile bitfields
17892 should use the narrowest mode possible. It should return false if these
17893 accesses should use the bitfield container type. */
17894 #undef TARGET_NARROW_VOLATILE_BITFIELD
17895 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17897 #undef TARGET_OPTION_OVERRIDE
17898 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17900 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17901 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17902 aarch64_override_options_after_change
17904 #undef TARGET_OPTION_SAVE
17905 #define TARGET_OPTION_SAVE aarch64_option_save
17907 #undef TARGET_OPTION_RESTORE
17908 #define TARGET_OPTION_RESTORE aarch64_option_restore
17910 #undef TARGET_OPTION_PRINT
17911 #define TARGET_OPTION_PRINT aarch64_option_print
17913 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17914 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17916 #undef TARGET_SET_CURRENT_FUNCTION
17917 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17919 #undef TARGET_PASS_BY_REFERENCE
17920 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17922 #undef TARGET_PREFERRED_RELOAD_CLASS
17923 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17925 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17926 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17928 #undef TARGET_PROMOTED_TYPE
17929 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17931 #undef TARGET_SECONDARY_RELOAD
17932 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17934 #undef TARGET_SHIFT_TRUNCATION_MASK
17935 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17937 #undef TARGET_SETUP_INCOMING_VARARGS
17938 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17940 #undef TARGET_STRUCT_VALUE_RTX
17941 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17943 #undef TARGET_REGISTER_MOVE_COST
17944 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17946 #undef TARGET_RETURN_IN_MEMORY
17947 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17949 #undef TARGET_RETURN_IN_MSB
17950 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17952 #undef TARGET_RTX_COSTS
17953 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17955 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17956 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17958 #undef TARGET_SCHED_ISSUE_RATE
17959 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17961 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17962 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17963 aarch64_sched_first_cycle_multipass_dfa_lookahead
17965 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17966 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17967 aarch64_first_cycle_multipass_dfa_lookahead_guard
17969 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17970 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17971 aarch64_get_separate_components
17973 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17974 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17975 aarch64_components_for_bb
17977 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17978 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17979 aarch64_disqualify_components
17981 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17982 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17983 aarch64_emit_prologue_components
17985 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17986 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17987 aarch64_emit_epilogue_components
17989 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17990 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17991 aarch64_set_handled_components
17993 #undef TARGET_TRAMPOLINE_INIT
17994 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17996 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17997 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17999 #undef TARGET_VECTOR_MODE_SUPPORTED_P
18000 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
18002 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
18003 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
18004 aarch64_builtin_support_vector_misalignment
18006 #undef TARGET_ARRAY_MODE
18007 #define TARGET_ARRAY_MODE aarch64_array_mode
18009 #undef TARGET_ARRAY_MODE_SUPPORTED_P
18010 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
18012 #undef TARGET_VECTORIZE_ADD_STMT_COST
18013 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
18015 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
18016 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
18017 aarch64_builtin_vectorization_cost
18019 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
18020 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
18022 #undef TARGET_VECTORIZE_BUILTINS
18023 #define TARGET_VECTORIZE_BUILTINS
18025 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
18026 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
18027 aarch64_builtin_vectorized_function
18029 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
18030 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
18031 aarch64_autovectorize_vector_sizes
18033 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18034 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18035 aarch64_atomic_assign_expand_fenv
18037 /* Section anchor support. */
18039 #undef TARGET_MIN_ANCHOR_OFFSET
18040 #define TARGET_MIN_ANCHOR_OFFSET -256
18042 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18043 byte offset; we can do much more for larger data types, but have no way
18044 to determine the size of the access. We assume accesses are aligned. */
18045 #undef TARGET_MAX_ANCHOR_OFFSET
18046 #define TARGET_MAX_ANCHOR_OFFSET 4095
18048 #undef TARGET_VECTOR_ALIGNMENT
18049 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18051 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18052 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18053 aarch64_vectorize_preferred_vector_alignment
18054 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18055 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18056 aarch64_simd_vector_alignment_reachable
18058 /* vec_perm support. */
18060 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18061 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18062 aarch64_vectorize_vec_perm_const
18064 #undef TARGET_VECTORIZE_GET_MASK_MODE
18065 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18066 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18067 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18068 aarch64_empty_mask_is_expensive
18070 #undef TARGET_INIT_LIBFUNCS
18071 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18073 #undef TARGET_FIXED_CONDITION_CODE_REGS
18074 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18076 #undef TARGET_FLAGS_REGNUM
18077 #define TARGET_FLAGS_REGNUM CC_REGNUM
18079 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18080 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18082 #undef TARGET_ASAN_SHADOW_OFFSET
18083 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18085 #undef TARGET_LEGITIMIZE_ADDRESS
18086 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18088 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18089 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18091 #undef TARGET_CAN_USE_DOLOOP_P
18092 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18094 #undef TARGET_SCHED_ADJUST_PRIORITY
18095 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18097 #undef TARGET_SCHED_MACRO_FUSION_P
18098 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18100 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18101 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18103 #undef TARGET_SCHED_FUSION_PRIORITY
18104 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18106 #undef TARGET_UNSPEC_MAY_TRAP_P
18107 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18109 #undef TARGET_USE_PSEUDO_PIC_REG
18110 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18112 #undef TARGET_PRINT_OPERAND
18113 #define TARGET_PRINT_OPERAND aarch64_print_operand
18115 #undef TARGET_PRINT_OPERAND_ADDRESS
18116 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18118 #undef TARGET_OPTAB_SUPPORTED_P
18119 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18121 #undef TARGET_OMIT_STRUCT_RETURN_REG
18122 #define TARGET_OMIT_STRUCT_RETURN_REG true
18124 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18125 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18126 aarch64_dwarf_poly_indeterminate_value
18128 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
18129 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18130 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18132 #undef TARGET_HARD_REGNO_NREGS
18133 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18134 #undef TARGET_HARD_REGNO_MODE_OK
18135 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18137 #undef TARGET_MODES_TIEABLE_P
18138 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18140 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18141 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18142 aarch64_hard_regno_call_part_clobbered
18144 #undef TARGET_CONSTANT_ALIGNMENT
18145 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18147 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18148 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18150 #undef TARGET_CAN_CHANGE_MODE_CLASS
18151 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18153 #undef TARGET_SELECT_EARLY_REMAT_MODES
18154 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18156 #if CHECKING_P
18157 #undef TARGET_RUN_TARGET_SELFTESTS
18158 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18159 #endif /* #if CHECKING_P */
18161 struct gcc_target targetm = TARGET_INITIALIZER;
18163 #include "gt-aarch64.h"