[gen/AArch64] Generate helpers for substituting iterator values into pattern names
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 13b5448aca88555222481f0955237b6fdcbb38b9
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102	   A constant symbolic address, placed in a pc-relative literal pool.  */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
210 aarch64_addr_query_type);
212 /* Major revision number of the ARM Architecture implemented by the target. */
213 unsigned aarch64_architecture_version;
215 /* The processor for which instructions should be scheduled. */
216 enum aarch64_processor aarch64_tune = cortexa53;
218 /* Mask to specify which instruction scheduling options should be used. */
219 unsigned long aarch64_tune_flags = 0;
221 /* Global flag for PC relative loads. */
222 bool aarch64_pcrelative_literal_loads;
224 /* Global flag for whether frame pointer is enabled. */
225 bool aarch64_use_frame_pointer;
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
231 const char* name;
232 unsigned int flag;
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
255 /* Tuning parameters. */
257 static const struct cpu_addrcost_table generic_addrcost_table =
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
297 1, /* pre_modify */
298 0, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
321 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
324 1, /* hi */
325 1, /* si */
326 1, /* di */
327 2, /* ti */
329 1, /* pre_modify */
330 1, /* post_modify */
331 3, /* register_offset */
332 4, /* register_sextend */
333 3, /* register_zextend */
334 2, /* imm_offset */
337 static const struct cpu_regmove_cost generic_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost cortexa57_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost. */
352 5, /* GP2FP */
353 5, /* FP2GP */
354 2 /* FP2FP */
357 static const struct cpu_regmove_cost cortexa53_regmove_cost =
359 1, /* GP2GP */
360 /* Avoid the use of slow int<->fp moves for spilling by setting
361 their cost higher than memmov_cost. */
362 5, /* GP2FP */
363 5, /* FP2GP */
364 2 /* FP2FP */
367 static const struct cpu_regmove_cost exynosm1_regmove_cost =
369 1, /* GP2GP */
370 /* Avoid the use of slow int<->fp moves for spilling by setting
371	     their cost higher than memmov_cost (actual costs are 4 and 9).  */
372 9, /* GP2FP */
373 9, /* FP2GP */
374 1 /* FP2FP */
377 static const struct cpu_regmove_cost thunderx_regmove_cost =
379 2, /* GP2GP */
380 2, /* GP2FP */
381 6, /* FP2GP */
382 4 /* FP2FP */
385 static const struct cpu_regmove_cost xgene1_regmove_cost =
387 1, /* GP2GP */
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost. */
390 8, /* GP2FP */
391 8, /* FP2GP */
392 2 /* FP2FP */
395 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
397 2, /* GP2GP */
398 /* Avoid the use of int<->fp moves for spilling. */
399 6, /* GP2FP */
400 6, /* FP2GP */
401 4 /* FP2FP */
404 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
406 1, /* GP2GP */
407 /* Avoid the use of int<->fp moves for spilling. */
408 8, /* GP2FP */
409 8, /* FP2GP */
410 4 /* FP2FP */
413 /* Generic costs for vector insn classes. */
414 static const struct cpu_vector_cost generic_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 1, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 1, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 2, /* vec_permute_cost */
423 1, /* vec_to_scalar_cost */
424 1, /* scalar_to_vec_cost */
425 1, /* vec_align_load_cost */
426 1, /* vec_unalign_load_cost */
427 1, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 1 /* cond_not_taken_branch_cost */
433 /* ThunderX costs for vector insn classes. */
434 static const struct cpu_vector_cost thunderx_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 3, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 4, /* vec_int_stmt_cost */
441 1, /* vec_fp_stmt_cost */
442 4, /* vec_permute_cost */
443 2, /* vec_to_scalar_cost */
444 2, /* scalar_to_vec_cost */
445 3, /* vec_align_load_cost */
446 5, /* vec_unalign_load_cost */
447 5, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 3, /* cond_taken_branch_cost */
450 3 /* cond_not_taken_branch_cost */
453 /* Generic costs for vector insn classes. */
454 static const struct cpu_vector_cost cortexa57_vector_cost =
456 1, /* scalar_int_stmt_cost */
457 1, /* scalar_fp_stmt_cost */
458 4, /* scalar_load_cost */
459 1, /* scalar_store_cost */
460 2, /* vec_int_stmt_cost */
461 2, /* vec_fp_stmt_cost */
462 3, /* vec_permute_cost */
463 8, /* vec_to_scalar_cost */
464 8, /* scalar_to_vec_cost */
465 4, /* vec_align_load_cost */
466 4, /* vec_unalign_load_cost */
467 1, /* vec_unalign_store_cost */
468 1, /* vec_store_cost */
469 1, /* cond_taken_branch_cost */
470 1 /* cond_not_taken_branch_cost */
473 static const struct cpu_vector_cost exynosm1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 3, /* vec_int_stmt_cost */
480 3, /* vec_fp_stmt_cost */
481 3, /* vec_permute_cost */
482 3, /* vec_to_scalar_cost */
483 3, /* scalar_to_vec_cost */
484 5, /* vec_align_load_cost */
485 5, /* vec_unalign_load_cost */
486 1, /* vec_unalign_store_cost */
487 1, /* vec_store_cost */
488 1, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Generic costs for vector insn classes. */
493 static const struct cpu_vector_cost xgene1_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 1, /* scalar_fp_stmt_cost */
497 5, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 2, /* vec_int_stmt_cost */
500 2, /* vec_fp_stmt_cost */
501 2, /* vec_permute_cost */
502 4, /* vec_to_scalar_cost */
503 4, /* scalar_to_vec_cost */
504 10, /* vec_align_load_cost */
505 10, /* vec_unalign_load_cost */
506 2, /* vec_unalign_store_cost */
507 2, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Costs for vector insn classes for Vulcan. */
513 static const struct cpu_vector_cost thunderx2t99_vector_cost =
515 1, /* scalar_int_stmt_cost */
516 6, /* scalar_fp_stmt_cost */
517 4, /* scalar_load_cost */
518 1, /* scalar_store_cost */
519 5, /* vec_int_stmt_cost */
520 6, /* vec_fp_stmt_cost */
521 3, /* vec_permute_cost */
522 6, /* vec_to_scalar_cost */
523 5, /* scalar_to_vec_cost */
524 8, /* vec_align_load_cost */
525 8, /* vec_unalign_load_cost */
526 4, /* vec_unalign_store_cost */
527 4, /* vec_store_cost */
528 2, /* cond_taken_branch_cost */
529 1 /* cond_not_taken_branch_cost */
532 /* Generic costs for branch instructions. */
533 static const struct cpu_branch_cost generic_branch_cost =
535 1, /* Predictable. */
536 3 /* Unpredictable. */
539 /* Generic approximation modes. */
540 static const cpu_approx_modes generic_approx_modes =
542 AARCH64_APPROX_NONE, /* division */
543 AARCH64_APPROX_NONE, /* sqrt */
544 AARCH64_APPROX_NONE /* recip_sqrt */
547 /* Approximation modes for Exynos M1. */
548 static const cpu_approx_modes exynosm1_approx_modes =
550 AARCH64_APPROX_NONE, /* division */
551 AARCH64_APPROX_ALL, /* sqrt */
552 AARCH64_APPROX_ALL /* recip_sqrt */
555 /* Approximation modes for X-Gene 1. */
556 static const cpu_approx_modes xgene1_approx_modes =
558 AARCH64_APPROX_NONE, /* division */
559 AARCH64_APPROX_NONE, /* sqrt */
560 AARCH64_APPROX_ALL /* recip_sqrt */
563 /* Generic prefetch settings (which disable prefetch). */
564 static const cpu_prefetch_tune generic_prefetch_tune =
566 0, /* num_slots */
567 -1, /* l1_cache_size */
568 -1, /* l1_cache_line_size */
569 -1, /* l2_cache_size */
570 true, /* prefetch_dynamic_strides */
571 -1, /* minimum_stride */
572 -1 /* default_opt_level */
575 static const cpu_prefetch_tune exynosm1_prefetch_tune =
577 0, /* num_slots */
578 -1, /* l1_cache_size */
579 64, /* l1_cache_line_size */
580 -1, /* l2_cache_size */
581 true, /* prefetch_dynamic_strides */
582 -1, /* minimum_stride */
583 -1 /* default_opt_level */
586 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
588 4, /* num_slots */
589 32, /* l1_cache_size */
590 64, /* l1_cache_line_size */
591 512, /* l2_cache_size */
592 false, /* prefetch_dynamic_strides */
593 2048, /* minimum_stride */
594 3 /* default_opt_level */
597 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
599 8, /* num_slots */
600 32, /* l1_cache_size */
601 128, /* l1_cache_line_size */
602 16*1024, /* l2_cache_size */
603 true, /* prefetch_dynamic_strides */
604 -1, /* minimum_stride */
605 3 /* default_opt_level */
608 static const cpu_prefetch_tune thunderx_prefetch_tune =
610 8, /* num_slots */
611 32, /* l1_cache_size */
612 128, /* l1_cache_line_size */
613 -1, /* l2_cache_size */
614 true, /* prefetch_dynamic_strides */
615 -1, /* minimum_stride */
616 -1 /* default_opt_level */
619 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
621 8, /* num_slots */
622 32, /* l1_cache_size */
623 64, /* l1_cache_line_size */
624 256, /* l2_cache_size */
625 true, /* prefetch_dynamic_strides */
626 -1, /* minimum_stride */
627 -1 /* default_opt_level */
630 static const struct tune_params generic_tunings =
632 &cortexa57_extra_costs,
633 &generic_addrcost_table,
634 &generic_regmove_cost,
635 &generic_vector_cost,
636 &generic_branch_cost,
637 &generic_approx_modes,
638 4, /* memmov_cost */
639 2, /* issue_rate */
640 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
641 "8", /* function_align. */
642 "4", /* jump_align. */
643 "8", /* loop_align. */
644 2, /* int_reassoc_width. */
645 4, /* fp_reassoc_width. */
646 1, /* vec_reassoc_width. */
647 2, /* min_div_recip_mul_sf. */
648 2, /* min_div_recip_mul_df. */
649 0, /* max_case_values. */
650 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
651 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
652 &generic_prefetch_tune
655 static const struct tune_params cortexa35_tunings =
657 &cortexa53_extra_costs,
658 &generic_addrcost_table,
659 &cortexa53_regmove_cost,
660 &generic_vector_cost,
661 &generic_branch_cost,
662 &generic_approx_modes,
663 4, /* memmov_cost */
664 1, /* issue_rate */
665 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
666 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
667 "16", /* function_align. */
668 "4", /* jump_align. */
669 "8", /* loop_align. */
670 2, /* int_reassoc_width. */
671 4, /* fp_reassoc_width. */
672 1, /* vec_reassoc_width. */
673 2, /* min_div_recip_mul_sf. */
674 2, /* min_div_recip_mul_df. */
675 0, /* max_case_values. */
676 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
677 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
678 &generic_prefetch_tune
681 static const struct tune_params cortexa53_tunings =
683 &cortexa53_extra_costs,
684 &generic_addrcost_table,
685 &cortexa53_regmove_cost,
686 &generic_vector_cost,
687 &generic_branch_cost,
688 &generic_approx_modes,
689 4, /* memmov_cost */
690 2, /* issue_rate */
691 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
692 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
693 "16", /* function_align. */
694 "4", /* jump_align. */
695 "8", /* loop_align. */
696 2, /* int_reassoc_width. */
697 4, /* fp_reassoc_width. */
698 1, /* vec_reassoc_width. */
699 2, /* min_div_recip_mul_sf. */
700 2, /* min_div_recip_mul_df. */
701 0, /* max_case_values. */
702 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
703 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
704 &generic_prefetch_tune
707 static const struct tune_params cortexa57_tunings =
709 &cortexa57_extra_costs,
710 &generic_addrcost_table,
711 &cortexa57_regmove_cost,
712 &cortexa57_vector_cost,
713 &generic_branch_cost,
714 &generic_approx_modes,
715 4, /* memmov_cost */
716 3, /* issue_rate */
717 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
718 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
719 "16", /* function_align. */
720 "4", /* jump_align. */
721 "8", /* loop_align. */
722 2, /* int_reassoc_width. */
723 4, /* fp_reassoc_width. */
724 1, /* vec_reassoc_width. */
725 2, /* min_div_recip_mul_sf. */
726 2, /* min_div_recip_mul_df. */
727 0, /* max_case_values. */
728 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
729 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
730 &generic_prefetch_tune
733 static const struct tune_params cortexa72_tunings =
735 &cortexa57_extra_costs,
736 &generic_addrcost_table,
737 &cortexa57_regmove_cost,
738 &cortexa57_vector_cost,
739 &generic_branch_cost,
740 &generic_approx_modes,
741 4, /* memmov_cost */
742 3, /* issue_rate */
743 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
744 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
745 "16", /* function_align. */
746 "4", /* jump_align. */
747 "8", /* loop_align. */
748 2, /* int_reassoc_width. */
749 4, /* fp_reassoc_width. */
750 1, /* vec_reassoc_width. */
751 2, /* min_div_recip_mul_sf. */
752 2, /* min_div_recip_mul_df. */
753 0, /* max_case_values. */
754 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
755 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
756 &generic_prefetch_tune
759 static const struct tune_params cortexa73_tunings =
761 &cortexa57_extra_costs,
762 &generic_addrcost_table,
763 &cortexa57_regmove_cost,
764 &cortexa57_vector_cost,
765 &generic_branch_cost,
766 &generic_approx_modes,
767 4, /* memmov_cost. */
768 2, /* issue_rate. */
769 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
770 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
771 "16", /* function_align. */
772 "4", /* jump_align. */
773 "8", /* loop_align. */
774 2, /* int_reassoc_width. */
775 4, /* fp_reassoc_width. */
776 1, /* vec_reassoc_width. */
777 2, /* min_div_recip_mul_sf. */
778 2, /* min_div_recip_mul_df. */
779 0, /* max_case_values. */
780 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
781 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
782 &generic_prefetch_tune
787 static const struct tune_params exynosm1_tunings =
789 &exynosm1_extra_costs,
790 &exynosm1_addrcost_table,
791 &exynosm1_regmove_cost,
792 &exynosm1_vector_cost,
793 &generic_branch_cost,
794 &exynosm1_approx_modes,
795 4, /* memmov_cost */
796 3, /* issue_rate */
797 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
798 "4", /* function_align. */
799 "4", /* jump_align. */
800 "4", /* loop_align. */
801 2, /* int_reassoc_width. */
802 4, /* fp_reassoc_width. */
803 1, /* vec_reassoc_width. */
804 2, /* min_div_recip_mul_sf. */
805 2, /* min_div_recip_mul_df. */
806 48, /* max_case_values. */
807 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
808 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
809 &exynosm1_prefetch_tune
812 static const struct tune_params thunderxt88_tunings =
814 &thunderx_extra_costs,
815 &generic_addrcost_table,
816 &thunderx_regmove_cost,
817 &thunderx_vector_cost,
818 &generic_branch_cost,
819 &generic_approx_modes,
820 6, /* memmov_cost */
821 2, /* issue_rate */
822 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
823 "8", /* function_align. */
824 "8", /* jump_align. */
825 "8", /* loop_align. */
826 2, /* int_reassoc_width. */
827 4, /* fp_reassoc_width. */
828 1, /* vec_reassoc_width. */
829 2, /* min_div_recip_mul_sf. */
830 2, /* min_div_recip_mul_df. */
831 0, /* max_case_values. */
832 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
833 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
834 &thunderxt88_prefetch_tune
837 static const struct tune_params thunderx_tunings =
839 &thunderx_extra_costs,
840 &generic_addrcost_table,
841 &thunderx_regmove_cost,
842 &thunderx_vector_cost,
843 &generic_branch_cost,
844 &generic_approx_modes,
845 6, /* memmov_cost */
846 2, /* issue_rate */
847 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
848 "8", /* function_align. */
849 "8", /* jump_align. */
850 "8", /* loop_align. */
851 2, /* int_reassoc_width. */
852 4, /* fp_reassoc_width. */
853 1, /* vec_reassoc_width. */
854 2, /* min_div_recip_mul_sf. */
855 2, /* min_div_recip_mul_df. */
856 0, /* max_case_values. */
857 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
858 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
859 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
860 &thunderx_prefetch_tune
863 static const struct tune_params xgene1_tunings =
865 &xgene1_extra_costs,
866 &xgene1_addrcost_table,
867 &xgene1_regmove_cost,
868 &xgene1_vector_cost,
869 &generic_branch_cost,
870 &xgene1_approx_modes,
871 6, /* memmov_cost */
872 4, /* issue_rate */
873 AARCH64_FUSE_NOTHING, /* fusible_ops */
874 "16", /* function_align. */
875 "8", /* jump_align. */
876 "16", /* loop_align. */
877 2, /* int_reassoc_width. */
878 4, /* fp_reassoc_width. */
879 1, /* vec_reassoc_width. */
880 2, /* min_div_recip_mul_sf. */
881 2, /* min_div_recip_mul_df. */
882 0, /* max_case_values. */
883 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
884 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
885 &generic_prefetch_tune
888 static const struct tune_params qdf24xx_tunings =
890 &qdf24xx_extra_costs,
891 &qdf24xx_addrcost_table,
892 &qdf24xx_regmove_cost,
893 &generic_vector_cost,
894 &generic_branch_cost,
895 &generic_approx_modes,
896 4, /* memmov_cost */
897 4, /* issue_rate */
898 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
899	   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
900 "16", /* function_align. */
901 "8", /* jump_align. */
902 "16", /* loop_align. */
903 2, /* int_reassoc_width. */
904 4, /* fp_reassoc_width. */
905 1, /* vec_reassoc_width. */
906 2, /* min_div_recip_mul_sf. */
907 2, /* min_div_recip_mul_df. */
908 0, /* max_case_values. */
909 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
910 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
911 &qdf24xx_prefetch_tune
914 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
915 for now. */
916 static const struct tune_params saphira_tunings =
918 &generic_extra_costs,
919 &generic_addrcost_table,
920 &generic_regmove_cost,
921 &generic_vector_cost,
922 &generic_branch_cost,
923 &generic_approx_modes,
924 4, /* memmov_cost */
925 4, /* issue_rate */
926 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
927	   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
928 "16", /* function_align. */
929 "8", /* jump_align. */
930 "16", /* loop_align. */
931 2, /* int_reassoc_width. */
932 4, /* fp_reassoc_width. */
933 1, /* vec_reassoc_width. */
934 2, /* min_div_recip_mul_sf. */
935 2, /* min_div_recip_mul_df. */
936 0, /* max_case_values. */
937 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
938 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
939 &generic_prefetch_tune
942 static const struct tune_params thunderx2t99_tunings =
944 &thunderx2t99_extra_costs,
945 &thunderx2t99_addrcost_table,
946 &thunderx2t99_regmove_cost,
947 &thunderx2t99_vector_cost,
948 &generic_branch_cost,
949 &generic_approx_modes,
950 4, /* memmov_cost. */
951 4, /* issue_rate. */
952 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
953 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
954 "16", /* function_align. */
955 "8", /* jump_align. */
956 "16", /* loop_align. */
957 3, /* int_reassoc_width. */
958 2, /* fp_reassoc_width. */
959 2, /* vec_reassoc_width. */
960 2, /* min_div_recip_mul_sf. */
961 2, /* min_div_recip_mul_df. */
962 0, /* max_case_values. */
963 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
964 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
965 &thunderx2t99_prefetch_tune
968 /* Support for fine-grained override of the tuning structures. */
969 struct aarch64_tuning_override_function
971 const char* name;
972 void (*parse_override)(const char*, struct tune_params*);
975 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
976 static void aarch64_parse_tune_string (const char*, struct tune_params*);
978 static const struct aarch64_tuning_override_function
979 aarch64_tuning_override_functions[] =
981 { "fuse", aarch64_parse_fuse_string },
982 { "tune", aarch64_parse_tune_string },
983 { NULL, NULL }
986 /* A processor implementing AArch64. */
987 struct processor
989 const char *const name;
990 enum aarch64_processor ident;
991 enum aarch64_processor sched_core;
992 enum aarch64_arch arch;
993 unsigned architecture_version;
994 const unsigned long flags;
995 const struct tune_params *const tune;
998 /* Architectures implementing AArch64. */
999 static const struct processor all_architectures[] =
1001 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1002 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1003 #include "aarch64-arches.def"
1004 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1007 /* Processor cores implementing AArch64. */
1008 static const struct processor all_cores[] =
1010 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1011 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1012 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1013 FLAGS, &COSTS##_tunings},
1014 #include "aarch64-cores.def"
1015 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1016 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1017 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1021 /* Target specification. These are populated by the -march, -mtune, -mcpu
1022 handling code or by target attributes. */
1023 static const struct processor *selected_arch;
1024 static const struct processor *selected_cpu;
1025 static const struct processor *selected_tune;
1027 /* The current tuning set. */
1028 struct tune_params aarch64_tune_params = generic_tunings;
1030 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1032 /* An ISA extension in the co-processor and main instruction set space. */
1033 struct aarch64_option_extension
1035 const char *const name;
1036 const unsigned long flags_on;
1037 const unsigned long flags_off;
1040 typedef enum aarch64_cond_code
1042 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1043 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1044 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1046 aarch64_cc;
1048 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1050 /* The condition codes of the processor, and the inverse function. */
1051 static const char * const aarch64_condition_codes[] =
1053 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1054 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1057 /* Generate code to enable conditional branches in functions over 1 MiB. */
1058 const char *
1059 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1060 const char * branch_format)
1062 rtx_code_label * tmp_label = gen_label_rtx ();
1063 char label_buf[256];
1064 char buffer[128];
1065 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1066 CODE_LABEL_NUMBER (tmp_label));
1067 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1068 rtx dest_label = operands[pos_label];
1069 operands[pos_label] = tmp_label;
1071 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1072 output_asm_insn (buffer, operands);
1074 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1075 operands[pos_label] = dest_label;
1076 output_asm_insn (buffer, operands);
1077 return "";
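/* For illustration, a sketch of the sequence emitted by the code above
   (the mnemonic, operand and label are made up; in practice they come from
   the caller-supplied BRANCH_FORMAT, OPERANDS and DEST):

	cbnz	x0, .Ltmp		// short-range branch to a fresh local label
	b	<far destination>	// unconditional B, range +/-128 MiB
   .Ltmp:

   For this to behave like the original conditional branch, the caller is
   expected to pass the inverted condition in BRANCH_FORMAT, so that the
   unconditional B performs the actual far jump.  */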
1080 void
1081 aarch64_err_no_fpadvsimd (machine_mode mode)
1083 if (TARGET_GENERAL_REGS_ONLY)
1084 if (FLOAT_MODE_P (mode))
1085 error ("%qs is incompatible with the use of floating-point types",
1086 "-mgeneral-regs-only");
1087 else
1088 error ("%qs is incompatible with the use of vector types",
1089 "-mgeneral-regs-only");
1090 else
1091 if (FLOAT_MODE_P (mode))
1092 error ("%qs feature modifier is incompatible with the use of"
1093 " floating-point types", "+nofp");
1094 else
1095 error ("%qs feature modifier is incompatible with the use of"
1096 " vector types", "+nofp");
1099 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1100 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1101 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1102 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1103 and GENERAL_REGS is lower than the memory cost (in this case the best class
1104	   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1105 cost results in bad allocations with many redundant int<->FP moves which
1106 are expensive on various cores.
1107 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1108 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1109 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1110 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1111 The result of this is that it is no longer inefficient to have a higher
1112 memory move cost than the register move cost.
1115 static reg_class_t
1116 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1117 reg_class_t best_class)
1119 machine_mode mode;
1121 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1122 || !reg_class_subset_p (FP_REGS, allocno_class))
1123 return allocno_class;
1125 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1126 || !reg_class_subset_p (FP_REGS, best_class))
1127 return best_class;
1129 mode = PSEUDO_REGNO_MODE (regno);
1130 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1133 static unsigned int
1134 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1136 if (GET_MODE_UNIT_SIZE (mode) == 4)
1137 return aarch64_tune_params.min_div_recip_mul_sf;
1138 return aarch64_tune_params.min_div_recip_mul_df;
1141 /* Return the reassociation width of treeop OPC with mode MODE. */
1142 static int
1143 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1145 if (VECTOR_MODE_P (mode))
1146 return aarch64_tune_params.vec_reassoc_width;
1147 if (INTEGRAL_MODE_P (mode))
1148 return aarch64_tune_params.int_reassoc_width;
1149 /* Avoid reassociating floating point addition so we emit more FMAs. */
1150 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1151 return aarch64_tune_params.fp_reassoc_width;
1152 return 1;
1155 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1156 unsigned
1157 aarch64_dbx_register_number (unsigned regno)
1159 if (GP_REGNUM_P (regno))
1160 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1161 else if (regno == SP_REGNUM)
1162 return AARCH64_DWARF_SP;
1163 else if (FP_REGNUM_P (regno))
1164 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1165 else if (PR_REGNUM_P (regno))
1166 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1167 else if (regno == VG_REGNUM)
1168 return AARCH64_DWARF_VG;
1170 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1171 equivalent DWARF register. */
1172 return DWARF_FRAME_REGISTERS;
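/* For example (a sketch): x3 maps to AARCH64_DWARF_R0 + 3, v5 to
   AARCH64_DWARF_V0 + 5 and p2 to AARCH64_DWARF_P0 + 2, while a register
   with no DWARF equivalent is reported as DWARF_FRAME_REGISTERS.  */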
1175 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1176 static bool
1177 aarch64_advsimd_struct_mode_p (machine_mode mode)
1179 return (TARGET_SIMD
1180 && (mode == OImode || mode == CImode || mode == XImode));
1183 /* Return true if MODE is an SVE predicate mode. */
1184 static bool
1185 aarch64_sve_pred_mode_p (machine_mode mode)
1187 return (TARGET_SVE
1188 && (mode == VNx16BImode
1189 || mode == VNx8BImode
1190 || mode == VNx4BImode
1191 || mode == VNx2BImode));
1194 /* Three mutually-exclusive flags describing a vector or predicate type. */
1195 const unsigned int VEC_ADVSIMD = 1;
1196 const unsigned int VEC_SVE_DATA = 2;
1197 const unsigned int VEC_SVE_PRED = 4;
1198 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1199 a structure of 2, 3 or 4 vectors. */
1200 const unsigned int VEC_STRUCT = 8;
1201 /* Useful combinations of the above. */
1202 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1203 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1205 /* Return a set of flags describing the vector properties of mode MODE.
1206 Ignore modes that are not supported by the current target. */
1207 static unsigned int
1208 aarch64_classify_vector_mode (machine_mode mode)
1210 if (aarch64_advsimd_struct_mode_p (mode))
1211 return VEC_ADVSIMD | VEC_STRUCT;
1213 if (aarch64_sve_pred_mode_p (mode))
1214 return VEC_SVE_PRED;
1216 scalar_mode inner = GET_MODE_INNER (mode);
1217 if (VECTOR_MODE_P (mode)
1218 && (inner == QImode
1219 || inner == HImode
1220 || inner == HFmode
1221 || inner == SImode
1222 || inner == SFmode
1223 || inner == DImode
1224 || inner == DFmode))
1226 if (TARGET_SVE)
1228 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1229 return VEC_SVE_DATA;
1230 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1231 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1232 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1233 return VEC_SVE_DATA | VEC_STRUCT;
1236 /* This includes V1DF but not V1DI (which doesn't exist). */
1237 if (TARGET_SIMD
1238 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1239 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1240 return VEC_ADVSIMD;
1243 return 0;
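/* A few worked examples of the classification above (a sketch; the exact
   results also depend on TARGET_SIMD/TARGET_SVE and, for fixed-length SVE,
   on the selected vector length):
     OImode, CImode, XImode  -> VEC_ADVSIMD | VEC_STRUCT
     VNx4BImode              -> VEC_SVE_PRED
     VNx4SImode              -> VEC_SVE_DATA
     V2DImode, V4SImode      -> VEC_ADVSIMD (64-bit and 128-bit Advanced SIMD)
     scalar modes            -> 0  */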
1246 /* Return true if MODE is any of the data vector modes, including
1247 structure modes. */
1248 static bool
1249 aarch64_vector_data_mode_p (machine_mode mode)
1251 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1254 /* Return true if MODE is an SVE data vector mode; either a single vector
1255 or a structure of vectors. */
1256 static bool
1257 aarch64_sve_data_mode_p (machine_mode mode)
1259 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1262 /* Implement target hook TARGET_ARRAY_MODE. */
1263 static opt_machine_mode
1264 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1266 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1267 && IN_RANGE (nelems, 2, 4))
1268 return mode_for_vector (GET_MODE_INNER (mode),
1269 GET_MODE_NUNITS (mode) * nelems);
1271 return opt_machine_mode ();
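/* For example (a sketch): a 3-element array of VNx4SImode, an SVE data
   vector of SImode elements, maps to the vector mode with
   3 * GET_MODE_NUNITS (VNx4SImode) SImode units, i.e. the SVE 3-vector
   tuple mode; other modes and element counts outside 2..4 return an empty
   opt_machine_mode and fall through to the generic handling.  */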
1274 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1275 static bool
1276 aarch64_array_mode_supported_p (machine_mode mode,
1277 unsigned HOST_WIDE_INT nelems)
1279 if (TARGET_SIMD
1280 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1281 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1282 && (nelems >= 2 && nelems <= 4))
1283 return true;
1285 return false;
1288 /* Return the SVE predicate mode to use for elements that have
1289 ELEM_NBYTES bytes, if such a mode exists. */
1291 opt_machine_mode
1292 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1294 if (TARGET_SVE)
1296 if (elem_nbytes == 1)
1297 return VNx16BImode;
1298 if (elem_nbytes == 2)
1299 return VNx8BImode;
1300 if (elem_nbytes == 4)
1301 return VNx4BImode;
1302 if (elem_nbytes == 8)
1303 return VNx2BImode;
1305 return opt_machine_mode ();
1308 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1310 static opt_machine_mode
1311 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1313 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1315 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1316 machine_mode pred_mode;
1317 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1318 return pred_mode;
1321 return default_get_mask_mode (nunits, nbytes);
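/* For example (a sketch, assuming TARGET_SVE): for a fully-packed SVE
   vector of 32-bit elements, NBYTES equals BYTES_PER_SVE_VECTOR and the
   element size works out to 4 bytes, so the mask mode is VNx4BImode;
   Advanced SIMD vectors fall through to default_get_mask_mode.  */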
1324 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1325 prefer to use the first arithmetic operand as the else value if
1326 the else value doesn't matter, since that exactly matches the SVE
1327 destructive merging form. For ternary operations we could either
1328 pick the first operand and use FMAD-like instructions or the last
1329 operand and use FMLA-like instructions; the latter seems more
1330 natural. */
1332 static tree
1333 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1335 return nops == 3 ? ops[2] : ops[0];
1338 /* Implement TARGET_HARD_REGNO_NREGS. */
1340 static unsigned int
1341 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1343 /* ??? Logically we should only need to provide a value when
1344 HARD_REGNO_MODE_OK says that the combination is valid,
1345 but at the moment we need to handle all modes. Just ignore
1346 any runtime parts for registers that can't store them. */
1347 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1348 switch (aarch64_regno_regclass (regno))
1350 case FP_REGS:
1351 case FP_LO_REGS:
1352 if (aarch64_sve_data_mode_p (mode))
1353 return exact_div (GET_MODE_SIZE (mode),
1354 BYTES_PER_SVE_VECTOR).to_constant ();
1355 return CEIL (lowest_size, UNITS_PER_VREG);
1356 case PR_REGS:
1357 case PR_LO_REGS:
1358 case PR_HI_REGS:
1359 return 1;
1360 default:
1361 return CEIL (lowest_size, UNITS_PER_WORD);
1363 gcc_unreachable ();
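/* For example (a sketch, assuming the usual 8-byte UNITS_PER_WORD and
   16-byte UNITS_PER_VREG): TImode needs CEIL (16, 8) == 2 general
   registers but only CEIL (16, 16) == 1 FP register; a single SVE data
   vector always occupies exactly one FP register, whatever its (possibly
   variable) size; and any predicate mode occupies one predicate register.  */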
1366 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1368 static bool
1369 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1371 if (GET_MODE_CLASS (mode) == MODE_CC)
1372 return regno == CC_REGNUM;
1374 if (regno == VG_REGNUM)
1375 /* This must have the same size as _Unwind_Word. */
1376 return mode == DImode;
1378 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1379 if (vec_flags & VEC_SVE_PRED)
1380 return PR_REGNUM_P (regno);
1382 if (PR_REGNUM_P (regno))
1383 return 0;
1385 if (regno == SP_REGNUM)
1386 /* The purpose of comparing with ptr_mode is to support the
1387 global register variable associated with the stack pointer
1388 register via the syntax of asm ("wsp") in ILP32. */
1389 return mode == Pmode || mode == ptr_mode;
1391 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1392 return mode == Pmode;
1394 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1395 return true;
1397 if (FP_REGNUM_P (regno))
1399 if (vec_flags & VEC_STRUCT)
1400 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1401 else
1402 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1405 return false;
1408 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1409 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1410 clobbers the top 64 bits when restoring the bottom 64 bits. */
1412 static bool
1413 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1415 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
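/* For example (a sketch): a V4SImode (16-byte) value that is live in v8
   across a call is treated as partially clobbered, because the callee
   only preserves the low 64 bits of v8-v15, whereas an 8-byte DFmode
   value in the same register is not.  */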
1418 /* Implement REGMODE_NATURAL_SIZE. */
1419 poly_uint64
1420 aarch64_regmode_natural_size (machine_mode mode)
1422 /* The natural size for SVE data modes is one SVE data vector,
1423 and similarly for predicates. We can't independently modify
1424 anything smaller than that. */
1425 /* ??? For now, only do this for variable-width SVE registers.
1426 Doing it for constant-sized registers breaks lower-subreg.c. */
1427 /* ??? And once that's fixed, we should probably have similar
1428 code for Advanced SIMD. */
1429 if (!aarch64_sve_vg.is_constant ())
1431 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1432 if (vec_flags & VEC_SVE_PRED)
1433 return BYTES_PER_SVE_PRED;
1434 if (vec_flags & VEC_SVE_DATA)
1435 return BYTES_PER_SVE_VECTOR;
1437 return UNITS_PER_WORD;
1440 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1441 machine_mode
1442 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1443 machine_mode mode)
1445 /* The predicate mode determines which bits are significant and
1446 which are "don't care". Decreasing the number of lanes would
1447 lose data while increasing the number of lanes would make bits
1448 unnecessarily significant. */
1449 if (PR_REGNUM_P (regno))
1450 return mode;
1451 if (known_ge (GET_MODE_SIZE (mode), 4))
1452 return mode;
1453 else
1454 return SImode;
1457 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1458 that strcpy from constants will be faster. */
1460 static HOST_WIDE_INT
1461 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1463 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1464 return MAX (align, BITS_PER_WORD);
1465 return align;
1468 /* Return true if calls to DECL should be treated as
1469	   long-calls (i.e. called via a register).  */
1470 static bool
1471 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1473 return false;
1476 /* Return true if calls to symbol-ref SYM should be treated as
1477	   long-calls (i.e. called via a register).  */
1478 bool
1479 aarch64_is_long_call_p (rtx sym)
1481 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1484 /* Return true if calls to symbol-ref SYM should not go through
1485 plt stubs. */
1487 bool
1488 aarch64_is_noplt_call_p (rtx sym)
1490 const_tree decl = SYMBOL_REF_DECL (sym);
1492 if (flag_pic
1493 && decl
1494 && (!flag_plt
1495 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1496 && !targetm.binds_local_p (decl))
1497 return true;
1499 return false;
1502 /* Return true if the offsets to a zero/sign-extract operation
1503 represent an expression that matches an extend operation. The
1504	   operands represent the parameters from
1506 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1507 bool
1508 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1509 rtx extract_imm)
1511 HOST_WIDE_INT mult_val, extract_val;
1513 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1514 return false;
1516 mult_val = INTVAL (mult_imm);
1517 extract_val = INTVAL (extract_imm);
1519 if (extract_val > 8
1520 && extract_val < GET_MODE_BITSIZE (mode)
1521 && exact_log2 (extract_val & ~7) > 0
1522 && (extract_val & 7) <= 4
1523 && mult_val == (1 << (extract_val & 7)))
1524 return true;
1526 return false;
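/* A worked example of the check above (a sketch): for DImode with
   EXTRACT_IMM == 34 and MULT_IMM == 4, we have 34 & ~7 == 32 (a power of
   two), 34 & 7 == 2 <= 4 and 4 == 1 << 2, so the function returns true.
   This corresponds to a 32-bit value extended to DImode and shifted left
   by two, matching the extend-and-shift form of extended-register
   operands.  */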
1529 /* Emit an insn that's a simple single-set. Both the operands must be
1530 known to be valid. */
1531 inline static rtx_insn *
1532 emit_set_insn (rtx x, rtx y)
1534 return emit_insn (gen_rtx_SET (x, y));
1537 /* X and Y are two things to compare using CODE. Emit the compare insn and
1538 return the rtx for register 0 in the proper mode. */
1540 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1542 machine_mode mode = SELECT_CC_MODE (code, x, y);
1543 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1545 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1546 return cc_reg;
1549 /* Build the SYMBOL_REF for __tls_get_addr. */
1551 static GTY(()) rtx tls_get_addr_libfunc;
1554 aarch64_tls_get_addr (void)
1556 if (!tls_get_addr_libfunc)
1557 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1558 return tls_get_addr_libfunc;
1561 /* Return the TLS model to use for ADDR. */
1563 static enum tls_model
1564 tls_symbolic_operand_type (rtx addr)
1566 enum tls_model tls_kind = TLS_MODEL_NONE;
1567 if (GET_CODE (addr) == CONST)
1569 poly_int64 addend;
1570 rtx sym = strip_offset (addr, &addend);
1571 if (GET_CODE (sym) == SYMBOL_REF)
1572 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1574 else if (GET_CODE (addr) == SYMBOL_REF)
1575 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1577 return tls_kind;
1580	/* We allow LO_SUMs among our legitimate addresses so that combine
1581	   can take care of combining addresses where necessary, but for
1582	   generation purposes we generate the address
1583	   as:
1584 RTL Absolute
1585 tmp = hi (symbol_ref); adrp x1, foo
1586 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1589 PIC TLS
1590 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1591 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1592 bl __tls_get_addr
1595 Load TLS symbol, depending on TLS mechanism and TLS access model.
1597 Global Dynamic - Traditional TLS:
1598 adrp tmp, :tlsgd:imm
1599 add dest, tmp, #:tlsgd_lo12:imm
1600 bl __tls_get_addr
1602 Global Dynamic - TLS Descriptors:
1603 adrp dest, :tlsdesc:imm
1604 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1605 add dest, dest, #:tlsdesc_lo12:imm
1606 blr tmp
1607 mrs tp, tpidr_el0
1608 add dest, dest, tp
1610 Initial Exec:
1611 mrs tp, tpidr_el0
1612 adrp tmp, :gottprel:imm
1613 ldr dest, [tmp, #:gottprel_lo12:imm]
1614 add dest, dest, tp
1616 Local Exec:
1617 mrs tp, tpidr_el0
1618 add t0, tp, #:tprel_hi12:imm, lsl #12
1619 add t0, t0, #:tprel_lo12_nc:imm
1622 static void
1623 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1624 enum aarch64_symbol_type type)
1626 switch (type)
1628 case SYMBOL_SMALL_ABSOLUTE:
1630 /* In ILP32, the mode of dest can be either SImode or DImode. */
1631 rtx tmp_reg = dest;
1632 machine_mode mode = GET_MODE (dest);
1634 gcc_assert (mode == Pmode || mode == ptr_mode);
1636 if (can_create_pseudo_p ())
1637 tmp_reg = gen_reg_rtx (mode);
1639 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1640 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1641 return;
1644 case SYMBOL_TINY_ABSOLUTE:
1645 emit_insn (gen_rtx_SET (dest, imm));
1646 return;
1648 case SYMBOL_SMALL_GOT_28K:
1650 machine_mode mode = GET_MODE (dest);
1651 rtx gp_rtx = pic_offset_table_rtx;
1652 rtx insn;
1653 rtx mem;
1655 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1656	   here before RTL expansion.  Tree IVOPTS generates RTL patterns to
1657	   compute rtx costs, in which case pic_offset_table_rtx is not
1658	   initialized.  In that case there is no need to generate the first adrp
1659	   instruction, as the final cost for a global variable access is
1660	   one instruction.  */
1661 if (gp_rtx != NULL)
1663	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1664	       use the page base as the GOT base, the first page may be wasted;
1665	       in the worst case there is only 28K of space for the GOT).
1667	       The generated instruction sequence for accessing a global variable is:
1670	       ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1672	       Only one instruction is needed.  But we must initialize
1673	       pic_offset_table_rtx properly.  We generate an initialization insn for
1674	       every global access, and allow CSE to remove all redundant ones.
1676	       The final instruction sequence will look like the following
1677	       for multiple global variable accesses.
1679 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1681 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1682 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1683 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1684 ... */
1686 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1687 crtl->uses_pic_offset_table = 1;
1688 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1690 if (mode != GET_MODE (gp_rtx))
1691 gp_rtx = gen_lowpart (mode, gp_rtx);
1695 if (mode == ptr_mode)
1697 if (mode == DImode)
1698 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1699 else
1700 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1702 mem = XVECEXP (SET_SRC (insn), 0, 0);
1704 else
1706 gcc_assert (mode == Pmode);
1708 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1709 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1712	  /* The operand is expected to be a MEM.  Whenever the related insn
1713	     pattern changes, the above code which calculates MEM should be
1714	     updated.  */
1715 gcc_assert (GET_CODE (mem) == MEM);
1716 MEM_READONLY_P (mem) = 1;
1717 MEM_NOTRAP_P (mem) = 1;
1718 emit_insn (insn);
1719 return;
1722 case SYMBOL_SMALL_GOT_4G:
1724 /* In ILP32, the mode of dest can be either SImode or DImode,
1725 while the got entry is always of SImode size. The mode of
1726 dest depends on how dest is used: if dest is assigned to a
1727	   pointer (e.g. in memory), it has SImode; it may have
1728	   DImode if dest is dereferenced to access the memory.
1729 This is why we have to handle three different ldr_got_small
1730 patterns here (two patterns for ILP32). */
1732 rtx insn;
1733 rtx mem;
1734 rtx tmp_reg = dest;
1735 machine_mode mode = GET_MODE (dest);
1737 if (can_create_pseudo_p ())
1738 tmp_reg = gen_reg_rtx (mode);
1740 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1741 if (mode == ptr_mode)
1743 if (mode == DImode)
1744 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1745 else
1746 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1748 mem = XVECEXP (SET_SRC (insn), 0, 0);
1750 else
1752 gcc_assert (mode == Pmode);
1754 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1755 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1758 gcc_assert (GET_CODE (mem) == MEM);
1759 MEM_READONLY_P (mem) = 1;
1760 MEM_NOTRAP_P (mem) = 1;
1761 emit_insn (insn);
1762 return;
1765 case SYMBOL_SMALL_TLSGD:
1767 rtx_insn *insns;
1768 machine_mode mode = GET_MODE (dest);
1769 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1771 start_sequence ();
1772 if (TARGET_ILP32)
1773 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1774 else
1775 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1776 insns = get_insns ();
1777 end_sequence ();
1779 RTL_CONST_CALL_P (insns) = 1;
1780 emit_libcall_block (insns, dest, result, imm);
1781 return;
1784 case SYMBOL_SMALL_TLSDESC:
1786 machine_mode mode = GET_MODE (dest);
1787 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1788 rtx tp;
1790 gcc_assert (mode == Pmode || mode == ptr_mode);
1792 /* In ILP32, the got entry is always of SImode size. Unlike
1793 small GOT, the dest is fixed at reg 0. */
1794 if (TARGET_ILP32)
1795 emit_insn (gen_tlsdesc_small_si (imm));
1796 else
1797 emit_insn (gen_tlsdesc_small_di (imm));
1798 tp = aarch64_load_tp (NULL);
1800 if (mode != Pmode)
1801 tp = gen_lowpart (mode, tp);
1803 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1804 if (REG_P (dest))
1805 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1806 return;
1809 case SYMBOL_SMALL_TLSIE:
1811 /* In ILP32, the mode of dest can be either SImode or DImode,
1812 while the got entry is always of SImode size. The mode of
1813 dest depends on how dest is used: if dest is assigned to a
1814	   pointer (e.g. in memory), it has SImode; it may have
1815	   DImode if dest is dereferenced to access the memory.
1816 This is why we have to handle three different tlsie_small
1817 patterns here (two patterns for ILP32). */
1818 machine_mode mode = GET_MODE (dest);
1819 rtx tmp_reg = gen_reg_rtx (mode);
1820 rtx tp = aarch64_load_tp (NULL);
1822 if (mode == ptr_mode)
1824 if (mode == DImode)
1825 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1826 else
1828 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1829 tp = gen_lowpart (mode, tp);
1832 else
1834 gcc_assert (mode == Pmode);
1835 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1838 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1839 if (REG_P (dest))
1840 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1841 return;
1844 case SYMBOL_TLSLE12:
1845 case SYMBOL_TLSLE24:
1846 case SYMBOL_TLSLE32:
1847 case SYMBOL_TLSLE48:
1849 machine_mode mode = GET_MODE (dest);
1850 rtx tp = aarch64_load_tp (NULL);
1852 if (mode != Pmode)
1853 tp = gen_lowpart (mode, tp);
1855 switch (type)
1857 case SYMBOL_TLSLE12:
1858 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1859 (dest, tp, imm));
1860 break;
1861 case SYMBOL_TLSLE24:
1862 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1863 (dest, tp, imm));
1864 break;
1865 case SYMBOL_TLSLE32:
1866 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1867 (dest, imm));
1868 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1869 (dest, dest, tp));
1870 break;
1871 case SYMBOL_TLSLE48:
1872 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1873 (dest, imm));
1874 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1875 (dest, dest, tp));
1876 break;
1877 default:
1878 gcc_unreachable ();
1881 if (REG_P (dest))
1882 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1883 return;
1886 case SYMBOL_TINY_GOT:
1887 emit_insn (gen_ldr_got_tiny (dest, imm));
1888 return;
1890 case SYMBOL_TINY_TLSIE:
1892 machine_mode mode = GET_MODE (dest);
1893 rtx tp = aarch64_load_tp (NULL);
1895 if (mode == ptr_mode)
1897 if (mode == DImode)
1898 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1899 else
1901 tp = gen_lowpart (mode, tp);
1902 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1905 else
1907 gcc_assert (mode == Pmode);
1908 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1911 if (REG_P (dest))
1912 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1913 return;
1916 default:
1917 gcc_unreachable ();
1921 /* Emit a move from SRC to DEST. Assume that the move expanders can
1922 handle all moves if !can_create_pseudo_p (). The distinction is
1923 important because, unlike emit_move_insn, the move expanders know
1924 how to force Pmode objects into the constant pool even when the
1925 constant pool address is not itself legitimate. */
1926 static rtx
1927 aarch64_emit_move (rtx dest, rtx src)
1929 return (can_create_pseudo_p ()
1930 ? emit_move_insn (dest, src)
1931 : emit_move_insn_1 (dest, src));
1934 /* Apply UNOPTAB to OP and store the result in DEST. */
1936 static void
1937 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1939 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1940 if (dest != tmp)
1941 emit_move_insn (dest, tmp);
1944 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1946 static void
1947 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1949 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1950 OPTAB_DIRECT);
1951 if (dest != tmp)
1952 emit_move_insn (dest, tmp);
1955 /* Split a 128-bit move operation into two 64-bit move operations,
1956 taking care to handle partial overlap of register to register
1957 copies. Special cases are needed when moving between GP regs and
1958 FP regs. SRC can be a register, constant or memory; DST a register
1959 or memory. If either operand is memory it must not have any side
1960 effects. */
1961 void
1962 aarch64_split_128bit_move (rtx dst, rtx src)
1964 rtx dst_lo, dst_hi;
1965 rtx src_lo, src_hi;
1967 machine_mode mode = GET_MODE (dst);
1969 gcc_assert (mode == TImode || mode == TFmode);
1970 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1971 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1973 if (REG_P (dst) && REG_P (src))
1975 int src_regno = REGNO (src);
1976 int dst_regno = REGNO (dst);
1978 /* Handle FP <-> GP regs. */
1979 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1981 src_lo = gen_lowpart (word_mode, src);
1982 src_hi = gen_highpart (word_mode, src);
1984 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
1985 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
1986 return;
1988 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1990 dst_lo = gen_lowpart (word_mode, dst);
1991 dst_hi = gen_highpart (word_mode, dst);
1993 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
1994 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
1995 return;
1999 dst_lo = gen_lowpart (word_mode, dst);
2000 dst_hi = gen_highpart (word_mode, dst);
2001 src_lo = gen_lowpart (word_mode, src);
2002 src_hi = gen_highpart_mode (word_mode, mode, src);
2004 /* At most one pairing may overlap. */
2005 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2007 aarch64_emit_move (dst_hi, src_hi);
2008 aarch64_emit_move (dst_lo, src_lo);
2010 else
2012 aarch64_emit_move (dst_lo, src_lo);
2013 aarch64_emit_move (dst_hi, src_hi);
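/* Illustrative example (added for exposition, not in the upstream
   source; register choices are arbitrary): for a GP-to-GP TImode copy
   such as (x1,x2) <- (x0,x1), dst_lo (x1) overlaps src_hi (x1), so the
   high halves are copied first:

       mov  x2, x1   // dst_hi <- src_hi
       mov  x1, x0   // dst_lo <- src_lo

   Copying the low halves first would clobber x1 before it is read.  */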
2017 bool
2018 aarch64_split_128bit_move_p (rtx dst, rtx src)
2020 return (! REG_P (src)
2021 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2024 /* Split a complex SIMD combine. */
2026 void
2027 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2029 machine_mode src_mode = GET_MODE (src1);
2030 machine_mode dst_mode = GET_MODE (dst);
2032 gcc_assert (VECTOR_MODE_P (dst_mode));
2033 gcc_assert (register_operand (dst, dst_mode)
2034 && register_operand (src1, src_mode)
2035 && register_operand (src2, src_mode));
2037 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2038 return;
2041 /* Split a complex SIMD move. */
2043 void
2044 aarch64_split_simd_move (rtx dst, rtx src)
2046 machine_mode src_mode = GET_MODE (src);
2047 machine_mode dst_mode = GET_MODE (dst);
2049 gcc_assert (VECTOR_MODE_P (dst_mode));
2051 if (REG_P (dst) && REG_P (src))
2053 gcc_assert (VECTOR_MODE_P (src_mode));
2054 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2058 bool
2059 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2060 machine_mode ymode, rtx y)
2062 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2063 gcc_assert (r != NULL);
2064 return rtx_equal_p (x, r);
2068 static rtx
2069 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2071 if (can_create_pseudo_p ())
2072 return force_reg (mode, value);
2073 else
2075 gcc_assert (x);
2076 aarch64_emit_move (x, value);
2077 return x;
2081 /* Return true if we can move VALUE into a register using a single
2082 CNT[BHWD] instruction. */
2084 static bool
2085 aarch64_sve_cnt_immediate_p (poly_int64 value)
2087 HOST_WIDE_INT factor = value.coeffs[0];
2088 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2089 return (value.coeffs[1] == factor
2090 && IN_RANGE (factor, 2, 16 * 16)
2091 && (factor & 1) == 0
2092 && factor <= 16 * (factor & -factor));
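/* Worked example (added for exposition, not in the upstream source):
   a poly_int64 of (6, 6) represents the runtime value 6 * VQ, where
   VQ is the number of 128-bit quadwords in an SVE vector.  FACTOR is
   6: it is even, lies in [2, 256] and satisfies 6 <= 16 * (6 & -6)
   = 32, so it can be loaded with a single "cntd ..., all, mul #3".
   A factor of 34 fails the last test (34 > 16 * 2) and needs a
   longer sequence.  */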
2095 /* Likewise for rtx X. */
2097 bool
2098 aarch64_sve_cnt_immediate_p (rtx x)
2100 poly_int64 value;
2101 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2104 /* Return the asm string for an instruction with a CNT-like vector size
2105 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2106 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2107 first part of the operands template (the part that comes before the
2108 vector size itself). FACTOR is the number of quadwords.
2109 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2110 If it is zero, we can use any element size. */
2112 static char *
2113 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2114 unsigned int factor,
2115 unsigned int nelts_per_vq)
2117 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2119 if (nelts_per_vq == 0)
2120 /* There is some overlap in the ranges of the four CNT instructions.
2121 Here we always use the smallest possible element size, so that the
2122 multiplier is 1 wherever possible. */
2123 nelts_per_vq = factor & -factor;
2124 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2125 gcc_assert (IN_RANGE (shift, 1, 4));
2126 char suffix = "dwhb"[shift - 1];
2128 factor >>= shift;
2129 unsigned int written;
2130 if (factor == 1)
2131 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2132 prefix, suffix, operands);
2133 else
2134 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2135 prefix, suffix, operands, factor);
2136 gcc_assert (written < sizeof (buffer));
2137 return buffer;
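/* Illustrative examples of the returned templates (added for
   exposition, not in the upstream source; the prefixes and operand
   strings are just sample arguments):

     aarch64_output_sve_cnt_immediate ("cnt", "%x0", 2, 0)
       -> "cntd\t%x0"                (factor 2, doubleword elements)
     aarch64_output_sve_cnt_immediate ("inc", "%x0", 6, 0)
       -> "incd\t%x0, all, mul #3"   (shift 1, factor 6 >> 1 == 3)
     aarch64_output_sve_cnt_immediate ("inc", "%x0", 32, 0)
       -> "incb\t%x0, all, mul #2"   (shift 4, factor 32 >> 4 == 2)  */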
2140 /* Return the asm string for an instruction with a CNT-like vector size
2141 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2142 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2143 first part of the operands template (the part that comes before the
2144 vector size itself). X is the value of the vector size operand,
2145 as a polynomial integer rtx. */
2147 char *
2148 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2149 rtx x)
2151 poly_int64 value = rtx_to_poly_int64 (x);
2152 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2153 return aarch64_output_sve_cnt_immediate (prefix, operands,
2154 value.coeffs[1], 0);
2157 /* Return true if we can add VALUE to a register using a single ADDVL
2158 or ADDPL instruction. */
2160 static bool
2161 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2163 HOST_WIDE_INT factor = value.coeffs[0];
2164 if (factor == 0 || value.coeffs[1] != factor)
2165 return false;
2166 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2167 and a value of 16 is one vector width. */
2168 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2169 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
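/* Worked examples (added for exposition, not in the upstream source):
   a factor of 48 is a multiple of 16 and within [-512, 496], so it can
   be added with "addvl ..., #3"; a factor of 6 is even and within
   [-64, 62], so it can be added with "addpl ..., #3"; an odd factor
   such as 7 is representable by neither instruction.  */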
2172 /* Likewise for rtx X. */
2174 bool
2175 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2177 poly_int64 value;
2178 return (poly_int_rtx_p (x, &value)
2179 && aarch64_sve_addvl_addpl_immediate_p (value));
2182 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2183 and storing the result in operand 0. */
2185 char *
2186 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2188 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2189 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2190 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2192 /* Use INC or DEC if possible. */
2193 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2195 if (aarch64_sve_cnt_immediate_p (offset_value))
2196 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2197 offset_value.coeffs[1], 0);
2198 if (aarch64_sve_cnt_immediate_p (-offset_value))
2199 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2200 -offset_value.coeffs[1], 0);
2203 int factor = offset_value.coeffs[1];
2204 if ((factor & 15) == 0)
2205 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2206 else
2207 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2208 return buffer;
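/* Illustrative examples (added for exposition, not in the upstream
   source): when DEST equals BASE and both are GP registers, an offset
   of 16 * VQ (one vector width) is printed as "incb\t%x0"; otherwise
   the same offset is printed as "addvl\t%x0, %x1, #1", and an offset
   of 2 * VQ (one predicate width) as "addpl\t%x0, %x1, #1".  */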
2211 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2212 instruction. If it is, store the number of elements in each vector
2213 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2214 factor in *FACTOR_OUT (if nonnull). */
2216 bool
2217 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2218 unsigned int *nelts_per_vq_out)
2220 rtx elt;
2221 poly_int64 value;
2223 if (!const_vec_duplicate_p (x, &elt)
2224 || !poly_int_rtx_p (elt, &value))
2225 return false;
2227 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2228 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2229 /* There's no vector INCB. */
2230 return false;
2232 HOST_WIDE_INT factor = value.coeffs[0];
2233 if (value.coeffs[1] != factor)
2234 return false;
2236 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2237 if ((factor % nelts_per_vq) != 0
2238 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2239 return false;
2241 if (factor_out)
2242 *factor_out = factor;
2243 if (nelts_per_vq_out)
2244 *nelts_per_vq_out = nelts_per_vq;
2245 return true;
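/* Worked example (added for exposition, not in the upstream source):
   for a VNx4SI constant in which every element is the poly_int64
   (20, 20), NELTS_PER_VQ is 128 / 32 = 4; 20 is a multiple of 4 and
   lies in [4, 64], so the constant is a valid INC/DEC immediate with
   *FACTOR_OUT == 20, corresponding to "incw ..., all, mul #5".  */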
2248 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2249 instruction. */
2251 bool
2252 aarch64_sve_inc_dec_immediate_p (rtx x)
2254 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2257 /* Return the asm template for an SVE vector INC or DEC instruction.
2258 OPERANDS gives the operands before the vector count and X is the
2259 value of the vector count operand itself. */
2261 char *
2262 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2264 int factor;
2265 unsigned int nelts_per_vq;
2266 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2267 gcc_unreachable ();
2268 if (factor < 0)
2269 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2270 nelts_per_vq);
2271 else
2272 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2273 nelts_per_vq);
2276 static int
2277 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2278 scalar_int_mode mode)
2280 int i;
2281 unsigned HOST_WIDE_INT val, val2, mask;
2282 int one_match, zero_match;
2283 int num_insns;
2285 val = INTVAL (imm);
2287 if (aarch64_move_imm (val, mode))
2289 if (generate)
2290 emit_insn (gen_rtx_SET (dest, imm));
2291 return 1;
2294 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2295 (with XXXX non-zero). In that case check to see if the move can be done in
2296 a smaller mode. */
2297 val2 = val & 0xffffffff;
2298 if (mode == DImode
2299 && aarch64_move_imm (val2, SImode)
2300 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2302 if (generate)
2303 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2305 /* Check if we have to emit a second instruction by checking to see
2306 if any of the upper 32 bits of the original DI mode value is set. */
2307 if (val == val2)
2308 return 1;
2310 i = (val >> 48) ? 48 : 32;
2312 if (generate)
2313 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2314 GEN_INT ((val >> i) & 0xffff)));
2316 return 2;
2319 if ((val >> 32) == 0 || mode == SImode)
2321 if (generate)
2323 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2324 if (mode == SImode)
2325 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2326 GEN_INT ((val >> 16) & 0xffff)));
2327 else
2328 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2329 GEN_INT ((val >> 16) & 0xffff)));
2331 return 2;
2334 /* Remaining cases are all for DImode. */
2336 mask = 0xffff;
2337 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2338 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2339 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2340 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2342 if (zero_match != 2 && one_match != 2)
2344 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2345 For a 64-bit bitmask try whether changing 16 bits to all ones or
2346 zeroes creates a valid bitmask. To check any repeated bitmask,
2347 try using 16 bits from the other 32-bit half of val. */
2349 for (i = 0; i < 64; i += 16, mask <<= 16)
2351 val2 = val & ~mask;
2352 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2353 break;
2354 val2 = val | mask;
2355 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2356 break;
2357 val2 = val2 & ~mask;
2358 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2359 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2360 break;
2362 if (i != 64)
2364 if (generate)
2366 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2367 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2368 GEN_INT ((val >> i) & 0xffff)));
2370 return 2;
2374 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2375 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2376 otherwise skip zero bits. */
2378 num_insns = 1;
2379 mask = 0xffff;
2380 val2 = one_match > zero_match ? ~val : val;
2381 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2383 if (generate)
2384 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2385 ? (val | ~(mask << i))
2386 : (val & (mask << i)))));
2387 for (i += 16; i < 64; i += 16)
2389 if ((val2 & (mask << i)) == 0)
2390 continue;
2391 if (generate)
2392 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2393 GEN_INT ((val >> i) & 0xffff)));
2394 num_insns ++;
2397 return num_insns;
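/* Worked example (added for exposition, not in the upstream source;
   x0 is an arbitrary destination): for VAL == 0x12345678 in DImode,
   neither the full value nor its low 32 bits is a MOVZ/MOVN/bitmask
   immediate, but the upper 32 bits are zero, so the sequence is

       mov   x0, #0x5678
       movk  x0, #0x1234, lsl #16

   and the function returns 2.  With GENERATE false it only counts the
   instructions, which is how callers cost candidate immediates.  */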
2400 /* Return whether imm is a 128-bit immediate which is simple enough to
2401 expand inline. */
2402 bool
2403 aarch64_mov128_immediate (rtx imm)
2405 if (GET_CODE (imm) == CONST_INT)
2406 return true;
2408 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2410 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2411 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2413 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2414 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2418 /* Return the number of temporary registers that aarch64_add_offset_1
2419 would need to add OFFSET to a register. */
2421 static unsigned int
2422 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2424 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2427 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2428 a non-polynomial OFFSET. MODE is the mode of the addition.
2429 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2430 be set and CFA adjustments added to the generated instructions.
2432 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2433 temporary if register allocation is already complete. This temporary
2434 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2435 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2436 the immediate again.
2438 Since this function may be used to adjust the stack pointer, we must
2439 ensure that it cannot cause transient stack deallocation (for example
2440 by first incrementing SP and then decrementing when adjusting by a
2441 large immediate). */
2443 static void
2444 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2445 rtx src, HOST_WIDE_INT offset, rtx temp1,
2446 bool frame_related_p, bool emit_move_imm)
2448 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2449 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2451 HOST_WIDE_INT moffset = abs_hwi (offset);
2452 rtx_insn *insn;
2454 if (!moffset)
2456 if (!rtx_equal_p (dest, src))
2458 insn = emit_insn (gen_rtx_SET (dest, src));
2459 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2461 return;
2464 /* Single instruction adjustment. */
2465 if (aarch64_uimm12_shift (moffset))
2467 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2468 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2469 return;
2472 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2473 and either:
2475 a) the offset cannot be loaded by a 16-bit move or
2476 b) there is no spare register into which we can move it. */
2477 if (moffset < 0x1000000
2478 && ((!temp1 && !can_create_pseudo_p ())
2479 || !aarch64_move_imm (moffset, mode)))
2481 HOST_WIDE_INT low_off = moffset & 0xfff;
2483 low_off = offset < 0 ? -low_off : low_off;
2484 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2485 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2486 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2487 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2488 return;
2491 /* Emit a move immediate if required and an addition/subtraction. */
2492 if (emit_move_imm)
2494 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2495 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2497 insn = emit_insn (offset < 0
2498 ? gen_sub3_insn (dest, src, temp1)
2499 : gen_add3_insn (dest, src, temp1));
2500 if (frame_related_p)
2502 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2503 rtx adj = plus_constant (mode, src, offset);
2504 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
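/* Illustrative sketch (added for exposition, not in the upstream
   source): adding OFFSET == 0x123456 with no spare register available
   takes the two-addition path, since the value fits in 24 bits but not
   in a single 12-bit (optionally shifted) immediate:

       add  dest, src, #0x456
       add  dest, dest, #0x123, lsl #12

   Both adjustments have the same sign, so the intermediate value never
   overshoots the final one, which matters when DEST is the stack
   pointer.  */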
2508 /* Return the number of temporary registers that aarch64_add_offset
2509 would need to move OFFSET into a register or add OFFSET to a register;
2510 ADD_P is true if we want the latter rather than the former. */
2512 static unsigned int
2513 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2515 /* This follows the same structure as aarch64_add_offset. */
2516 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2517 return 0;
2519 unsigned int count = 0;
2520 HOST_WIDE_INT factor = offset.coeffs[1];
2521 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2522 poly_int64 poly_offset (factor, factor);
2523 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2524 /* Need one register for the ADDVL/ADDPL result. */
2525 count += 1;
2526 else if (factor != 0)
2528 factor = abs (factor);
2529 if (factor > 16 * (factor & -factor))
2530 /* Need one register for the CNT result and one for the multiplication
2531 factor. If necessary, the second temporary can be reused for the
2532 constant part of the offset. */
2533 return 2;
2534 /* Need one register for the CNT result (which might then
2535 be shifted). */
2536 count += 1;
2538 return count + aarch64_add_offset_1_temporaries (constant);
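/* Worked example (added for exposition, not in the upstream source):
   for ADD_P and OFFSET == (35, 32), FACTOR is 32 and CONSTANT is 3.
   The (32, 32) part is an ADDVL immediate and needs one temporary for
   its result, while the constant 3 needs none, giving a total of 1.  */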
2541 /* If X can be represented as a poly_int64, return the number
2542 of temporaries that are required to add it to a register.
2543 Return -1 otherwise. */
2546 aarch64_add_offset_temporaries (rtx x)
2548 poly_int64 offset;
2549 if (!poly_int_rtx_p (x, &offset))
2550 return -1;
2551 return aarch64_offset_temporaries (true, offset);
2554 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2555 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2556 be set and CFA adjustments added to the generated instructions.
2558 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2559 temporary if register allocation is already complete. This temporary
2560 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2561 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2562 false to avoid emitting the immediate again.
2564 TEMP2, if nonnull, is a second temporary register that doesn't
2565 overlap either DEST or SRC.
2567 Since this function may be used to adjust the stack pointer, we must
2568 ensure that it cannot cause transient stack deallocation (for example
2569 by first incrementing SP and then decrementing when adjusting by a
2570 large immediate). */
2572 static void
2573 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2574 poly_int64 offset, rtx temp1, rtx temp2,
2575 bool frame_related_p, bool emit_move_imm = true)
2577 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2578 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2579 gcc_assert (temp1 == NULL_RTX
2580 || !frame_related_p
2581 || !reg_overlap_mentioned_p (temp1, dest));
2582 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2584 /* Try using ADDVL or ADDPL to add the whole value. */
2585 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2587 rtx offset_rtx = gen_int_mode (offset, mode);
2588 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2589 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2590 return;
2593 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2594 SVE vector register, over and above the minimum size of 128 bits.
2595 This is equivalent to half the value returned by CNTD with a
2596 vector shape of ALL. */
2597 HOST_WIDE_INT factor = offset.coeffs[1];
2598 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2600 /* Try using ADDVL or ADDPL to add the VG-based part. */
2601 poly_int64 poly_offset (factor, factor);
2602 if (src != const0_rtx
2603 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2605 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2606 if (frame_related_p)
2608 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2609 RTX_FRAME_RELATED_P (insn) = true;
2610 src = dest;
2612 else
2614 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2615 src = aarch64_force_temporary (mode, temp1, addr);
2616 temp1 = temp2;
2617 temp2 = NULL_RTX;
2620 /* Otherwise use a CNT-based sequence. */
2621 else if (factor != 0)
2623 /* Use a subtraction if we have a negative factor. */
2624 rtx_code code = PLUS;
2625 if (factor < 0)
2627 factor = -factor;
2628 code = MINUS;
2631 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2632 into the multiplication. */
2633 rtx val;
2634 int shift = 0;
2635 if (factor & 1)
2636 /* Use a right shift by 1. */
2637 shift = -1;
2638 else
2639 factor /= 2;
2640 HOST_WIDE_INT low_bit = factor & -factor;
2641 if (factor <= 16 * low_bit)
2643 if (factor > 16 * 8)
2645 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2646 the value with the minimum multiplier and shift it into
2647 position. */
2648 int extra_shift = exact_log2 (low_bit);
2649 shift += extra_shift;
2650 factor >>= extra_shift;
2652 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2654 else
2656 /* Use CNTD, then multiply it by FACTOR. */
2657 val = gen_int_mode (poly_int64 (2, 2), mode);
2658 val = aarch64_force_temporary (mode, temp1, val);
2660 /* Go back to using a negative multiplication factor if we have
2661 no register from which to subtract. */
2662 if (code == MINUS && src == const0_rtx)
2664 factor = -factor;
2665 code = PLUS;
2667 rtx coeff1 = gen_int_mode (factor, mode);
2668 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2669 val = gen_rtx_MULT (mode, val, coeff1);
2672 if (shift > 0)
2674 /* Multiply by 1 << SHIFT. */
2675 val = aarch64_force_temporary (mode, temp1, val);
2676 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2678 else if (shift == -1)
2680 /* Divide by 2. */
2681 val = aarch64_force_temporary (mode, temp1, val);
2682 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2685 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2686 if (src != const0_rtx)
2688 val = aarch64_force_temporary (mode, temp1, val);
2689 val = gen_rtx_fmt_ee (code, mode, src, val);
2691 else if (code == MINUS)
2693 val = aarch64_force_temporary (mode, temp1, val);
2694 val = gen_rtx_NEG (mode, val);
2697 if (constant == 0 || frame_related_p)
2699 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2700 if (frame_related_p)
2702 RTX_FRAME_RELATED_P (insn) = true;
2703 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2704 gen_rtx_SET (dest, plus_constant (Pmode, src,
2705 poly_offset)));
2707 src = dest;
2708 if (constant == 0)
2709 return;
2711 else
2713 src = aarch64_force_temporary (mode, temp1, val);
2714 temp1 = temp2;
2715 temp2 = NULL_RTX;
2718 emit_move_imm = true;
2721 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2722 frame_related_p, emit_move_imm);
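/* Illustrative sketch (added for exposition, not in the upstream
   source): for OFFSET == (48, 32), i.e. 32 * VQ + 16 bytes, the
   (32, 32) part is added first as "addvl dest, src, #2" and the
   remaining constant 16 is then handled by aarch64_add_offset_1 as
   "add dest, dest, #16".  VG-based parts that ADDVL/ADDPL cannot
   reach instead use the CNT-based sequence above: CNT[BHWD], an
   optional multiply or shift, then an add, subtract or negate.  */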
2725 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2726 than a poly_int64. */
2728 void
2729 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2730 rtx offset_rtx, rtx temp1, rtx temp2)
2732 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2733 temp1, temp2, false);
2736 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2737 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2738 if TEMP1 already contains abs (DELTA). */
2740 static inline void
2741 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2743 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2744 temp1, temp2, true, emit_move_imm);
2747 /* Subtract DELTA from the stack pointer, marking the instructions
2748 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2749 if nonnull. */
2751 static inline void
2752 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2754 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2755 temp1, temp2, frame_related_p);
2758 /* Set DEST to (vec_series BASE STEP). */
2760 static void
2761 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2763 machine_mode mode = GET_MODE (dest);
2764 scalar_mode inner = GET_MODE_INNER (mode);
2766 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2767 if (!aarch64_sve_index_immediate_p (base))
2768 base = force_reg (inner, base);
2769 if (!aarch64_sve_index_immediate_p (step))
2770 step = force_reg (inner, step);
2772 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
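/* Illustrative example (added for exposition, not in the upstream
   source; z0 is arbitrary): a VNx4SI series with BASE 0 and STEP 1
   can be emitted directly as "index z0.s, #0, #1", since both
   operands fit the immediate range [-16, 15]; a step of 100 would
   first be forced into a scalar register.  */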
2775 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2776 integer of mode INT_MODE. Return true on success. */
2778 static bool
2779 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2780 rtx src)
2782 /* If the constant is smaller than 128 bits, we can do the move
2783 using a vector of SRC_MODEs. */
2784 if (src_mode != TImode)
2786 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2787 GET_MODE_SIZE (src_mode));
2788 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2789 emit_move_insn (gen_lowpart (dup_mode, dest),
2790 gen_const_vec_duplicate (dup_mode, src));
2791 return true;
2794 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2795 src = force_const_mem (src_mode, src);
2796 if (!src)
2797 return false;
2799 /* Make sure that the address is legitimate. */
2800 if (!aarch64_sve_ld1r_operand_p (src))
2802 rtx addr = force_reg (Pmode, XEXP (src, 0));
2803 src = replace_equiv_address (src, addr);
2806 machine_mode mode = GET_MODE (dest);
2807 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2808 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2809 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2810 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2811 emit_insn (gen_rtx_SET (dest, src));
2812 return true;
2815 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2816 isn't a simple duplicate or series. */
2818 static void
2819 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2821 machine_mode mode = GET_MODE (src);
2822 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2823 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2824 gcc_assert (npatterns > 1);
2826 if (nelts_per_pattern == 1)
2828 /* The constant is a repeating sequence of at least two elements,
2829 where the repeating elements occupy no more than 128 bits.
2830 Get an integer representation of the replicated value. */
2831 scalar_int_mode int_mode;
2832 if (BYTES_BIG_ENDIAN)
2833 /* For now, always use LD1RQ to load the value on big-endian
2834 targets, since the handling of smaller integers includes a
2835 subreg that is semantically an element reverse. */
2836 int_mode = TImode;
2837 else
2839 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2840 gcc_assert (int_bits <= 128);
2841 int_mode = int_mode_for_size (int_bits, 0).require ();
2843 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2844 if (int_value
2845 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2846 return;
2849 /* Expand each pattern individually. */
2850 rtx_vector_builder builder;
2851 auto_vec<rtx, 16> vectors (npatterns);
2852 for (unsigned int i = 0; i < npatterns; ++i)
2854 builder.new_vector (mode, 1, nelts_per_pattern);
2855 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2856 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2857 vectors.quick_push (force_reg (mode, builder.build ()));
2860 /* Use permutes to interleave the separate vectors. */
2861 while (npatterns > 1)
2863 npatterns /= 2;
2864 for (unsigned int i = 0; i < npatterns; ++i)
2866 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2867 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2868 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2869 vectors[i] = tmp;
2872 gcc_assert (vectors[0] == dest);
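/* Illustrative sketch of the interleaving step (added for exposition,
   not in the upstream source): for NPATTERNS == 4 with patterns
   A, B, C, D, the four single-pattern vectors are combined as

       ZIP1 {A,A,...} {C,C,...} -> {A,C,A,C,...}
       ZIP1 {B,B,...} {D,D,...} -> {B,D,B,D,...}
       ZIP1 {A,C,...} {B,D,...} -> {A,B,C,D,A,B,C,D,...}

   so each round of permutes halves the number of vectors until only
   DEST remains.  */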
2875 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2876 is a pattern that can be used to set DEST to a replicated scalar
2877 element. */
2879 void
2880 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2881 rtx (*gen_vec_duplicate) (rtx, rtx))
2883 machine_mode mode = GET_MODE (dest);
2885 /* Check on what type of symbol it is. */
2886 scalar_int_mode int_mode;
2887 if ((GET_CODE (imm) == SYMBOL_REF
2888 || GET_CODE (imm) == LABEL_REF
2889 || GET_CODE (imm) == CONST
2890 || GET_CODE (imm) == CONST_POLY_INT)
2891 && is_a <scalar_int_mode> (mode, &int_mode))
2893 rtx mem;
2894 poly_int64 offset;
2895 HOST_WIDE_INT const_offset;
2896 enum aarch64_symbol_type sty;
2898 /* If we have (const (plus symbol offset)), separate out the offset
2899 before we start classifying the symbol. */
2900 rtx base = strip_offset (imm, &offset);
2902 /* We must always add an offset involving VL separately, rather than
2903 folding it into the relocation. */
2904 if (!offset.is_constant (&const_offset))
2906 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2907 emit_insn (gen_rtx_SET (dest, imm));
2908 else
2910 /* Do arithmetic on 32-bit values if the result is smaller
2911 than that. */
2912 if (partial_subreg_p (int_mode, SImode))
2914 /* It is invalid to do symbol calculations in modes
2915 narrower than SImode. */
2916 gcc_assert (base == const0_rtx);
2917 dest = gen_lowpart (SImode, dest);
2918 int_mode = SImode;
2920 if (base != const0_rtx)
2922 base = aarch64_force_temporary (int_mode, dest, base);
2923 aarch64_add_offset (int_mode, dest, base, offset,
2924 NULL_RTX, NULL_RTX, false);
2926 else
2927 aarch64_add_offset (int_mode, dest, base, offset,
2928 dest, NULL_RTX, false);
2930 return;
2933 sty = aarch64_classify_symbol (base, const_offset);
2934 switch (sty)
2936 case SYMBOL_FORCE_TO_MEM:
2937 if (const_offset != 0
2938 && targetm.cannot_force_const_mem (int_mode, imm))
2940 gcc_assert (can_create_pseudo_p ());
2941 base = aarch64_force_temporary (int_mode, dest, base);
2942 aarch64_add_offset (int_mode, dest, base, const_offset,
2943 NULL_RTX, NULL_RTX, false);
2944 return;
2947 mem = force_const_mem (ptr_mode, imm);
2948 gcc_assert (mem);
2950 /* If we aren't generating PC relative literals, then
2951 we need to expand the literal pool access carefully.
2952 This is something that needs to be done in a number
2953 of places, so could well live as a separate function. */
2954 if (!aarch64_pcrelative_literal_loads)
2956 gcc_assert (can_create_pseudo_p ());
2957 base = gen_reg_rtx (ptr_mode);
2958 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2959 if (ptr_mode != Pmode)
2960 base = convert_memory_address (Pmode, base);
2961 mem = gen_rtx_MEM (ptr_mode, base);
2964 if (int_mode != ptr_mode)
2965 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2967 emit_insn (gen_rtx_SET (dest, mem));
2969 return;
2971 case SYMBOL_SMALL_TLSGD:
2972 case SYMBOL_SMALL_TLSDESC:
2973 case SYMBOL_SMALL_TLSIE:
2974 case SYMBOL_SMALL_GOT_28K:
2975 case SYMBOL_SMALL_GOT_4G:
2976 case SYMBOL_TINY_GOT:
2977 case SYMBOL_TINY_TLSIE:
2978 if (const_offset != 0)
2980 gcc_assert (can_create_pseudo_p ());
2981 base = aarch64_force_temporary (int_mode, dest, base);
2982 aarch64_add_offset (int_mode, dest, base, const_offset,
2983 NULL_RTX, NULL_RTX, false);
2984 return;
2986 /* FALLTHRU */
2988 case SYMBOL_SMALL_ABSOLUTE:
2989 case SYMBOL_TINY_ABSOLUTE:
2990 case SYMBOL_TLSLE12:
2991 case SYMBOL_TLSLE24:
2992 case SYMBOL_TLSLE32:
2993 case SYMBOL_TLSLE48:
2994 aarch64_load_symref_appropriately (dest, imm, sty);
2995 return;
2997 default:
2998 gcc_unreachable ();
3002 if (!CONST_INT_P (imm))
3004 rtx base, step, value;
3005 if (GET_CODE (imm) == HIGH
3006 || aarch64_simd_valid_immediate (imm, NULL))
3007 emit_insn (gen_rtx_SET (dest, imm));
3008 else if (const_vec_series_p (imm, &base, &step))
3009 aarch64_expand_vec_series (dest, base, step);
3010 else if (const_vec_duplicate_p (imm, &value))
3012 /* If the constant is out of range of an SVE vector move,
3013 load it from memory if we can, otherwise move it into
3014 a register and use a DUP. */
3015 scalar_mode inner_mode = GET_MODE_INNER (mode);
3016 rtx op = force_const_mem (inner_mode, value);
3017 if (!op)
3018 op = force_reg (inner_mode, value);
3019 else if (!aarch64_sve_ld1r_operand_p (op))
3021 rtx addr = force_reg (Pmode, XEXP (op, 0));
3022 op = replace_equiv_address (op, addr);
3024 emit_insn (gen_vec_duplicate (dest, op));
3026 else if (GET_CODE (imm) == CONST_VECTOR
3027 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3028 aarch64_expand_sve_const_vector (dest, imm);
3029 else
3031 rtx mem = force_const_mem (mode, imm);
3032 gcc_assert (mem);
3033 emit_move_insn (dest, mem);
3036 return;
3039 aarch64_internal_mov_immediate (dest, imm, true,
3040 as_a <scalar_int_mode> (mode));
3043 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3044 that is known to contain PTRUE. */
3046 void
3047 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3049 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3050 gen_rtvec (2, pred, src),
3051 UNSPEC_MERGE_PTRUE)));
3054 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3055 operand is in memory. In this case we need to use the predicated LD1
3056 and ST1 instead of LDR and STR, both for correctness on big-endian
3057 targets and because LD1 and ST1 support a wider range of addressing modes.
3058 PRED_MODE is the mode of the predicate.
3060 See the comment at the head of aarch64-sve.md for details about the
3061 big-endian handling. */
3063 void
3064 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3066 machine_mode mode = GET_MODE (dest);
3067 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3068 if (!register_operand (src, mode)
3069 && !register_operand (dest, mode))
3071 rtx tmp = gen_reg_rtx (mode);
3072 if (MEM_P (src))
3073 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3074 else
3075 emit_move_insn (tmp, src);
3076 src = tmp;
3078 aarch64_emit_sve_pred_move (dest, ptrue, src);
3081 /* Called only on big-endian targets. See whether an SVE vector move
3082 from SRC to DEST is effectively a REV[BHW] instruction, because at
3083 least one operand is a subreg of an SVE vector that has wider or
3084 narrower elements. Return true and emit the instruction if so.
3086 For example:
3088 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3090 represents a VIEW_CONVERT between the following vectors, viewed
3091 in memory order:
3093 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3094 R1: { [0], [1], [2], [3], ... }
3096 The high part of lane X in R2 should therefore correspond to lane X*2
3097 of R1, but the register representations are:
3099         msb                                      lsb
3100 R2: ...... [1].high  [1].low   [0].high  [0].low
3101 R1: ...... [3]       [2]       [1]       [0]
3103 where the low part of lane X in R2 instead corresponds to lane X*2 of R1.
3104 We therefore need a reverse operation to swap the high and low values
3105 around.
3107 This is purely an optimization. Without it we would spill the
3108 subreg operand to the stack in one mode and reload it in the
3109 other mode, which has the same effect as the REV. */
3111 bool
3112 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3114 gcc_assert (BYTES_BIG_ENDIAN);
3115 if (GET_CODE (dest) == SUBREG)
3116 dest = SUBREG_REG (dest);
3117 if (GET_CODE (src) == SUBREG)
3118 src = SUBREG_REG (src);
3120 /* The optimization handles two single SVE REGs with different element
3121 sizes. */
3122 if (!REG_P (dest)
3123 || !REG_P (src)
3124 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3125 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3126 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3127 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3128 return false;
3130 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3131 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3132 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3133 UNSPEC_REV_SUBREG);
3134 emit_insn (gen_rtx_SET (dest, unspec));
3135 return true;
3138 /* Return a copy of X with mode MODE, without changing its other
3139 attributes. Unlike gen_lowpart, this doesn't care whether the
3140 mode change is valid. */
3142 static rtx
3143 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3145 if (GET_MODE (x) == mode)
3146 return x;
3148 x = shallow_copy_rtx (x);
3149 set_mode_and_regno (x, mode, REGNO (x));
3150 return x;
3153 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3154 operands. */
3156 void
3157 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3159 /* Decide which REV operation we need. The mode with narrower elements
3160 determines the mode of the operands and the mode with the wider
3161 elements determines the reverse width. */
3162 machine_mode mode_with_wider_elts = GET_MODE (dest);
3163 machine_mode mode_with_narrower_elts = GET_MODE (src);
3164 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3165 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3166 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3168 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3169 unsigned int unspec;
3170 if (wider_bytes == 8)
3171 unspec = UNSPEC_REV64;
3172 else if (wider_bytes == 4)
3173 unspec = UNSPEC_REV32;
3174 else if (wider_bytes == 2)
3175 unspec = UNSPEC_REV16;
3176 else
3177 gcc_unreachable ();
3178 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3180 /* Emit:
3182 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3183 UNSPEC_MERGE_PTRUE))
3185 with the appropriate modes. */
3186 ptrue = gen_lowpart (pred_mode, ptrue);
3187 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3188 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3189 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3190 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3191 UNSPEC_MERGE_PTRUE);
3192 emit_insn (gen_rtx_SET (dest, src));
3195 static bool
3196 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3197 tree exp ATTRIBUTE_UNUSED)
3199 /* Currently, always true. */
3200 return true;
3203 /* Implement TARGET_PASS_BY_REFERENCE. */
3205 static bool
3206 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3207 machine_mode mode,
3208 const_tree type,
3209 bool named ATTRIBUTE_UNUSED)
3211 HOST_WIDE_INT size;
3212 machine_mode dummymode;
3213 int nregs;
3215 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3216 if (mode == BLKmode && type)
3217 size = int_size_in_bytes (type);
3218 else
3219 /* No frontends can create types with variable-sized modes, so we
3220 shouldn't be asked to pass or return them. */
3221 size = GET_MODE_SIZE (mode).to_constant ();
3223 /* Aggregates are passed by reference based on their size. */
3224 if (type && AGGREGATE_TYPE_P (type))
3226 size = int_size_in_bytes (type);
3229 /* Variable-sized arguments are always passed by reference. */
3230 if (size < 0)
3231 return true;
3233 /* Can this be a candidate to be passed in fp/simd register(s)? */
3234 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3235 &dummymode, &nregs,
3236 NULL))
3237 return false;
3239 /* Arguments which are variable sized or larger than 2 registers are
3240 passed by reference unless they are a homogeneous floating-point
3241 aggregate. */
3242 return size > 2 * UNITS_PER_WORD;
3245 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3246 static bool
3247 aarch64_return_in_msb (const_tree valtype)
3249 machine_mode dummy_mode;
3250 int dummy_int;
3252 /* Never happens in little-endian mode. */
3253 if (!BYTES_BIG_ENDIAN)
3254 return false;
3256 /* Only composite types smaller than or equal to 16 bytes can
3257 be potentially returned in registers. */
3258 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3259 || int_size_in_bytes (valtype) <= 0
3260 || int_size_in_bytes (valtype) > 16)
3261 return false;
3263 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3264 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3265 is always passed/returned in the least significant bits of fp/simd
3266 register(s). */
3267 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3268 &dummy_mode, &dummy_int, NULL))
3269 return false;
3271 return true;
3274 /* Implement TARGET_FUNCTION_VALUE.
3275 Define how to find the value returned by a function. */
3277 static rtx
3278 aarch64_function_value (const_tree type, const_tree func,
3279 bool outgoing ATTRIBUTE_UNUSED)
3281 machine_mode mode;
3282 int unsignedp;
3283 int count;
3284 machine_mode ag_mode;
3286 mode = TYPE_MODE (type);
3287 if (INTEGRAL_TYPE_P (type))
3288 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3290 if (aarch64_return_in_msb (type))
3292 HOST_WIDE_INT size = int_size_in_bytes (type);
3294 if (size % UNITS_PER_WORD != 0)
3296 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3297 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3301 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3302 &ag_mode, &count, NULL))
3304 if (!aarch64_composite_type_p (type, mode))
3306 gcc_assert (count == 1 && mode == ag_mode);
3307 return gen_rtx_REG (mode, V0_REGNUM);
3309 else
3311 int i;
3312 rtx par;
3314 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3315 for (i = 0; i < count; i++)
3317 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3318 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3319 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3320 XVECEXP (par, 0, i) = tmp;
3322 return par;
3325 else
3326 return gen_rtx_REG (mode, R0_REGNUM);
3329 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3330 Return true if REGNO is the number of a hard register in which the values
3331 of called function may come back. */
3333 static bool
3334 aarch64_function_value_regno_p (const unsigned int regno)
3336 /* A maximum of 16 bytes can be returned in the general registers. Examples
3337 of 16-byte return values are: 128-bit integers and 16-byte small
3338 structures (excluding homogeneous floating-point aggregates). */
3339 if (regno == R0_REGNUM || regno == R1_REGNUM)
3340 return true;
3342 /* Up to four fp/simd registers can return a function value, e.g. a
3343 homogeneous floating-point aggregate having four members. */
3344 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3345 return TARGET_FLOAT;
3347 return false;
3350 /* Implement TARGET_RETURN_IN_MEMORY.
3352 If the type T of the result of a function is such that
3353 void func (T arg)
3354 would require that arg be passed as a value in a register (or set of
3355 registers) according to the parameter passing rules, then the result
3356 is returned in the same registers as would be used for such an
3357 argument. */
3359 static bool
3360 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3362 HOST_WIDE_INT size;
3363 machine_mode ag_mode;
3364 int count;
3366 if (!AGGREGATE_TYPE_P (type)
3367 && TREE_CODE (type) != COMPLEX_TYPE
3368 && TREE_CODE (type) != VECTOR_TYPE)
3369 /* Simple scalar types are always returned in registers. */
3370 return false;
3372 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3373 type,
3374 &ag_mode,
3375 &count,
3376 NULL))
3377 return false;
3379 /* Types larger than 2 registers are returned in memory. */
3380 size = int_size_in_bytes (type);
3381 return (size < 0 || size > 2 * UNITS_PER_WORD);
3384 static bool
3385 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3386 const_tree type, int *nregs)
3388 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3389 return aarch64_vfp_is_call_or_return_candidate (mode,
3390 type,
3391 &pcum->aapcs_vfp_rmode,
3392 nregs,
3393 NULL);
3396 /* Given MODE and TYPE of a function argument, return the alignment in
3397 bits. The idea is to suppress any stronger alignment requested by
3398 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3399 This is a helper function for local use only. */
3401 static unsigned int
3402 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3404 if (!type)
3405 return GET_MODE_ALIGNMENT (mode);
3407 if (integer_zerop (TYPE_SIZE (type)))
3408 return 0;
3410 gcc_assert (TYPE_MODE (type) == mode);
3412 if (!AGGREGATE_TYPE_P (type))
3413 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3415 if (TREE_CODE (type) == ARRAY_TYPE)
3416 return TYPE_ALIGN (TREE_TYPE (type));
3418 unsigned int alignment = 0;
3419 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3420 if (TREE_CODE (field) == FIELD_DECL)
3421 alignment = std::max (alignment, DECL_ALIGN (field));
3423 return alignment;
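/* Illustrative example (added for exposition, not in the upstream
   source): for "struct s { long long x; char y; };" the maximum
   FIELD_DECL alignment is 64 bits, so the argument is treated as
   8-byte aligned even if the user declares the struct type itself
   with a larger aligned attribute, in line with the natural-alignment
   rule described above.  */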
3426 /* Layout a function argument according to the AAPCS64 rules. The rule
3427 numbers below refer to those in the AAPCS64 document. */
3429 static void
3430 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3431 const_tree type,
3432 bool named ATTRIBUTE_UNUSED)
3434 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3435 int ncrn, nvrn, nregs;
3436 bool allocate_ncrn, allocate_nvrn;
3437 HOST_WIDE_INT size;
3439 /* We need to do this once per argument. */
3440 if (pcum->aapcs_arg_processed)
3441 return;
3443 pcum->aapcs_arg_processed = true;
3445 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3446 if (type)
3447 size = int_size_in_bytes (type);
3448 else
3449 /* No frontends can create types with variable-sized modes, so we
3450 shouldn't be asked to pass or return them. */
3451 size = GET_MODE_SIZE (mode).to_constant ();
3452 size = ROUND_UP (size, UNITS_PER_WORD);
3454 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3455 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3456 mode,
3457 type,
3458 &nregs);
3460 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3461 The following code thus handles passing by SIMD/FP registers first. */
3463 nvrn = pcum->aapcs_nvrn;
3465 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3466 and homogeneous short-vector aggregates (HVA). */
3467 if (allocate_nvrn)
3469 if (!TARGET_FLOAT)
3470 aarch64_err_no_fpadvsimd (mode);
3472 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3474 pcum->aapcs_nextnvrn = nvrn + nregs;
3475 if (!aarch64_composite_type_p (type, mode))
3477 gcc_assert (nregs == 1);
3478 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3480 else
3482 rtx par;
3483 int i;
3484 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3485 for (i = 0; i < nregs; i++)
3487 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3488 V0_REGNUM + nvrn + i);
3489 rtx offset = gen_int_mode
3490 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3491 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3492 XVECEXP (par, 0, i) = tmp;
3494 pcum->aapcs_reg = par;
3496 return;
3498 else
3500 /* C.3 NSRN is set to 8. */
3501 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3502 goto on_stack;
3506 ncrn = pcum->aapcs_ncrn;
3507 nregs = size / UNITS_PER_WORD;
3509 /* C6 - C9, though the sign and zero extension semantics are
3510 handled elsewhere. This is the case where the argument fits
3511 entirely in general registers. */
3512 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3515 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3517 /* C.8 if the argument has an alignment of 16 then the NGRN is
3518 rounded up to the next even number. */
3519 if (nregs == 2
3520 && ncrn % 2
3521 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3522 comparison is there because for > 16 * BITS_PER_UNIT
3523 alignment nregs should be > 2 and therefore it should be
3524 passed by reference rather than value. */
3525 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3527 ++ncrn;
3528 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3531 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3532 A reg is still generated for it, but the caller should be smart
3533 enough not to use it. */
3534 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3535 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3536 else
3538 rtx par;
3539 int i;
3541 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3542 for (i = 0; i < nregs; i++)
3544 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3545 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3546 GEN_INT (i * UNITS_PER_WORD));
3547 XVECEXP (par, 0, i) = tmp;
3549 pcum->aapcs_reg = par;
3552 pcum->aapcs_nextncrn = ncrn + nregs;
3553 return;
3556 /* C.11 */
3557 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3559 /* The argument is passed on stack; record the needed number of words for
3560 this argument and align the total size if necessary. */
3561 on_stack:
3562 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3564 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3565 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3566 16 / UNITS_PER_WORD);
3567 return;
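/* Illustrative example of rule C.8 (added for exposition, not in the
   upstream source): a 16-byte struct whose members give it 16-byte
   alignment (e.g. one containing a __int128 field) needs two core
   registers.  If the next core register number is odd, it is rounded
   up to the next even number first, so the struct is passed in an
   aligned pair such as x2/x3 rather than x1/x2.  */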
3570 /* Implement TARGET_FUNCTION_ARG. */
3572 static rtx
3573 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3574 const_tree type, bool named)
3576 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3577 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3579 if (mode == VOIDmode)
3580 return NULL_RTX;
3582 aarch64_layout_arg (pcum_v, mode, type, named);
3583 return pcum->aapcs_reg;
3586 void
3587 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3588 const_tree fntype ATTRIBUTE_UNUSED,
3589 rtx libname ATTRIBUTE_UNUSED,
3590 const_tree fndecl ATTRIBUTE_UNUSED,
3591 unsigned n_named ATTRIBUTE_UNUSED)
3593 pcum->aapcs_ncrn = 0;
3594 pcum->aapcs_nvrn = 0;
3595 pcum->aapcs_nextncrn = 0;
3596 pcum->aapcs_nextnvrn = 0;
3597 pcum->pcs_variant = ARM_PCS_AAPCS64;
3598 pcum->aapcs_reg = NULL_RTX;
3599 pcum->aapcs_arg_processed = false;
3600 pcum->aapcs_stack_words = 0;
3601 pcum->aapcs_stack_size = 0;
3603 if (!TARGET_FLOAT
3604 && fndecl && TREE_PUBLIC (fndecl)
3605 && fntype && fntype != error_mark_node)
3607 const_tree type = TREE_TYPE (fntype);
3608 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3609 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3610 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3611 &mode, &nregs, NULL))
3612 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3614 return;
3617 static void
3618 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3619 machine_mode mode,
3620 const_tree type,
3621 bool named)
3623 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3624 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3626 aarch64_layout_arg (pcum_v, mode, type, named);
3627 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3628 != (pcum->aapcs_stack_words != 0));
3629 pcum->aapcs_arg_processed = false;
3630 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3631 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3632 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3633 pcum->aapcs_stack_words = 0;
3634 pcum->aapcs_reg = NULL_RTX;
3638 bool
3639 aarch64_function_arg_regno_p (unsigned regno)
3641 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3642 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3645 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3646 PARM_BOUNDARY bits of alignment, but will be given anything up
3647 to STACK_BOUNDARY bits if the type requires it. This makes sure
3648 that both before and after the layout of each argument, the Next
3649 Stacked Argument Address (NSAA) will have a minimum alignment of
3650 8 bytes. */
3652 static unsigned int
3653 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3655 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3656 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3659 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3661 static fixed_size_mode
3662 aarch64_get_reg_raw_mode (int regno)
3664 if (TARGET_SVE && FP_REGNUM_P (regno))
3665 /* Don't use the SVE part of the register for __builtin_apply and
3666 __builtin_return. The SVE registers aren't used by the normal PCS,
3667 so using them there would be a waste of time. The PCS extensions
3668 for SVE types are fundamentally incompatible with the
3669 __builtin_return/__builtin_apply interface. */
3670 return as_a <fixed_size_mode> (V16QImode);
3671 return default_get_reg_raw_mode (regno);
3674 /* Implement TARGET_FUNCTION_ARG_PADDING.
3676 Small aggregate types are placed in the lowest memory address.
3678 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3680 static pad_direction
3681 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3683 /* On little-endian targets, the least significant byte of every stack
3684 argument is passed at the lowest byte address of the stack slot. */
3685 if (!BYTES_BIG_ENDIAN)
3686 return PAD_UPWARD;
3688 /* Otherwise, integral, floating-point and pointer types are padded downward:
3689 the least significant byte of a stack argument is passed at the highest
3690 byte address of the stack slot. */
3691 if (type
3692 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3693 || POINTER_TYPE_P (type))
3694 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3695 return PAD_DOWNWARD;
3697 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3698 return PAD_UPWARD;
3701 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3703 It specifies padding for the last (and possibly only)
3704 element of a block move between registers and memory.
3705 Assuming the block is in memory, padding upward means that
3706 the last element is padded after its most significant byte,
3707 while with downward padding the last element is padded on
3708 its least significant byte side.
3710 Small aggregates and small complex types are always padded
3711 upwards.
3713 We don't need to worry about homogeneous floating-point or
3714 short-vector aggregates; their move is not affected by the
3715 padding direction determined here. Regardless of endianness,
3716 each element of such an aggregate is put in the least
3717 significant bits of a fp/simd register.
3719 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3720 register has useful data, and return the opposite if the most
3721 significant byte does. */
3723 bool
3724 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3725 bool first ATTRIBUTE_UNUSED)
3728 /* Small composite types are always padded upward. */
3729 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3731 HOST_WIDE_INT size;
3732 if (type)
3733 size = int_size_in_bytes (type);
3734 else
3735 /* No frontends can create types with variable-sized modes, so we
3736 shouldn't be asked to pass or return them. */
3737 size = GET_MODE_SIZE (mode).to_constant ();
3738 if (size < 2 * UNITS_PER_WORD)
3739 return true;
3742 /* Otherwise, use the default padding. */
3743 return !BYTES_BIG_ENDIAN;
3746 static scalar_int_mode
3747 aarch64_libgcc_cmp_return_mode (void)
3749 return SImode;
3752 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3754 /* We use the 12-bit shifted immediate arithmetic instructions so values
3755 must be a multiple of (1 << 12), i.e. 4096. */
3756 #define ARITH_FACTOR 4096
3758 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3759 #error Cannot use simple address calculation for stack probing
3760 #endif
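/* Illustrative note (not from the original source): ARITH_FACTOR matches
   the AArch64 ADD/SUB immediate encoding, which takes a 12-bit value
   optionally shifted left by 12.  For example, an adjustment of
   12288 (3 << 12) fits a single instruction, roughly:

       sub   x9, sp, 12288        // imm12 == 3, LSL #12

   whereas 12292 would not and would need an extra instruction to
   materialise.  This is why the #error above rejects a PROBE_INTERVAL
   that is not a multiple of ARITH_FACTOR.  */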
3762 /* The pair of scratch registers used for stack probing. */
3763 #define PROBE_STACK_FIRST_REG 9
3764 #define PROBE_STACK_SECOND_REG 10
3766 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3767 inclusive. These are offsets from the current stack pointer. */
3769 static void
3770 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3772 HOST_WIDE_INT size;
3773 if (!poly_size.is_constant (&size))
3775 sorry ("stack probes for SVE frames");
3776 return;
3779 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3781 /* See the same assertion on PROBE_INTERVAL above. */
3782 gcc_assert ((first % ARITH_FACTOR) == 0);
3784 /* See if we have a constant small number of probes to generate. If so,
3785 that's the easy case. */
3786 if (size <= PROBE_INTERVAL)
3788 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3790 emit_set_insn (reg1,
3791 plus_constant (Pmode,
3792 stack_pointer_rtx, -(first + base)));
3793 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3796 /* The run-time loop is made up of 8 insns in the generic case while the
3797 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3798 else if (size <= 4 * PROBE_INTERVAL)
3800 HOST_WIDE_INT i, rem;
3802 emit_set_insn (reg1,
3803 plus_constant (Pmode,
3804 stack_pointer_rtx,
3805 -(first + PROBE_INTERVAL)));
3806 emit_stack_probe (reg1);
3808 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3809 it exceeds SIZE. If only two probes are needed, this will not
3810 generate any code. Then probe at FIRST + SIZE. */
3811 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3813 emit_set_insn (reg1,
3814 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3815 emit_stack_probe (reg1);
3818 rem = size - (i - PROBE_INTERVAL);
3819 if (rem > 256)
3821 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3823 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3824 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3826 else
3827 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3830 /* Otherwise, do the same as above, but in a loop. Note that we must be
3831 extra careful with variables wrapping around because we might be at
3832 the very top (or the very bottom) of the address space and we have
3833 to be able to handle this case properly; in particular, we use an
3834 equality test for the loop condition. */
3835 else
3837 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3839 /* Step 1: round SIZE to the previous multiple of the interval. */
3841 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3844 /* Step 2: compute initial and final value of the loop counter. */
3846 /* TEST_ADDR = SP + FIRST. */
3847 emit_set_insn (reg1,
3848 plus_constant (Pmode, stack_pointer_rtx, -first));
3850 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3851 HOST_WIDE_INT adjustment = - (first + rounded_size);
3852 if (! aarch64_uimm12_shift (adjustment))
3854 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3855 true, Pmode);
3856 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3858 else
3859 emit_set_insn (reg2,
3860 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3862 /* Step 3: the loop
3866 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3867 probe at TEST_ADDR
3869 while (TEST_ADDR != LAST_ADDR)
3871 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3872 until it is equal to ROUNDED_SIZE. */
3874 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3877 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3878 that SIZE is equal to ROUNDED_SIZE. */
3880 if (size != rounded_size)
3882 HOST_WIDE_INT rem = size - rounded_size;
3884 if (rem > 256)
3886 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3888 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3889 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3891 else
3892 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3896 /* Make sure nothing is scheduled before we are done. */
3897 emit_insn (gen_blockage ());
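/* Worked example (illustrative only, with made-up values): for
   first == 4096 and size == 2048, the "size <= PROBE_INTERVAL" path above
   rounds size up to base == 4096, sets x9 = sp - (first + base) = sp - 8192
   and probes at x9 + (base - size), i.e. at sp - 6144 == sp - (first + size),
   emitting roughly:

       sub   x9, sp, 8192
       str   xzr, [x9, 2048]
   */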
3900 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3901 absolute addresses. */
3903 const char *
3904 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3906 static int labelno = 0;
3907 char loop_lab[32];
3908 rtx xops[2];
3910 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3912 /* Loop. */
3913 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3915 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3916 xops[0] = reg1;
3917 xops[1] = GEN_INT (PROBE_INTERVAL);
3918 output_asm_insn ("sub\t%0, %0, %1", xops);
3920 /* Probe at TEST_ADDR. */
3921 output_asm_insn ("str\txzr, [%0]", xops);
3923 /* Test if TEST_ADDR == LAST_ADDR. */
3924 xops[1] = reg2;
3925 output_asm_insn ("cmp\t%0, %1", xops);
3927 /* Branch. */
3928 fputs ("\tb.ne\t", asm_out_file);
3929 assemble_name_raw (asm_out_file, loop_lab);
3930 fputc ('\n', asm_out_file);
3932 return "";
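/* Illustrative output (assuming the default 4 kB probe interval and the
   scratch registers x9/x10 chosen above), the emitted loop looks roughly
   like:

   .LPSRL0:
       sub   x9, x9, 4096
       str   xzr, [x9]
       cmp   x9, x10
       b.ne  .LPSRL0
   */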
3935 /* Determine whether a frame chain needs to be generated. */
3936 static bool
3937 aarch64_needs_frame_chain (void)
3939 /* Force a frame chain for EH returns so the return address is at FP+8. */
3940 if (frame_pointer_needed || crtl->calls_eh_return)
3941 return true;
3943 /* A leaf function cannot have calls or write LR. */
3944 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
3946 /* Don't use a frame chain in leaf functions if leaf frame pointers
3947 are disabled. */
3948 if (flag_omit_leaf_frame_pointer && is_leaf)
3949 return false;
3951 return aarch64_use_frame_pointer;
3954 /* Mark the registers that need to be saved by the callee and calculate
3955 the size of the callee-saved registers area and frame record (both FP
3956 and LR may be omitted). */
3957 static void
3958 aarch64_layout_frame (void)
3960 HOST_WIDE_INT offset = 0;
3961 int regno, last_fp_reg = INVALID_REGNUM;
3963 if (reload_completed && cfun->machine->frame.laid_out)
3964 return;
3966 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
3968 #define SLOT_NOT_REQUIRED (-2)
3969 #define SLOT_REQUIRED (-1)
3971 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3972 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3974 /* First mark all the registers that really need to be saved... */
3975 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3976 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3978 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3979 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3981 /* ... that includes the eh data registers (if needed)... */
3982 if (crtl->calls_eh_return)
3983 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3984 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3985 = SLOT_REQUIRED;
3987 /* ... and any callee saved register that dataflow says is live. */
3988 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3989 if (df_regs_ever_live_p (regno)
3990 && (regno == R30_REGNUM
3991 || !call_used_regs[regno]))
3992 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3994 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3995 if (df_regs_ever_live_p (regno)
3996 && !call_used_regs[regno])
3998 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3999 last_fp_reg = regno;
4002 if (cfun->machine->frame.emit_frame_chain)
4004 /* FP and LR are placed in the linkage record. */
4005 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4006 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4007 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4008 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4009 offset = 2 * UNITS_PER_WORD;
4012 /* Now assign stack slots for them. */
4013 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4014 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4016 cfun->machine->frame.reg_offset[regno] = offset;
4017 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4018 cfun->machine->frame.wb_candidate1 = regno;
4019 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4020 cfun->machine->frame.wb_candidate2 = regno;
4021 offset += UNITS_PER_WORD;
4024 HOST_WIDE_INT max_int_offset = offset;
4025 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4026 bool has_align_gap = offset != max_int_offset;
4028 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4029 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4031 /* If there is an alignment gap between integer and fp callee-saves,
4032 allocate the last fp register to it if possible. */
4033 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4035 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4036 break;
4039 cfun->machine->frame.reg_offset[regno] = offset;
4040 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4041 cfun->machine->frame.wb_candidate1 = regno;
4042 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4043 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4044 cfun->machine->frame.wb_candidate2 = regno;
4045 offset += UNITS_PER_WORD;
4048 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4050 cfun->machine->frame.saved_regs_size = offset;
4052 HOST_WIDE_INT varargs_and_saved_regs_size
4053 = offset + cfun->machine->frame.saved_varargs_size;
4055 cfun->machine->frame.hard_fp_offset
4056 = aligned_upper_bound (varargs_and_saved_regs_size
4057 + get_frame_size (),
4058 STACK_BOUNDARY / BITS_PER_UNIT);
4060 /* Both these values are already aligned. */
4061 gcc_assert (multiple_p (crtl->outgoing_args_size,
4062 STACK_BOUNDARY / BITS_PER_UNIT));
4063 cfun->machine->frame.frame_size
4064 = (cfun->machine->frame.hard_fp_offset
4065 + crtl->outgoing_args_size);
4067 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4069 cfun->machine->frame.initial_adjust = 0;
4070 cfun->machine->frame.final_adjust = 0;
4071 cfun->machine->frame.callee_adjust = 0;
4072 cfun->machine->frame.callee_offset = 0;
4074 HOST_WIDE_INT max_push_offset = 0;
4075 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4076 max_push_offset = 512;
4077 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4078 max_push_offset = 256;
4080 HOST_WIDE_INT const_size, const_fp_offset;
4081 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4082 && const_size < max_push_offset
4083 && known_eq (crtl->outgoing_args_size, 0))
4085 /* Simple, small frame with no outgoing arguments:
4086 stp reg1, reg2, [sp, -frame_size]!
4087 stp reg3, reg4, [sp, 16] */
4088 cfun->machine->frame.callee_adjust = const_size;
4090 else if (known_lt (crtl->outgoing_args_size
4091 + cfun->machine->frame.saved_regs_size, 512)
4092 && !(cfun->calls_alloca
4093 && known_lt (cfun->machine->frame.hard_fp_offset,
4094 max_push_offset)))
4096 /* Frame with small outgoing arguments:
4097 sub sp, sp, frame_size
4098 stp reg1, reg2, [sp, outgoing_args_size]
4099 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4100 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4101 cfun->machine->frame.callee_offset
4102 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4104 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4105 && const_fp_offset < max_push_offset)
4107 /* Frame with large outgoing arguments but a small local area:
4108 stp reg1, reg2, [sp, -hard_fp_offset]!
4109 stp reg3, reg4, [sp, 16]
4110 sub sp, sp, outgoing_args_size */
4111 cfun->machine->frame.callee_adjust = const_fp_offset;
4112 cfun->machine->frame.final_adjust
4113 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4115 else
4117 /* Frame with large local area and outgoing arguments using frame pointer:
4118 sub sp, sp, hard_fp_offset
4119 stp x29, x30, [sp, 0]
4120 add x29, sp, 0
4121 stp reg3, reg4, [sp, 16]
4122 sub sp, sp, outgoing_args_size */
4123 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4124 cfun->machine->frame.final_adjust
4125 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4128 cfun->machine->frame.laid_out = true;
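/* Worked example (illustrative, not from the original source): a function
   that saves only x19 and x20 (saved_regs_size == 16), has 32 bytes of
   locals and no outgoing arguments gets frame_size == 48.  Since 48 is
   below max_push_offset and outgoing_args_size is zero, the first case
   above applies and callee_adjust == 48, so the prologue can open the
   whole frame with a single "stp x19, x20, [sp, -48]!".  */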
4131 /* Return true if the register REGNO is saved on entry to
4132 the current function. */
4134 static bool
4135 aarch64_register_saved_on_entry (int regno)
4137 return cfun->machine->frame.reg_offset[regno] >= 0;
4140 /* Return the next register, from REGNO up to LIMIT, that the callee
4141 needs to save. */
4143 static unsigned
4144 aarch64_next_callee_save (unsigned regno, unsigned limit)
4146 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4147 regno ++;
4148 return regno;
4151 /* Push the register number REGNO of mode MODE to the stack with write-back
4152 adjusting the stack by ADJUSTMENT. */
4154 static void
4155 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4156 HOST_WIDE_INT adjustment)
4158 rtx base_rtx = stack_pointer_rtx;
4159 rtx insn, reg, mem;
4161 reg = gen_rtx_REG (mode, regno);
4162 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4163 plus_constant (Pmode, base_rtx, -adjustment));
4164 mem = gen_frame_mem (mode, mem);
4166 insn = emit_move_insn (mem, reg);
4167 RTX_FRAME_RELATED_P (insn) = 1;
4170 /* Generate and return an instruction to store the pair of registers
4171 REG and REG2 of mode MODE to location BASE with write-back adjusting
4172 the stack location BASE by ADJUSTMENT. */
4174 static rtx
4175 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4176 HOST_WIDE_INT adjustment)
4178 switch (mode)
4180 case E_DImode:
4181 return gen_storewb_pairdi_di (base, base, reg, reg2,
4182 GEN_INT (-adjustment),
4183 GEN_INT (UNITS_PER_WORD - adjustment));
4184 case E_DFmode:
4185 return gen_storewb_pairdf_di (base, base, reg, reg2,
4186 GEN_INT (-adjustment),
4187 GEN_INT (UNITS_PER_WORD - adjustment));
4188 default:
4189 gcc_unreachable ();
4193 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4194 stack pointer by ADJUSTMENT. */
4196 static void
4197 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4199 rtx_insn *insn;
4200 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4202 if (regno2 == INVALID_REGNUM)
4203 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4205 rtx reg1 = gen_rtx_REG (mode, regno1);
4206 rtx reg2 = gen_rtx_REG (mode, regno2);
4208 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4209 reg2, adjustment));
4210 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4211 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4212 RTX_FRAME_RELATED_P (insn) = 1;
4215 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4216 adjusting it by ADJUSTMENT afterwards. */
4218 static rtx
4219 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4220 HOST_WIDE_INT adjustment)
4222 switch (mode)
4224 case E_DImode:
4225 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4226 GEN_INT (UNITS_PER_WORD));
4227 case E_DFmode:
4228 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4229 GEN_INT (UNITS_PER_WORD));
4230 default:
4231 gcc_unreachable ();
4235 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4236 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4237 into CFI_OPS. */
4239 static void
4240 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4241 rtx *cfi_ops)
4243 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4244 rtx reg1 = gen_rtx_REG (mode, regno1);
4246 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4248 if (regno2 == INVALID_REGNUM)
4250 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4251 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4252 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4254 else
4256 rtx reg2 = gen_rtx_REG (mode, regno2);
4257 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4258 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4259 reg2, adjustment));
4263 /* Generate and return a store pair instruction of mode MODE to store
4264 register REG1 to MEM1 and register REG2 to MEM2. */
4266 static rtx
4267 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4268 rtx reg2)
4270 switch (mode)
4272 case E_DImode:
4273 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4275 case E_DFmode:
4276 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4278 default:
4279 gcc_unreachable ();
4283 /* Generate and return a load pair instruction of mode MODE to load register
4284 REG1 from MEM1 and register REG2 from MEM2. */
4286 static rtx
4287 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4288 rtx mem2)
4290 switch (mode)
4292 case E_DImode:
4293 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4295 case E_DFmode:
4296 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4298 default:
4299 gcc_unreachable ();
4303 /* Return TRUE if return address signing should be enabled for the current
4304 function, otherwise return FALSE. */
4306 bool
4307 aarch64_return_address_signing_enabled (void)
4309 /* This function should only be called after the frame is laid out. */
4310 gcc_assert (cfun->machine->frame.laid_out);
4312 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4313 if its LR is pushed onto the stack. */
4314 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4315 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4316 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4319 /* Emit code to save the callee-saved registers from register number START
4320 to LIMIT to the stack at the location starting at offset START_OFFSET,
4321 skipping any write-back candidates if SKIP_WB is true. */
4323 static void
4324 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4325 unsigned start, unsigned limit, bool skip_wb)
4327 rtx_insn *insn;
4328 unsigned regno;
4329 unsigned regno2;
4331 for (regno = aarch64_next_callee_save (start, limit);
4332 regno <= limit;
4333 regno = aarch64_next_callee_save (regno + 1, limit))
4335 rtx reg, mem;
4336 poly_int64 offset;
4338 if (skip_wb
4339 && (regno == cfun->machine->frame.wb_candidate1
4340 || regno == cfun->machine->frame.wb_candidate2))
4341 continue;
4343 if (cfun->machine->reg_is_wrapped_separately[regno])
4344 continue;
4346 reg = gen_rtx_REG (mode, regno);
4347 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4348 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4349 offset));
4351 regno2 = aarch64_next_callee_save (regno + 1, limit);
4353 if (regno2 <= limit
4354 && !cfun->machine->reg_is_wrapped_separately[regno2]
4355 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4356 == cfun->machine->frame.reg_offset[regno2]))
4359 rtx reg2 = gen_rtx_REG (mode, regno2);
4360 rtx mem2;
4362 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4363 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4364 offset));
4365 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4366 reg2));
4368 /* The first part of a frame-related parallel insn is
4369 always assumed to be relevant to the frame
4370 calculations; subsequent parts are only
4371 frame-related if explicitly marked. */
4372 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4373 regno = regno2;
4375 else
4376 insn = emit_move_insn (mem, reg);
4378 RTX_FRAME_RELATED_P (insn) = 1;
4382 /* Emit code to restore the callee registers of mode MODE from register
4383 number START up to and including LIMIT. Restore from the stack offset
4384 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4385 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4387 static void
4388 aarch64_restore_callee_saves (machine_mode mode,
4389 poly_int64 start_offset, unsigned start,
4390 unsigned limit, bool skip_wb, rtx *cfi_ops)
4392 rtx base_rtx = stack_pointer_rtx;
4393 unsigned regno;
4394 unsigned regno2;
4395 poly_int64 offset;
4397 for (regno = aarch64_next_callee_save (start, limit);
4398 regno <= limit;
4399 regno = aarch64_next_callee_save (regno + 1, limit))
4401 if (cfun->machine->reg_is_wrapped_separately[regno])
4402 continue;
4404 rtx reg, mem;
4406 if (skip_wb
4407 && (regno == cfun->machine->frame.wb_candidate1
4408 || regno == cfun->machine->frame.wb_candidate2))
4409 continue;
4411 reg = gen_rtx_REG (mode, regno);
4412 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4413 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4415 regno2 = aarch64_next_callee_save (regno + 1, limit);
4417 if (regno2 <= limit
4418 && !cfun->machine->reg_is_wrapped_separately[regno2]
4419 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4420 == cfun->machine->frame.reg_offset[regno2]))
4422 rtx reg2 = gen_rtx_REG (mode, regno2);
4423 rtx mem2;
4425 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4426 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4427 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4429 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4430 regno = regno2;
4432 else
4433 emit_move_insn (reg, mem);
4434 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4438 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4439 of MODE. */
4441 static inline bool
4442 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4444 HOST_WIDE_INT multiple;
4445 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4446 && IN_RANGE (multiple, -8, 7));
4449 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4450 of MODE. */
4452 static inline bool
4453 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4455 HOST_WIDE_INT multiple;
4456 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4457 && IN_RANGE (multiple, 0, 63));
4460 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4461 of MODE. */
4463 bool
4464 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4466 HOST_WIDE_INT multiple;
4467 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4468 && IN_RANGE (multiple, -64, 63));
4471 /* Return true if OFFSET is a signed 9-bit value. */
4473 static inline bool
4474 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4475 poly_int64 offset)
4477 HOST_WIDE_INT const_offset;
4478 return (offset.is_constant (&const_offset)
4479 && IN_RANGE (const_offset, -256, 255));
4482 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4483 of MODE. */
4485 static inline bool
4486 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4488 HOST_WIDE_INT multiple;
4489 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4490 && IN_RANGE (multiple, -256, 255));
4493 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4494 of MODE. */
4496 static inline bool
4497 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4499 HOST_WIDE_INT multiple;
4500 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4501 && IN_RANGE (multiple, 0, 4095));
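/* Illustrative summary (not from the original source) of what the
   predicates above accept for a DImode (8-byte) access:
     - 4-bit signed scaled:      multiples of 8 in [-64, 56]
     - 6-bit unsigned scaled:    multiples of 8 in [0, 504]
     - 7-bit signed scaled:      multiples of 8 in [-512, 504]  (LDP/STP range)
     - 9-bit signed unscaled:    any byte offset in [-256, 255]
     - 9-bit signed scaled:      multiples of 8 in [-2048, 2040]
     - 12-bit unsigned scaled:   multiples of 8 in [0, 32760]   (LDR/STR range)
   */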
4504 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4506 static sbitmap
4507 aarch64_get_separate_components (void)
4509 aarch64_layout_frame ();
4511 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4512 bitmap_clear (components);
4514 /* The registers we need saved to the frame. */
4515 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4516 if (aarch64_register_saved_on_entry (regno))
4518 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4519 if (!frame_pointer_needed)
4520 offset += cfun->machine->frame.frame_size
4521 - cfun->machine->frame.hard_fp_offset;
4522 /* Check that we can access the stack slot of the register with one
4523 direct load with no adjustments needed. */
4524 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4525 bitmap_set_bit (components, regno);
4528 /* Don't mess with the hard frame pointer. */
4529 if (frame_pointer_needed)
4530 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4532 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4533 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4534 /* If aarch64_layout_frame has chosen registers to store/restore with
4535 writeback, don't interfere with them, to avoid having to output explicit
4536 stack adjustment instructions. */
4537 if (reg2 != INVALID_REGNUM)
4538 bitmap_clear_bit (components, reg2);
4539 if (reg1 != INVALID_REGNUM)
4540 bitmap_clear_bit (components, reg1);
4542 bitmap_clear_bit (components, LR_REGNUM);
4543 bitmap_clear_bit (components, SP_REGNUM);
4545 return components;
4548 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4550 static sbitmap
4551 aarch64_components_for_bb (basic_block bb)
4553 bitmap in = DF_LIVE_IN (bb);
4554 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4555 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4557 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4558 bitmap_clear (components);
4560 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4561 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4562 if ((!call_used_regs[regno])
4563 && (bitmap_bit_p (in, regno)
4564 || bitmap_bit_p (gen, regno)
4565 || bitmap_bit_p (kill, regno)))
4567 unsigned regno2, offset, offset2;
4568 bitmap_set_bit (components, regno);
4570 /* If there is a callee-save at an adjacent offset, add it too
4571 to increase the use of LDP/STP. */
4572 offset = cfun->machine->frame.reg_offset[regno];
4573 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4575 if (regno2 <= LAST_SAVED_REGNUM)
4577 offset2 = cfun->machine->frame.reg_offset[regno2];
4578 if ((offset & ~8) == (offset2 & ~8))
4579 bitmap_set_bit (components, regno2);
4583 return components;
4586 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4587 Nothing to do for aarch64. */
4589 static void
4590 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4594 /* Return the next set bit in BMP from START onwards. Return the total number
4595 of bits in BMP if no set bit is found at or after START. */
4597 static unsigned int
4598 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4600 unsigned int nbits = SBITMAP_SIZE (bmp);
4601 if (start == nbits)
4602 return start;
4604 gcc_assert (start < nbits);
4605 for (unsigned int i = start; i < nbits; i++)
4606 if (bitmap_bit_p (bmp, i))
4607 return i;
4609 return nbits;
4612 /* Do the work for aarch64_emit_prologue_components and
4613 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4614 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4615 for these components or the epilogue sequence. That is, it determines
4616 whether we should emit stores or loads and what kind of CFA notes to attach
4617 to the insns. Otherwise the logic for the two sequences is very
4618 similar. */
4620 static void
4621 aarch64_process_components (sbitmap components, bool prologue_p)
4623 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4624 ? HARD_FRAME_POINTER_REGNUM
4625 : STACK_POINTER_REGNUM);
4627 unsigned last_regno = SBITMAP_SIZE (components);
4628 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4629 rtx_insn *insn = NULL;
4631 while (regno != last_regno)
4633 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
4634 so DFmode for the vector registers is enough. */
4635 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4636 rtx reg = gen_rtx_REG (mode, regno);
4637 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4638 if (!frame_pointer_needed)
4639 offset += cfun->machine->frame.frame_size
4640 - cfun->machine->frame.hard_fp_offset;
4641 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4642 rtx mem = gen_frame_mem (mode, addr);
4644 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4645 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4646 /* No more registers to handle after REGNO.
4647 Emit a single save/restore and exit. */
4648 if (regno2 == last_regno)
4650 insn = emit_insn (set);
4651 RTX_FRAME_RELATED_P (insn) = 1;
4652 if (prologue_p)
4653 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4654 else
4655 add_reg_note (insn, REG_CFA_RESTORE, reg);
4656 break;
4659 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4660 /* The next register is not of the same class or its offset is not
4661 mergeable with the current one into a pair. */
4662 if (!satisfies_constraint_Ump (mem)
4663 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4664 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4665 GET_MODE_SIZE (mode)))
4667 insn = emit_insn (set);
4668 RTX_FRAME_RELATED_P (insn) = 1;
4669 if (prologue_p)
4670 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4671 else
4672 add_reg_note (insn, REG_CFA_RESTORE, reg);
4674 regno = regno2;
4675 continue;
4678 /* REGNO2 can be saved/restored in a pair with REGNO. */
4679 rtx reg2 = gen_rtx_REG (mode, regno2);
4680 if (!frame_pointer_needed)
4681 offset2 += cfun->machine->frame.frame_size
4682 - cfun->machine->frame.hard_fp_offset;
4683 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4684 rtx mem2 = gen_frame_mem (mode, addr2);
4685 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4686 : gen_rtx_SET (reg2, mem2);
4688 if (prologue_p)
4689 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4690 else
4691 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4693 RTX_FRAME_RELATED_P (insn) = 1;
4694 if (prologue_p)
4696 add_reg_note (insn, REG_CFA_OFFSET, set);
4697 add_reg_note (insn, REG_CFA_OFFSET, set2);
4699 else
4701 add_reg_note (insn, REG_CFA_RESTORE, reg);
4702 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4705 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4709 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4711 static void
4712 aarch64_emit_prologue_components (sbitmap components)
4714 aarch64_process_components (components, true);
4717 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4719 static void
4720 aarch64_emit_epilogue_components (sbitmap components)
4722 aarch64_process_components (components, false);
4725 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4727 static void
4728 aarch64_set_handled_components (sbitmap components)
4730 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4731 if (bitmap_bit_p (components, regno))
4732 cfun->machine->reg_is_wrapped_separately[regno] = true;
4735 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4736 is saved at BASE + OFFSET. */
4738 static void
4739 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4740 rtx base, poly_int64 offset)
4742 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4743 add_reg_note (insn, REG_CFA_EXPRESSION,
4744 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4747 /* AArch64 stack frames generated by this compiler look like:
4749 +-------------------------------+
4751 | incoming stack arguments |
4753 +-------------------------------+
4754 | | <-- incoming stack pointer (aligned)
4755 | callee-allocated save area |
4756 | for register varargs |
4758 +-------------------------------+
4759 | local variables | <-- frame_pointer_rtx
4761 +-------------------------------+
4762 | padding0 | \
4763 +-------------------------------+ |
4764 | callee-saved registers | | frame.saved_regs_size
4765 +-------------------------------+ |
4766 | LR' | |
4767 +-------------------------------+ |
4768 | FP' | / <- hard_frame_pointer_rtx (aligned)
4769 +-------------------------------+
4770 | dynamic allocation |
4771 +-------------------------------+
4772 | padding |
4773 +-------------------------------+
4774 | outgoing stack arguments | <-- arg_pointer
4776 +-------------------------------+
4777 | | <-- stack_pointer_rtx (aligned)
4779 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4780 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4781 unchanged. */
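/* Illustrative example (hypothetical function, not from the original
   source): with a frame chain, one extra callee-save (x19) and no
   outgoing arguments, the layout above typically materialises as:

       stp   x29, x30, [sp, -32]!    // frame record, with writeback
       mov   x29, sp                 // establish the frame chain
       str   x19, [sp, 16]           // remaining callee-save
   */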
4783 /* Generate the prologue instructions for entry into a function.
4784 Establish the stack frame by decreasing the stack pointer with a
4785 properly calculated size and, if necessary, create a frame record
4786 filled with the values of LR and previous frame pointer. The
4787 current FP is also set up if it is in use. */
4789 void
4790 aarch64_expand_prologue (void)
4792 aarch64_layout_frame ();
4794 poly_int64 frame_size = cfun->machine->frame.frame_size;
4795 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4796 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4797 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4798 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4799 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4800 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4801 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4802 rtx_insn *insn;
4804 /* Sign return address for functions. */
4805 if (aarch64_return_address_signing_enabled ())
4807 insn = emit_insn (gen_pacisp ());
4808 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4809 RTX_FRAME_RELATED_P (insn) = 1;
4812 if (flag_stack_usage_info)
4813 current_function_static_stack_size = constant_lower_bound (frame_size);
4815 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4817 if (crtl->is_leaf && !cfun->calls_alloca)
4819 if (maybe_gt (frame_size, PROBE_INTERVAL)
4820 && maybe_gt (frame_size, get_stack_check_protect ()))
4821 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4822 (frame_size
4823 - get_stack_check_protect ()));
4825 else if (maybe_gt (frame_size, 0))
4826 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4829 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4830 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4832 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4834 if (callee_adjust != 0)
4835 aarch64_push_regs (reg1, reg2, callee_adjust);
4837 if (emit_frame_chain)
4839 poly_int64 reg_offset = callee_adjust;
4840 if (callee_adjust == 0)
4842 reg1 = R29_REGNUM;
4843 reg2 = R30_REGNUM;
4844 reg_offset = callee_offset;
4845 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4847 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4848 stack_pointer_rtx, callee_offset,
4849 ip1_rtx, ip0_rtx, frame_pointer_needed);
4850 if (frame_pointer_needed && !frame_size.is_constant ())
4852 /* Variable-sized frames need to describe the save slot
4853 address using DW_CFA_expression rather than DW_CFA_offset.
4854 This means that, without taking further action, the
4855 locations of the registers that we've already saved would
4856 remain based on the stack pointer even after we redefine
4857 the CFA based on the frame pointer. We therefore need new
4858 DW_CFA_expressions to re-express the save slots with addresses
4859 based on the frame pointer. */
4860 rtx_insn *insn = get_last_insn ();
4861 gcc_assert (RTX_FRAME_RELATED_P (insn));
4863 /* Add an explicit CFA definition if this was previously
4864 implicit. */
4865 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4867 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4868 callee_offset);
4869 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4870 gen_rtx_SET (hard_frame_pointer_rtx, src));
4873 /* Change the save slot expressions for the registers that
4874 we've already saved. */
4875 reg_offset -= callee_offset;
4876 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4877 reg_offset + UNITS_PER_WORD);
4878 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4879 reg_offset);
4881 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4884 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4885 callee_adjust != 0 || emit_frame_chain);
4886 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4887 callee_adjust != 0 || emit_frame_chain);
4888 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4891 /* Return TRUE if we can use a simple_return insn.
4893 This function checks whether the callee-saved stack area is empty, which
4894 means no restore actions are needed. The pro_and_epilogue pass uses
4895 this to check whether shrink-wrapping is feasible. */
4897 bool
4898 aarch64_use_return_insn_p (void)
4900 if (!reload_completed)
4901 return false;
4903 if (crtl->profile)
4904 return false;
4906 aarch64_layout_frame ();
4908 return known_eq (cfun->machine->frame.frame_size, 0);
4911 /* Generate the epilogue instructions for returning from a function.
4912 This is almost exactly the reverse of the prologue sequence, except
4913 that we need to insert barriers to avoid scheduling loads that read
4914 from a deallocated stack, and we optimize the unwind records by
4915 emitting them all together if possible. */
4916 void
4917 aarch64_expand_epilogue (bool for_sibcall)
4919 aarch64_layout_frame ();
4921 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4922 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4923 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4924 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4925 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4926 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4927 rtx cfi_ops = NULL;
4928 rtx_insn *insn;
4929 /* A stack clash protection prologue may not have left IP0_REGNUM or
4930 IP1_REGNUM in a usable state. The same is true for allocations
4931 with an SVE component, since we then need both temporary registers
4932 for each allocation. */
4933 bool can_inherit_p = (initial_adjust.is_constant ()
4934 && final_adjust.is_constant ()
4935 && !flag_stack_clash_protection);
4937 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4938 bool need_barrier_p
4939 = maybe_ne (get_frame_size ()
4940 + cfun->machine->frame.saved_varargs_size, 0);
4942 /* Emit a barrier to prevent loads from a deallocated stack. */
4943 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4944 || cfun->calls_alloca
4945 || crtl->calls_eh_return)
4947 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4948 need_barrier_p = false;
4951 /* Restore the stack pointer from the frame pointer if it may not
4952 be the same as the stack pointer. */
4953 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4954 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4955 if (frame_pointer_needed
4956 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4957 /* If writeback is used when restoring callee-saves, the CFA
4958 is restored on the instruction doing the writeback. */
4959 aarch64_add_offset (Pmode, stack_pointer_rtx,
4960 hard_frame_pointer_rtx, -callee_offset,
4961 ip1_rtx, ip0_rtx, callee_adjust == 0);
4962 else
4963 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4964 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4966 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4967 callee_adjust != 0, &cfi_ops);
4968 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4969 callee_adjust != 0, &cfi_ops);
4971 if (need_barrier_p)
4972 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4974 if (callee_adjust != 0)
4975 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4977 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4979 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4980 insn = get_last_insn ();
4981 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4982 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4983 RTX_FRAME_RELATED_P (insn) = 1;
4984 cfi_ops = NULL;
4987 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4988 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4990 if (cfi_ops)
4992 /* Emit delayed restores and reset the CFA to be SP. */
4993 insn = get_last_insn ();
4994 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4995 REG_NOTES (insn) = cfi_ops;
4996 RTX_FRAME_RELATED_P (insn) = 1;
4999 /* We prefer to emit the combined return/authenticate instruction RETAA;
5000 however, there are three cases in which we must instead emit an explicit
5001 authentication instruction.
5003 1) Sibcalls don't return in a normal way, so if we're about to call one
5004 we must authenticate.
5006 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5007 generating code for !TARGET_ARMV8_3 we can't use it and must
5008 explicitly authenticate.
5010 3) On an eh_return path we make extra stack adjustments to update the
5011 canonical frame address to be the exception handler's CFA. We want
5012 to authenticate using the CFA of the function which calls eh_return. */
5014 if (aarch64_return_address_signing_enabled ()
5015 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5017 insn = emit_insn (gen_autisp ());
5018 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5019 RTX_FRAME_RELATED_P (insn) = 1;
5022 /* Stack adjustment for exception handler. */
5023 if (crtl->calls_eh_return)
5025 /* We need to unwind the stack by the offset computed by
5026 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5027 to be SP; letting the CFA move during this adjustment
5028 is just as correct as retaining the CFA from the body
5029 of the function. Therefore, do nothing special. */
5030 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5033 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5034 if (!for_sibcall)
5035 emit_jump_insn (ret_rtx);
5038 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5039 normally or return to a previous frame after unwinding.
5041 An EH return uses a single shared return sequence. The epilogue is
5042 exactly like a normal epilogue except that it has an extra input
5043 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5044 that must be applied after the frame has been destroyed. An extra label
5045 is inserted before the epilogue which initializes this register to zero,
5046 and this is the entry point for a normal return.
5048 An actual EH return updates the return address, initializes the stack
5049 adjustment and jumps directly into the epilogue (bypassing the zeroing
5050 of the adjustment). Since the return address is typically saved on the
5051 stack when a function makes a call, the saved LR must be updated outside
5052 the epilogue.
5054 This poses problems as the store is generated well before the epilogue,
5055 so the offset of LR is not known yet. Also optimizations will remove the
5056 store as it appears dead, even after the epilogue is generated (as the
5057 base or offset for loading LR is different in many cases).
5059 To avoid these problems this implementation forces the frame pointer
5060 in eh_return functions so that the location of LR is fixed and known early.
5061 It also marks the store volatile, so no optimization is permitted to
5062 remove the store. */
5063 rtx
5064 aarch64_eh_return_handler_rtx (void)
5066 rtx tmp = gen_frame_mem (Pmode,
5067 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5069 /* Mark the store volatile, so no optimization is permitted to remove it. */
5070 MEM_VOLATILE_P (tmp) = true;
5071 return tmp;
5074 /* Output code to add DELTA to the first argument, and then jump
5075 to FUNCTION. Used for C++ multiple inheritance. */
5076 static void
5077 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5078 HOST_WIDE_INT delta,
5079 HOST_WIDE_INT vcall_offset,
5080 tree function)
5082 /* The this pointer is always in x0. Note that this differs from
5083 Arm where the this pointer may be bumped to r1 if r0 is required
5084 to return a pointer to an aggregate. On AArch64 a result value
5085 pointer will be in x8. */
5086 int this_regno = R0_REGNUM;
5087 rtx this_rtx, temp0, temp1, addr, funexp;
5088 rtx_insn *insn;
5090 reload_completed = 1;
5091 emit_note (NOTE_INSN_PROLOGUE_END);
5093 this_rtx = gen_rtx_REG (Pmode, this_regno);
5094 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5095 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5097 if (vcall_offset == 0)
5098 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5099 else
5101 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5103 addr = this_rtx;
5104 if (delta != 0)
5106 if (delta >= -256 && delta < 256)
5107 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5108 plus_constant (Pmode, this_rtx, delta));
5109 else
5110 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5111 temp1, temp0, false);
5114 if (Pmode == ptr_mode)
5115 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5116 else
5117 aarch64_emit_move (temp0,
5118 gen_rtx_ZERO_EXTEND (Pmode,
5119 gen_rtx_MEM (ptr_mode, addr)));
5121 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5122 addr = plus_constant (Pmode, temp0, vcall_offset);
5123 else
5125 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5126 Pmode);
5127 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5130 if (Pmode == ptr_mode)
5131 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5132 else
5133 aarch64_emit_move (temp1,
5134 gen_rtx_SIGN_EXTEND (Pmode,
5135 gen_rtx_MEM (ptr_mode, addr)));
5137 emit_insn (gen_add2_insn (this_rtx, temp1));
5140 /* Generate a tail call to the target function. */
5141 if (!TREE_USED (function))
5143 assemble_external (function);
5144 TREE_USED (function) = 1;
5146 funexp = XEXP (DECL_RTL (function), 0);
5147 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5148 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5149 SIBLING_CALL_P (insn) = 1;
5151 insn = get_insns ();
5152 shorten_branches (insn);
5153 final_start_function (insn, file, 1);
5154 final (insn, file, 1);
5155 final_end_function ();
5157 /* Stop pretending to be a post-reload pass. */
5158 reload_completed = 0;
5161 static bool
5162 aarch64_tls_referenced_p (rtx x)
5164 if (!TARGET_HAVE_TLS)
5165 return false;
5166 subrtx_iterator::array_type array;
5167 FOR_EACH_SUBRTX (iter, array, x, ALL)
5169 const_rtx x = *iter;
5170 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5171 return true;
5172 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5173 TLS offsets, not real symbol references. */
5174 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5175 iter.skip_subrtxes ();
5177 return false;
5181 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5182 a left shift of 0 or 12 bits. */
5183 bool
5184 aarch64_uimm12_shift (HOST_WIDE_INT val)
5186 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5187 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
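/* Examples (illustrative): 0xabc and 0xabc000 both satisfy this test
   (all set bits lie within bits [0,11] or within bits [12,23]
   respectively), while 0xabc001 does not, since its set bits straddle
   both halves.  */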
5192 /* Return true if val is an immediate that can be loaded into a
5193 register by a MOVZ instruction. */
5194 static bool
5195 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5197 if (GET_MODE_SIZE (mode) > 4)
5199 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5200 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5201 return 1;
5203 else
5205 /* Ignore sign extension. */
5206 val &= (HOST_WIDE_INT) 0xffffffff;
5208 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5209 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
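/* Examples (illustrative): for DImode, 0xbeef, 0xbeef0000,
   0xbeef00000000 and 0xbeef000000000000 are each a single 16-bit chunk
   at an aligned position and so can be built with one MOVZ (LSL #0, #16,
   #32 or #48), whereas 0x0001000000000002 has bits in two chunks and
   needs more than one instruction.  */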
5212 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5213 64-bit (DImode) integer. */
5215 static unsigned HOST_WIDE_INT
5216 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5218 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5219 while (size < 64)
5221 val &= (HOST_WIDE_INT_1U << size) - 1;
5222 val |= val << size;
5223 size *= 2;
5225 return val;
5228 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5230 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5232 0x0000000100000001ull,
5233 0x0001000100010001ull,
5234 0x0101010101010101ull,
5235 0x1111111111111111ull,
5236 0x5555555555555555ull,
5240 /* Return true if val is a valid bitmask immediate. */
5242 bool
5243 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5245 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5246 int bits;
5248 /* Check for a single sequence of one bits and return quickly if so.
5249 The special cases of all ones and all zeroes return false. */
5250 val = aarch64_replicate_bitmask_imm (val_in, mode);
5251 tmp = val + (val & -val);
5253 if (tmp == (tmp & -tmp))
5254 return (val + 1) > 1;
5256 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5257 if (mode == SImode)
5258 val = (val << 32) | (val & 0xffffffff);
5260 /* Invert if the immediate doesn't start with a zero bit - this means we
5261 only need to search for sequences of one bits. */
5262 if (val & 1)
5263 val = ~val;
5265 /* Find the first set bit and set tmp to val with the first sequence of one
5266 bits removed. Return success if there is a single sequence of ones. */
5267 first_one = val & -val;
5268 tmp = val & (val + first_one);
5270 if (tmp == 0)
5271 return true;
5273 /* Find the next set bit and compute the difference in bit position. */
5274 next_one = tmp & -tmp;
5275 bits = clz_hwi (first_one) - clz_hwi (next_one);
5276 mask = val ^ tmp;
5278 /* Check the bit position difference is a power of 2, and that the first
5279 sequence of one bits fits within 'bits' bits. */
5280 if ((mask >> bits) != 0 || bits != (bits & -bits))
5281 return false;
5283 /* Check the sequence of one bits is repeated 64/bits times. */
5284 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
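/* Worked example (illustrative): 0x00ff00ff00ff00ff is accepted.  After
   inverting (the value starts with a one bit) we get 0xff00ff00ff00ff00;
   the first run of ones starts at bit 8 and the next at bit 24, so
   bits == 16, mask == 0xff00, and mask * 0x0001000100010001 reproduces
   the inverted value, i.e. the pattern repeats every 16 bits.  A single
   contiguous run such as 0x0000fff0 is accepted by the quick check at
   the top, while 0 and ~0 are rejected there.  */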
5287 /* Create a mask of ones covering the lowest to highest bits set in VAL_IN.
5288 Assumed precondition: VAL_IN is not zero. */
5290 unsigned HOST_WIDE_INT
5291 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5293 int lowest_bit_set = ctz_hwi (val_in);
5294 int highest_bit_set = floor_log2 (val_in);
5295 gcc_assert (val_in != 0);
5297 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5298 (HOST_WIDE_INT_1U << lowest_bit_set));
5301 /* Create a constant in which all bits outside the range from the lowest
5302 set bit to the highest set bit of VAL_IN are set to 1. */
5304 unsigned HOST_WIDE_INT
5305 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5307 return val_in | ~aarch64_and_split_imm1 (val_in);
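/* Worked example (illustrative): VAL_IN == 0x00fff0f0 is neither a
   bitmask immediate nor a MOV immediate, but imm1 == 0x00fffff0 (the
   full span from bit 4 to bit 23) and imm2 == 0xfffffffffffff0ff
   (everything except the "hole" at bits 8-11) both are, and
   (x & imm1) & imm2 == x & 0x00fff0f0, so the AND can be done with two
   AND-immediate instructions.  */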
5310 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5312 bool
5313 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5315 scalar_int_mode int_mode;
5316 if (!is_a <scalar_int_mode> (mode, &int_mode))
5317 return false;
5319 if (aarch64_bitmask_imm (val_in, int_mode))
5320 return false;
5322 if (aarch64_move_imm (val_in, int_mode))
5323 return false;
5325 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5327 return aarch64_bitmask_imm (imm2, int_mode);
5330 /* Return true if val is an immediate that can be loaded into a
5331 register in a single instruction. */
5332 bool
5333 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5335 scalar_int_mode int_mode;
5336 if (!is_a <scalar_int_mode> (mode, &int_mode))
5337 return false;
5339 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5340 return 1;
5341 return aarch64_bitmask_imm (val, int_mode);
5344 static bool
5345 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5347 rtx base, offset;
5349 if (GET_CODE (x) == HIGH)
5350 return true;
5352 /* There's no way to calculate VL-based values using relocations. */
5353 subrtx_iterator::array_type array;
5354 FOR_EACH_SUBRTX (iter, array, x, ALL)
5355 if (GET_CODE (*iter) == CONST_POLY_INT)
5356 return true;
5358 split_const (x, &base, &offset);
5359 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5361 if (aarch64_classify_symbol (base, INTVAL (offset))
5362 != SYMBOL_FORCE_TO_MEM)
5363 return true;
5364 else
5365 /* Avoid generating a 64-bit relocation in ILP32; leave it
5366 to aarch64_expand_mov_immediate to handle it properly. */
5367 return mode != ptr_mode;
5370 return aarch64_tls_referenced_p (x);
5373 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5374 The expansion for a table switch is quite expensive due to the number
5375 of instructions, the table lookup and the hard-to-predict indirect jump.
5376 When optimizing for speed with -O3 enabled, use the per-core tuning if
5377 set; otherwise use tables for > 16 cases as a tradeoff between size and
5378 performance. When optimizing for size, use the default setting. */
5380 static unsigned int
5381 aarch64_case_values_threshold (void)
5383 /* Use the specified limit for the number of cases before using jump
5384 tables at higher optimization levels. */
5385 if (optimize > 2
5386 && selected_cpu->tune->max_case_values != 0)
5387 return selected_cpu->tune->max_case_values;
5388 else
5389 return optimize_size ? default_case_values_threshold () : 17;
5392 /* Return true if register REGNO is a valid index register.
5393 STRICT_P is true if REG_OK_STRICT is in effect. */
5395 bool
5396 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5398 if (!HARD_REGISTER_NUM_P (regno))
5400 if (!strict_p)
5401 return true;
5403 if (!reg_renumber)
5404 return false;
5406 regno = reg_renumber[regno];
5408 return GP_REGNUM_P (regno);
5411 /* Return true if register REGNO is a valid base register for mode MODE.
5412 STRICT_P is true if REG_OK_STRICT is in effect. */
5414 bool
5415 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5417 if (!HARD_REGISTER_NUM_P (regno))
5419 if (!strict_p)
5420 return true;
5422 if (!reg_renumber)
5423 return false;
5425 regno = reg_renumber[regno];
5428 /* The fake registers will be eliminated to either the stack or
5429 hard frame pointer, both of which are usually valid base registers.
5430 Reload deals with the cases where the eliminated form isn't valid. */
5431 return (GP_REGNUM_P (regno)
5432 || regno == SP_REGNUM
5433 || regno == FRAME_POINTER_REGNUM
5434 || regno == ARG_POINTER_REGNUM);
5437 /* Return true if X is a valid base register for mode MODE.
5438 STRICT_P is true if REG_OK_STRICT is in effect. */
5440 static bool
5441 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5443 if (!strict_p
5444 && GET_CODE (x) == SUBREG
5445 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5446 x = SUBREG_REG (x);
5448 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5451 /* Return true if address offset is a valid index. If it is, fill in INFO
5452 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5454 static bool
5455 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5456 machine_mode mode, bool strict_p)
5458 enum aarch64_address_type type;
5459 rtx index;
5460 int shift;
5462 /* (reg:P) */
5463 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5464 && GET_MODE (x) == Pmode)
5466 type = ADDRESS_REG_REG;
5467 index = x;
5468 shift = 0;
5470 /* (sign_extend:DI (reg:SI)) */
5471 else if ((GET_CODE (x) == SIGN_EXTEND
5472 || GET_CODE (x) == ZERO_EXTEND)
5473 && GET_MODE (x) == DImode
5474 && GET_MODE (XEXP (x, 0)) == SImode)
5476 type = (GET_CODE (x) == SIGN_EXTEND)
5477 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5478 index = XEXP (x, 0);
5479 shift = 0;
5481 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5482 else if (GET_CODE (x) == MULT
5483 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5484 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5485 && GET_MODE (XEXP (x, 0)) == DImode
5486 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5487 && CONST_INT_P (XEXP (x, 1)))
5489 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5490 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5491 index = XEXP (XEXP (x, 0), 0);
5492 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5494 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5495 else if (GET_CODE (x) == ASHIFT
5496 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5497 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5498 && GET_MODE (XEXP (x, 0)) == DImode
5499 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5500 && CONST_INT_P (XEXP (x, 1)))
5502 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5503 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5504 index = XEXP (XEXP (x, 0), 0);
5505 shift = INTVAL (XEXP (x, 1));
5507 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5508 else if ((GET_CODE (x) == SIGN_EXTRACT
5509 || GET_CODE (x) == ZERO_EXTRACT)
5510 && GET_MODE (x) == DImode
5511 && GET_CODE (XEXP (x, 0)) == MULT
5512 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5513 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5515 type = (GET_CODE (x) == SIGN_EXTRACT)
5516 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5517 index = XEXP (XEXP (x, 0), 0);
5518 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5519 if (INTVAL (XEXP (x, 1)) != 32 + shift
5520 || INTVAL (XEXP (x, 2)) != 0)
5521 shift = -1;
5523 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5524 (const_int 0xffffffff<<shift)) */
5525 else if (GET_CODE (x) == AND
5526 && GET_MODE (x) == DImode
5527 && GET_CODE (XEXP (x, 0)) == MULT
5528 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5529 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5530 && CONST_INT_P (XEXP (x, 1)))
5532 type = ADDRESS_REG_UXTW;
5533 index = XEXP (XEXP (x, 0), 0);
5534 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5535 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5536 shift = -1;
5538 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5539 else if ((GET_CODE (x) == SIGN_EXTRACT
5540 || GET_CODE (x) == ZERO_EXTRACT)
5541 && GET_MODE (x) == DImode
5542 && GET_CODE (XEXP (x, 0)) == ASHIFT
5543 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5544 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5546 type = (GET_CODE (x) == SIGN_EXTRACT)
5547 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5548 index = XEXP (XEXP (x, 0), 0);
5549 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5550 if (INTVAL (XEXP (x, 1)) != 32 + shift
5551 || INTVAL (XEXP (x, 2)) != 0)
5552 shift = -1;
5554 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5555 (const_int 0xffffffff<<shift)) */
5556 else if (GET_CODE (x) == AND
5557 && GET_MODE (x) == DImode
5558 && GET_CODE (XEXP (x, 0)) == ASHIFT
5559 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5560 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5561 && CONST_INT_P (XEXP (x, 1)))
5563 type = ADDRESS_REG_UXTW;
5564 index = XEXP (XEXP (x, 0), 0);
5565 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5566 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5567 shift = -1;
5569 /* (mult:P (reg:P) (const_int scale)) */
5570 else if (GET_CODE (x) == MULT
5571 && GET_MODE (x) == Pmode
5572 && GET_MODE (XEXP (x, 0)) == Pmode
5573 && CONST_INT_P (XEXP (x, 1)))
5575 type = ADDRESS_REG_REG;
5576 index = XEXP (x, 0);
5577 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5579 /* (ashift:P (reg:P) (const_int shift)) */
5580 else if (GET_CODE (x) == ASHIFT
5581 && GET_MODE (x) == Pmode
5582 && GET_MODE (XEXP (x, 0)) == Pmode
5583 && CONST_INT_P (XEXP (x, 1)))
5585 type = ADDRESS_REG_REG;
5586 index = XEXP (x, 0);
5587 shift = INTVAL (XEXP (x, 1));
5589 else
5590 return false;
5592 if (!strict_p
5593 && GET_CODE (index) == SUBREG
5594 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5595 index = SUBREG_REG (index);
5597 if (aarch64_sve_data_mode_p (mode))
5599 if (type != ADDRESS_REG_REG
5600 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5601 return false;
5603 else
5605 if (shift != 0
5606 && !(IN_RANGE (shift, 1, 3)
5607 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5608 return false;
5611 if (REG_P (index)
5612 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5614 info->type = type;
5615 info->offset = index;
5616 info->shift = shift;
5617 return true;
5620 return false;
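/* Illustrative sketch (not part of the original source): for a DImode
   access, an index rtx of the form

     (ashift:DI (reg:DI x1) (const_int 3))

   is classified above as ADDRESS_REG_REG with shift 3, since the shift
   amount is in [1, 3] and 1 << 3 matches the 8-byte access size; it
   corresponds to an address operand such as [x0, x1, lsl 3].  A shift
   of 2 with the same DImode access would fail the size check.  */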
5623 /* Return true if MODE is one of the modes for which we
5624 support LDP/STP operations. */
5626 static bool
5627 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5629 return mode == SImode || mode == DImode
5630 || mode == SFmode || mode == DFmode
5631 || (aarch64_vector_mode_supported_p (mode)
5632 && (known_eq (GET_MODE_SIZE (mode), 8)
5633 || (known_eq (GET_MODE_SIZE (mode), 16)
5634 && (aarch64_tune_params.extra_tuning_flags
5635 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5638 /* Return true if REGNO is a virtual pointer register, or an eliminable
5639 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5640 include stack_pointer or hard_frame_pointer. */
5641 static bool
5642 virt_or_elim_regno_p (unsigned regno)
5644 return ((regno >= FIRST_VIRTUAL_REGISTER
5645 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5646 || regno == FRAME_POINTER_REGNUM
5647 || regno == ARG_POINTER_REGNUM);
5650 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5651 If it is, fill in INFO appropriately. STRICT_P is true if
5652 REG_OK_STRICT is in effect. */
5654 static bool
5655 aarch64_classify_address (struct aarch64_address_info *info,
5656 rtx x, machine_mode mode, bool strict_p,
5657 aarch64_addr_query_type type = ADDR_QUERY_M)
5659 enum rtx_code code = GET_CODE (x);
5660 rtx op0, op1;
5661 poly_int64 offset;
5663 HOST_WIDE_INT const_size;
5665 /* On BE, we use load/store pair for all large int mode load/stores.
5666 TI/TFmode may also use a load/store pair. */
5667 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5668 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5669 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5670 || type == ADDR_QUERY_LDP_STP_N
5671 || mode == TImode
5672 || mode == TFmode
5673 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5675 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
5676 corresponds to the actual size of the memory being loaded/stored and
5677 the mode used to check the addressing mode is half of that. */
5678 if (type == ADDR_QUERY_LDP_STP_N
5679 && known_eq (GET_MODE_SIZE (mode), 16))
5680 mode = DFmode;
5682 bool allow_reg_index_p = (!load_store_pair_p
5683 && (known_lt (GET_MODE_SIZE (mode), 16)
5684 || vec_flags == VEC_ADVSIMD
5685 || vec_flags == VEC_SVE_DATA));
5687 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5688 [Rn, #offset, MUL VL]. */
5689 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5690 && (code != REG && code != PLUS))
5691 return false;
5693 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5694 REG addressing. */
5695 if (advsimd_struct_p
5696 && !BYTES_BIG_ENDIAN
5697 && (code != POST_INC && code != REG))
5698 return false;
5700 gcc_checking_assert (GET_MODE (x) == VOIDmode
5701 || SCALAR_INT_MODE_P (GET_MODE (x)));
5703 switch (code)
5705 case REG:
5706 case SUBREG:
5707 info->type = ADDRESS_REG_IMM;
5708 info->base = x;
5709 info->offset = const0_rtx;
5710 info->const_offset = 0;
5711 return aarch64_base_register_rtx_p (x, strict_p);
5713 case PLUS:
5714 op0 = XEXP (x, 0);
5715 op1 = XEXP (x, 1);
5717 if (! strict_p
5718 && REG_P (op0)
5719 && virt_or_elim_regno_p (REGNO (op0))
5720 && poly_int_rtx_p (op1, &offset))
5722 info->type = ADDRESS_REG_IMM;
5723 info->base = op0;
5724 info->offset = op1;
5725 info->const_offset = offset;
5727 return true;
5730 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5731 && aarch64_base_register_rtx_p (op0, strict_p)
5732 && poly_int_rtx_p (op1, &offset))
5734 info->type = ADDRESS_REG_IMM;
5735 info->base = op0;
5736 info->offset = op1;
5737 info->const_offset = offset;
5739 /* TImode and TFmode values are allowed in both pairs of X
5740 registers and individual Q registers. The available
5741 address modes are:
5742 X,X: 7-bit signed scaled offset
5743 Q: 9-bit signed offset
5744 We conservatively require an offset representable in either mode.
5745 When performing the check for pairs of X registers i.e. LDP/STP
5746 pass down DImode since that is the natural size of the LDP/STP
5747 instruction memory accesses. */
5748 if (mode == TImode || mode == TFmode)
5749 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5750 && (offset_9bit_signed_unscaled_p (mode, offset)
5751 || offset_12bit_unsigned_scaled_p (mode, offset)));
5753 /* A 7-bit offset check because OImode will emit an ldp/stp
5754 instruction (only big endian will get here).
5755 For ldp/stp instructions, the offset is scaled for the size of a
5756 single element of the pair. */
5757 if (mode == OImode)
5758 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5760 /* Three 9/12-bit offset checks because CImode will emit three
5761 ldr/str instructions (only big endian will get here). */
5762 if (mode == CImode)
5763 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5764 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5765 || offset_12bit_unsigned_scaled_p (V16QImode,
5766 offset + 32)));
5768 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5769 instructions (only big endian will get here). */
5770 if (mode == XImode)
5771 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5772 && aarch64_offset_7bit_signed_scaled_p (TImode,
5773 offset + 32));
5775 /* Make "m" use the LD1 offset range for SVE data modes, so
5776 that pre-RTL optimizers like ivopts will work to that range
5777 instead of the wider LDR/STR range. */
5778 if (vec_flags == VEC_SVE_DATA)
5779 return (type == ADDR_QUERY_M
5780 ? offset_4bit_signed_scaled_p (mode, offset)
5781 : offset_9bit_signed_scaled_p (mode, offset));
5783 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5785 poly_int64 end_offset = (offset
5786 + GET_MODE_SIZE (mode)
5787 - BYTES_PER_SVE_VECTOR);
5788 return (type == ADDR_QUERY_M
5789 ? offset_4bit_signed_scaled_p (mode, offset)
5790 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5791 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5792 end_offset)));
5795 if (vec_flags == VEC_SVE_PRED)
5796 return offset_9bit_signed_scaled_p (mode, offset);
5798 if (load_store_pair_p)
5799 return ((known_eq (GET_MODE_SIZE (mode), 4)
5800 || known_eq (GET_MODE_SIZE (mode), 8)
5801 || known_eq (GET_MODE_SIZE (mode), 16))
5802 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5803 else
5804 return (offset_9bit_signed_unscaled_p (mode, offset)
5805 || offset_12bit_unsigned_scaled_p (mode, offset));
5808 if (allow_reg_index_p)
5810 /* Look for base + (scaled/extended) index register. */
5811 if (aarch64_base_register_rtx_p (op0, strict_p)
5812 && aarch64_classify_index (info, op1, mode, strict_p))
5814 info->base = op0;
5815 return true;
5817 if (aarch64_base_register_rtx_p (op1, strict_p)
5818 && aarch64_classify_index (info, op0, mode, strict_p))
5820 info->base = op1;
5821 return true;
5825 return false;
5827 case POST_INC:
5828 case POST_DEC:
5829 case PRE_INC:
5830 case PRE_DEC:
5831 info->type = ADDRESS_REG_WB;
5832 info->base = XEXP (x, 0);
5833 info->offset = NULL_RTX;
5834 return aarch64_base_register_rtx_p (info->base, strict_p);
5836 case POST_MODIFY:
5837 case PRE_MODIFY:
5838 info->type = ADDRESS_REG_WB;
5839 info->base = XEXP (x, 0);
5840 if (GET_CODE (XEXP (x, 1)) == PLUS
5841 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5842 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5843 && aarch64_base_register_rtx_p (info->base, strict_p))
5845 info->offset = XEXP (XEXP (x, 1), 1);
5846 info->const_offset = offset;
5848 /* TImode and TFmode values are allowed in both pairs of X
5849 registers and individual Q registers. The available
5850 address modes are:
5851 X,X: 7-bit signed scaled offset
5852 Q: 9-bit signed offset
5853 We conservatively require an offset representable in either mode. */
5855 if (mode == TImode || mode == TFmode)
5856 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5857 && offset_9bit_signed_unscaled_p (mode, offset));
5859 if (load_store_pair_p)
5860 return ((known_eq (GET_MODE_SIZE (mode), 4)
5861 || known_eq (GET_MODE_SIZE (mode), 8)
5862 || known_eq (GET_MODE_SIZE (mode), 16))
5863 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5864 else
5865 return offset_9bit_signed_unscaled_p (mode, offset);
5867 return false;
5869 case CONST:
5870 case SYMBOL_REF:
5871 case LABEL_REF:
5872 /* load literal: pc-relative constant pool entry. Only supported
5873 for SI mode or larger. */
5874 info->type = ADDRESS_SYMBOLIC;
5876 if (!load_store_pair_p
5877 && GET_MODE_SIZE (mode).is_constant (&const_size)
5878 && const_size >= 4)
5880 rtx sym, addend;
5882 split_const (x, &sym, &addend);
5883 return ((GET_CODE (sym) == LABEL_REF
5884 || (GET_CODE (sym) == SYMBOL_REF
5885 && CONSTANT_POOL_ADDRESS_P (sym)
5886 && aarch64_pcrelative_literal_loads)));
5888 return false;
5890 case LO_SUM:
5891 info->type = ADDRESS_LO_SUM;
5892 info->base = XEXP (x, 0);
5893 info->offset = XEXP (x, 1);
5894 if (allow_reg_index_p
5895 && aarch64_base_register_rtx_p (info->base, strict_p))
5897 rtx sym, offs;
5898 split_const (info->offset, &sym, &offs);
5899 if (GET_CODE (sym) == SYMBOL_REF
5900 && (aarch64_classify_symbol (sym, INTVAL (offs))
5901 == SYMBOL_SMALL_ABSOLUTE))
5903 /* The symbol and offset must be aligned to the access size. */
5904 unsigned int align;
5906 if (CONSTANT_POOL_ADDRESS_P (sym))
5907 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5908 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5910 tree exp = SYMBOL_REF_DECL (sym);
5911 align = TYPE_ALIGN (TREE_TYPE (exp));
5912 align = aarch64_constant_alignment (exp, align);
5914 else if (SYMBOL_REF_DECL (sym))
5915 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5916 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5917 && SYMBOL_REF_BLOCK (sym) != NULL)
5918 align = SYMBOL_REF_BLOCK (sym)->alignment;
5919 else
5920 align = BITS_PER_UNIT;
5922 poly_int64 ref_size = GET_MODE_SIZE (mode);
5923 if (known_eq (ref_size, 0))
5924 ref_size = GET_MODE_SIZE (DImode);
5926 return (multiple_p (INTVAL (offs), ref_size)
5927 && multiple_p (align / BITS_PER_UNIT, ref_size));
5930 return false;
5932 default:
5933 return false;
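/* A minimal usage sketch (illustrative only, not part of the original
   source): classifying a simple base-plus-offset address.  For example,
   x0 + 16 with a DImode access lands in the PLUS case above and is
   accepted as ADDRESS_REG_IMM, because 16 lies within both the signed
   9-bit unscaled and the scaled 12-bit unsigned offset ranges:

     struct aarch64_address_info info;
     rtx addr = plus_constant (Pmode, gen_rtx_REG (Pmode, R0_REGNUM), 16);
     if (aarch64_classify_address (&info, addr, DImode, false))
       gcc_checking_assert (info.type == ADDRESS_REG_IMM
			    && known_eq (info.const_offset, 16));  */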
5937 /* Return true if the address X is valid for a PRFM instruction.
5938 STRICT_P is true if we should do strict checking with
5939 aarch64_classify_address. */
5941 bool
5942 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5944 struct aarch64_address_info addr;
5946 /* PRFM accepts the same addresses as DImode... */
5947 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5948 if (!res)
5949 return false;
5951 /* ... except writeback forms. */
5952 return addr.type != ADDRESS_REG_WB;
5955 bool
5956 aarch64_symbolic_address_p (rtx x)
5958 rtx offset;
5960 split_const (x, &x, &offset);
5961 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5964 /* Classify the base of symbolic expression X. */
5966 enum aarch64_symbol_type
5967 aarch64_classify_symbolic_expression (rtx x)
5969 rtx offset;
5971 split_const (x, &x, &offset);
5972 return aarch64_classify_symbol (x, INTVAL (offset));
5976 /* Return TRUE if X is a legitimate address for accessing memory in
5977 mode MODE. */
5978 static bool
5979 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5981 struct aarch64_address_info addr;
5983 return aarch64_classify_address (&addr, x, mode, strict_p);
5986 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5987 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5988 bool
5989 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5990 aarch64_addr_query_type type)
5992 struct aarch64_address_info addr;
5994 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5997 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5999 static bool
6000 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6001 poly_int64 orig_offset,
6002 machine_mode mode)
6004 HOST_WIDE_INT size;
6005 if (GET_MODE_SIZE (mode).is_constant (&size))
6007 HOST_WIDE_INT const_offset, second_offset;
6009 /* A general SVE offset is A * VQ + B. Remove the A component from
6010 coefficient 0 in order to get the constant B. */
6011 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6013 /* Split an out-of-range address displacement into a base and
6014 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6015 range otherwise to increase opportunities for sharing the base
6016 address between accesses of different sizes. Unaligned accesses use
6017 the signed 9-bit range, while TImode/TFmode use the intersection of signed
6018 scaled 7-bit and signed 9-bit offset. */
6019 if (mode == TImode || mode == TFmode)
6020 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6021 else if ((const_offset & (size - 1)) != 0)
6022 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6023 else
6024 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6026 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6027 return false;
6029 /* Split the offset into second_offset and the rest. */
6030 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6031 *offset2 = gen_int_mode (second_offset, Pmode);
6032 return true;
6034 else
6036 /* Get the mode we should use as the basis of the range. For structure
6037 modes this is the mode of one vector. */
6038 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6039 machine_mode step_mode
6040 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6042 /* Get the "mul vl" multiplier we'd like to use. */
6043 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6044 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6045 if (vec_flags & VEC_SVE_DATA)
6046 /* LDR supports a 9-bit range, but the move patterns for
6047 structure modes require all vectors to be in range of the
6048 same base. The simplest way of accommodating that while still
6049 promoting reuse of anchor points between different modes is
6050 to use an 8-bit range unconditionally. */
6051 vnum = ((vnum + 128) & 255) - 128;
6052 else
6053 /* Predicates are only handled singly, so we might as well use
6054 the full range. */
6055 vnum = ((vnum + 256) & 511) - 256;
6056 if (vnum == 0)
6057 return false;
6059 /* Convert the "mul vl" multiplier into a byte offset. */
6060 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6061 if (known_eq (second_offset, orig_offset))
6062 return false;
6064 /* Split the offset into second_offset and the rest. */
6065 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6066 *offset2 = gen_int_mode (second_offset, Pmode);
6067 return true;
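/* Worked example (illustrative only, not part of the original source):
   for a DImode access with a constant offset of 0x4008, the aligned
   branch above uses the 16KB mask, so second_offset = 0x4008 & 0x3ffc
   = 8, and the caller can rebuild the address as (base + 0x4000) + 8,
   keeping the large part in a base register that other accesses with
   nearby offsets can share.  */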
6071 /* Return the binary representation of floating point constant VALUE in INTVAL.
6072 If the value cannot be converted, return false without setting INTVAL.
6073 The conversion is done in the mode of VALUE. */
6074 bool
6075 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6078 /* We make a general exception for 0. */
6079 if (aarch64_float_const_zero_rtx_p (value))
6081 *intval = 0;
6082 return true;
6085 scalar_float_mode mode;
6086 if (GET_CODE (value) != CONST_DOUBLE
6087 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6088 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6089 /* Only support up to DF mode. */
6090 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6091 return false;
6093 unsigned HOST_WIDE_INT ival = 0;
6095 long res[2];
6096 real_to_target (res,
6097 CONST_DOUBLE_REAL_VALUE (value),
6098 REAL_MODE_FORMAT (mode));
6100 if (mode == DFmode)
6102 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6103 ival = zext_hwi (res[order], 32);
6104 ival |= (zext_hwi (res[1 - order], 32) << 32);
6106 else
6107 ival = zext_hwi (res[0], 32);
6109 *intval = ival;
6110 return true;
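/* For instance (illustrative only), the DFmode constant 1.0 is returned
   through INTVAL as its IEEE double bit pattern 0x3ff0000000000000,
   which aarch64_float_const_rtx_p below can then feed to the integer
   move-immediate cost check.  */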
6113 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6114 single MOV(+MOVK) followed by an FMOV. */
6115 bool
6116 aarch64_float_const_rtx_p (rtx x)
6118 machine_mode mode = GET_MODE (x);
6119 if (mode == VOIDmode)
6120 return false;
6122 /* Determine whether it's cheaper to write float constants as
6123 mov/movk pairs over ldr/adrp pairs. */
6124 unsigned HOST_WIDE_INT ival;
6126 if (GET_CODE (x) == CONST_DOUBLE
6127 && SCALAR_FLOAT_MODE_P (mode)
6128 && aarch64_reinterpret_float_as_int (x, &ival))
6130 scalar_int_mode imode = (mode == HFmode
6131 ? SImode
6132 : int_mode_for_mode (mode).require ());
6133 int num_instr = aarch64_internal_mov_immediate
6134 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6135 return num_instr < 3;
6138 return false;
6141 /* Return TRUE if rtx X is the immediate constant 0.0. */
6142 bool
6143 aarch64_float_const_zero_rtx_p (rtx x)
6145 if (GET_MODE (x) == VOIDmode)
6146 return false;
6148 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6149 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6150 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6153 /* Return TRUE if rtx X is an immediate constant that fits in a single
6154 MOVI immediate operation. */
6155 bool
6156 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6158 if (!TARGET_SIMD)
6159 return false;
6161 machine_mode vmode;
6162 scalar_int_mode imode;
6163 unsigned HOST_WIDE_INT ival;
6165 if (GET_CODE (x) == CONST_DOUBLE
6166 && SCALAR_FLOAT_MODE_P (mode))
6168 if (!aarch64_reinterpret_float_as_int (x, &ival))
6169 return false;
6171 /* We make a general exception for 0. */
6172 if (aarch64_float_const_zero_rtx_p (x))
6173 return true;
6175 imode = int_mode_for_mode (mode).require ();
6177 else if (GET_CODE (x) == CONST_INT
6178 && is_a <scalar_int_mode> (mode, &imode))
6179 ival = INTVAL (x);
6180 else
6181 return false;
6183 /* Use a 64-bit mode for everything except DI/DF mode, where we use
6184 a 128-bit vector mode. */
6185 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6187 vmode = aarch64_simd_container_mode (imode, width);
6188 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6190 return aarch64_simd_valid_immediate (v_op, NULL);
6194 /* Return the fixed registers used for condition codes. */
6196 static bool
6197 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6199 *p1 = CC_REGNUM;
6200 *p2 = INVALID_REGNUM;
6201 return true;
6204 /* This function is used by the call expanders of the machine description.
6205 RESULT is the register in which the result is returned. It's NULL for
6206 "call" and "sibcall".
6207 MEM is the location of the function call.
6208 SIBCALL indicates whether this function call is a normal call or a sibling
6209 call; a different pattern is generated accordingly. */
6211 void
6212 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6214 rtx call, callee, tmp;
6215 rtvec vec;
6216 machine_mode mode;
6218 gcc_assert (MEM_P (mem));
6219 callee = XEXP (mem, 0);
6220 mode = GET_MODE (callee);
6221 gcc_assert (mode == Pmode);
6223 /* Decide if we should generate indirect calls by loading the
6224 address of the callee into a register before performing
6225 the branch-and-link. */
6226 if (SYMBOL_REF_P (callee)
6227 ? (aarch64_is_long_call_p (callee)
6228 || aarch64_is_noplt_call_p (callee))
6229 : !REG_P (callee))
6230 XEXP (mem, 0) = force_reg (mode, callee);
6232 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6234 if (result != NULL_RTX)
6235 call = gen_rtx_SET (result, call);
6237 if (sibcall)
6238 tmp = ret_rtx;
6239 else
6240 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6242 vec = gen_rtvec (2, call, tmp);
6243 call = gen_rtx_PARALLEL (VOIDmode, vec);
6245 aarch64_emit_call_insn (call);
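/* Sketch of the emitted RTL (illustrative only): for a normal call that
   returns a value this builds roughly

     (parallel [(set (reg result) (call (mem addr) (const_int 0)))
		(clobber (reg LR_REGNUM))])

   whereas a sibcall replaces the clobber of the link register with
   (return).  */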
6248 /* Emit call insn with PAT and do aarch64-specific handling. */
6250 void
6251 aarch64_emit_call_insn (rtx pat)
6253 rtx insn = emit_call_insn (pat);
6255 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6256 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6257 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6260 machine_mode
6261 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6263 /* Floating-point compares return CCFPE for the comparisons that must
6264 signal on a NaN operand (LT, LE, GT, GE, LTGT), and CCFP otherwise. */
6265 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6267 switch (code)
6269 case EQ:
6270 case NE:
6271 case UNORDERED:
6272 case ORDERED:
6273 case UNLT:
6274 case UNLE:
6275 case UNGT:
6276 case UNGE:
6277 case UNEQ:
6278 return CCFPmode;
6280 case LT:
6281 case LE:
6282 case GT:
6283 case GE:
6284 case LTGT:
6285 return CCFPEmode;
6287 default:
6288 gcc_unreachable ();
6292 /* Equality comparisons of short modes against zero can be performed
6293 using the TST instruction with the appropriate bitmask. */
6294 if (y == const0_rtx && REG_P (x)
6295 && (code == EQ || code == NE)
6296 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6297 return CC_NZmode;
6299 /* Similarly, comparisons of zero_extends from shorter modes can
6300 be performed using an ANDS with an immediate mask. */
6301 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6302 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6303 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6304 && (code == EQ || code == NE))
6305 return CC_NZmode;
6307 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6308 && y == const0_rtx
6309 && (code == EQ || code == NE || code == LT || code == GE)
6310 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6311 || GET_CODE (x) == NEG
6312 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6313 && CONST_INT_P (XEXP (x, 2)))))
6314 return CC_NZmode;
6316 /* A compare with a shifted operand. Because of canonicalization,
6317 the comparison will have to be swapped when we emit the assembly
6318 code. */
6319 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6320 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6321 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6322 || GET_CODE (x) == LSHIFTRT
6323 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6324 return CC_SWPmode;
6326 /* Similarly for a negated operand, but we can only do this for
6327 equalities. */
6328 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6329 && (REG_P (y) || GET_CODE (y) == SUBREG)
6330 && (code == EQ || code == NE)
6331 && GET_CODE (x) == NEG)
6332 return CC_Zmode;
6334 /* A test for unsigned overflow. */
6335 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6336 && code == NE
6337 && GET_CODE (x) == PLUS
6338 && GET_CODE (y) == ZERO_EXTEND)
6339 return CC_Cmode;
6341 /* A test for signed overflow. */
6342 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6343 && code == NE
6344 && GET_CODE (x) == PLUS
6345 && GET_CODE (y) == SIGN_EXTEND)
6346 return CC_Vmode;
6348 /* For everything else, return CCmode. */
6349 return CCmode;
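/* Example (illustrative only): comparing (ashift:SI (reg) (const_int 2))
   against a register selects CC_SWPmode above, recording that the
   operands will be swapped when the comparison is output; the swapped
   condition mapping is then applied in aarch64_get_condition_code_1
   below.  */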
6352 static int
6353 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6356 aarch64_get_condition_code (rtx x)
6358 machine_mode mode = GET_MODE (XEXP (x, 0));
6359 enum rtx_code comp_code = GET_CODE (x);
6361 if (GET_MODE_CLASS (mode) != MODE_CC)
6362 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6363 return aarch64_get_condition_code_1 (mode, comp_code);
6366 static int
6367 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6369 switch (mode)
6371 case E_CCFPmode:
6372 case E_CCFPEmode:
6373 switch (comp_code)
6375 case GE: return AARCH64_GE;
6376 case GT: return AARCH64_GT;
6377 case LE: return AARCH64_LS;
6378 case LT: return AARCH64_MI;
6379 case NE: return AARCH64_NE;
6380 case EQ: return AARCH64_EQ;
6381 case ORDERED: return AARCH64_VC;
6382 case UNORDERED: return AARCH64_VS;
6383 case UNLT: return AARCH64_LT;
6384 case UNLE: return AARCH64_LE;
6385 case UNGT: return AARCH64_HI;
6386 case UNGE: return AARCH64_PL;
6387 default: return -1;
6389 break;
6391 case E_CCmode:
6392 switch (comp_code)
6394 case NE: return AARCH64_NE;
6395 case EQ: return AARCH64_EQ;
6396 case GE: return AARCH64_GE;
6397 case GT: return AARCH64_GT;
6398 case LE: return AARCH64_LE;
6399 case LT: return AARCH64_LT;
6400 case GEU: return AARCH64_CS;
6401 case GTU: return AARCH64_HI;
6402 case LEU: return AARCH64_LS;
6403 case LTU: return AARCH64_CC;
6404 default: return -1;
6406 break;
6408 case E_CC_SWPmode:
6409 switch (comp_code)
6411 case NE: return AARCH64_NE;
6412 case EQ: return AARCH64_EQ;
6413 case GE: return AARCH64_LE;
6414 case GT: return AARCH64_LT;
6415 case LE: return AARCH64_GE;
6416 case LT: return AARCH64_GT;
6417 case GEU: return AARCH64_LS;
6418 case GTU: return AARCH64_CC;
6419 case LEU: return AARCH64_CS;
6420 case LTU: return AARCH64_HI;
6421 default: return -1;
6423 break;
6425 case E_CC_NZmode:
6426 switch (comp_code)
6428 case NE: return AARCH64_NE;
6429 case EQ: return AARCH64_EQ;
6430 case GE: return AARCH64_PL;
6431 case LT: return AARCH64_MI;
6432 default: return -1;
6434 break;
6436 case E_CC_Zmode:
6437 switch (comp_code)
6439 case NE: return AARCH64_NE;
6440 case EQ: return AARCH64_EQ;
6441 default: return -1;
6443 break;
6445 case E_CC_Cmode:
6446 switch (comp_code)
6448 case NE: return AARCH64_CS;
6449 case EQ: return AARCH64_CC;
6450 default: return -1;
6452 break;
6454 case E_CC_Vmode:
6455 switch (comp_code)
6457 case NE: return AARCH64_VS;
6458 case EQ: return AARCH64_VC;
6459 default: return -1;
6461 break;
6463 default:
6464 return -1;
6467 return -1;
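/* For instance (illustrative only), in CC_SWPmode a GE comparison maps
   to the AArch64 "le" condition above, because the comparison operands
   were swapped when the flags were set.  */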
6470 bool
6471 aarch64_const_vec_all_same_in_range_p (rtx x,
6472 HOST_WIDE_INT minval,
6473 HOST_WIDE_INT maxval)
6475 rtx elt;
6476 return (const_vec_duplicate_p (x, &elt)
6477 && CONST_INT_P (elt)
6478 && IN_RANGE (INTVAL (elt), minval, maxval));
6481 bool
6482 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6484 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6487 /* Return true if VEC is a constant in which every element is in the range
6488 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6490 static bool
6491 aarch64_const_vec_all_in_range_p (rtx vec,
6492 HOST_WIDE_INT minval,
6493 HOST_WIDE_INT maxval)
6495 if (GET_CODE (vec) != CONST_VECTOR
6496 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6497 return false;
6499 int nunits;
6500 if (!CONST_VECTOR_STEPPED_P (vec))
6501 nunits = const_vector_encoded_nelts (vec);
6502 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6503 return false;
6505 for (int i = 0; i < nunits; i++)
6507 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6508 if (!CONST_INT_P (vec_elem)
6509 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6510 return false;
6512 return true;
6515 /* N Z C V. */
6516 #define AARCH64_CC_V 1
6517 #define AARCH64_CC_C (1 << 1)
6518 #define AARCH64_CC_Z (1 << 2)
6519 #define AARCH64_CC_N (1 << 3)
6521 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6522 static const int aarch64_nzcv_codes[] =
6524 0, /* EQ, Z == 1. */
6525 AARCH64_CC_Z, /* NE, Z == 0. */
6526 0, /* CS, C == 1. */
6527 AARCH64_CC_C, /* CC, C == 0. */
6528 0, /* MI, N == 1. */
6529 AARCH64_CC_N, /* PL, N == 0. */
6530 0, /* VS, V == 1. */
6531 AARCH64_CC_V, /* VC, V == 0. */
6532 0, /* HI, C == 1 && Z == 0. */
6533 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6534 AARCH64_CC_V, /* GE, N == V. */
6535 0, /* LT, N != V. */
6536 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6537 0, /* LE, !(Z == 0 && N == V). */
6538 0, /* AL, Any. */
6539 0 /* NV, Any. */
6542 /* Print floating-point vector immediate operand X to F, negating it
6543 first if NEGATE is true. Return true on success, false if it isn't
6544 a constant we can handle. */
6546 static bool
6547 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6549 rtx elt;
6551 if (!const_vec_duplicate_p (x, &elt))
6552 return false;
6554 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6555 if (negate)
6556 r = real_value_negate (&r);
6558 /* We only handle the SVE single-bit immediates here. */
6559 if (real_equal (&r, &dconst0))
6560 asm_fprintf (f, "0.0");
6561 else if (real_equal (&r, &dconst1))
6562 asm_fprintf (f, "1.0");
6563 else if (real_equal (&r, &dconsthalf))
6564 asm_fprintf (f, "0.5");
6565 else
6566 return false;
6568 return true;
6571 /* Return the equivalent letter for size. */
6572 static char
6573 sizetochar (int size)
6575 switch (size)
6577 case 64: return 'd';
6578 case 32: return 's';
6579 case 16: return 'h';
6580 case 8 : return 'b';
6581 default: gcc_unreachable ();
6585 /* Print operand X to file F in a target specific manner according to CODE.
6586 The acceptable formatting commands given by CODE are:
6587 'c': An integer or symbol address without a preceding #
6588 sign.
6589 'C': Take the duplicated element in a vector constant
6590 and print it in hex.
6591 'D': Take the duplicated element in a vector constant
6592 and print it as an unsigned integer, in decimal.
6593 'e': Print the sign/zero-extend size as a character 8->b,
6594 16->h, 32->w.
6595 'p': Prints N such that 2^N == X (X must be a power of 2 and
6596 a const_int).
6597 'P': Print the number of non-zero bits in X (a const_int).
6598 'H': Print the higher numbered register of a pair (TImode)
6599 of regs.
6600 'm': Print a condition (eq, ne, etc).
6601 'M': Same as 'm', but invert condition.
6602 'N': Take the duplicated element in a vector constant
6603 and print the negative of it in decimal.
6604 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6605 'S/T/U/V': Print a FP/SIMD register name for a register list.
6606 The register printed is the FP/SIMD register name
6607 of X + 0/1/2/3 for S/T/U/V.
6608 'R': Print a scalar FP/SIMD register name + 1.
6609 'X': Print bottom 16 bits of integer constant in hex.
6610 'w/x': Print a general register name or the zero register
6611 (32-bit or 64-bit).
6612 '0': Print a normal operand; if it's a general register,
6613 we assume DImode.
6614 'k': Print NZCV for conditional compare instructions.
6615 'A': Output address constant representing the first
6616 argument of X, specifying a relocation offset
6617 if appropriate.
6618 'L': Output constant address specified by X
6619 with a relocation offset if appropriate.
6620 'G': Prints address of X, specifying a PC relative
6621 relocation mode if appropriate.
6622 'y': Output address of LDP or STP - this is used for
6623 some LDP/STPs which don't use a PARALLEL in their
6624 pattern (so the mode needs to be adjusted).
6625 'z': Output address of a typical LDP or STP. */
6627 static void
6628 aarch64_print_operand (FILE *f, rtx x, int code)
6630 rtx elt;
6631 switch (code)
6633 case 'c':
6634 switch (GET_CODE (x))
6636 case CONST_INT:
6637 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6638 break;
6640 case SYMBOL_REF:
6641 output_addr_const (f, x);
6642 break;
6644 case CONST:
6645 if (GET_CODE (XEXP (x, 0)) == PLUS
6646 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6648 output_addr_const (f, x);
6649 break;
6651 /* Fall through. */
6653 default:
6654 output_operand_lossage ("unsupported operand for code '%c'", code);
6656 break;
6658 case 'e':
6660 int n;
6662 if (!CONST_INT_P (x)
6663 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6665 output_operand_lossage ("invalid operand for '%%%c'", code);
6666 return;
6669 switch (n)
6671 case 3:
6672 fputc ('b', f);
6673 break;
6674 case 4:
6675 fputc ('h', f);
6676 break;
6677 case 5:
6678 fputc ('w', f);
6679 break;
6680 default:
6681 output_operand_lossage ("invalid operand for '%%%c'", code);
6682 return;
6685 break;
6687 case 'p':
6689 int n;
6691 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6693 output_operand_lossage ("invalid operand for '%%%c'", code);
6694 return;
6697 asm_fprintf (f, "%d", n);
6699 break;
6701 case 'P':
6702 if (!CONST_INT_P (x))
6704 output_operand_lossage ("invalid operand for '%%%c'", code);
6705 return;
6708 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6709 break;
6711 case 'H':
6712 if (x == const0_rtx)
6714 asm_fprintf (f, "xzr");
6715 break;
6718 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6720 output_operand_lossage ("invalid operand for '%%%c'", code);
6721 return;
6724 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6725 break;
6727 case 'M':
6728 case 'm':
6730 int cond_code;
6731 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6732 if (x == const_true_rtx)
6734 if (code == 'M')
6735 fputs ("nv", f);
6736 return;
6739 if (!COMPARISON_P (x))
6741 output_operand_lossage ("invalid operand for '%%%c'", code);
6742 return;
6745 cond_code = aarch64_get_condition_code (x);
6746 gcc_assert (cond_code >= 0);
6747 if (code == 'M')
6748 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6749 fputs (aarch64_condition_codes[cond_code], f);
6751 break;
6753 case 'N':
6754 if (!const_vec_duplicate_p (x, &elt))
6756 output_operand_lossage ("invalid vector constant");
6757 return;
6760 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6761 asm_fprintf (f, "%wd", -INTVAL (elt));
6762 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6763 && aarch64_print_vector_float_operand (f, x, true))
6765 else
6767 output_operand_lossage ("invalid vector constant");
6768 return;
6770 break;
6772 case 'b':
6773 case 'h':
6774 case 's':
6775 case 'd':
6776 case 'q':
6777 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6779 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6780 return;
6782 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6783 break;
6785 case 'S':
6786 case 'T':
6787 case 'U':
6788 case 'V':
6789 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6791 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6792 return;
6794 asm_fprintf (f, "%c%d",
6795 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6796 REGNO (x) - V0_REGNUM + (code - 'S'));
6797 break;
6799 case 'R':
6800 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6802 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6803 return;
6805 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6806 break;
6808 case 'X':
6809 if (!CONST_INT_P (x))
6811 output_operand_lossage ("invalid operand for '%%%c'", code);
6812 return;
6814 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6815 break;
6817 case 'C':
6819 /* Print a replicated constant in hex. */
6820 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6822 output_operand_lossage ("invalid operand for '%%%c'", code);
6823 return;
6825 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6826 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6828 break;
6830 case 'D':
6832 /* Print a replicated constant in decimal, treating it as
6833 unsigned. */
6834 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6836 output_operand_lossage ("invalid operand for '%%%c'", code);
6837 return;
6839 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6840 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6842 break;
6844 case 'w':
6845 case 'x':
6846 if (x == const0_rtx
6847 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6849 asm_fprintf (f, "%czr", code);
6850 break;
6853 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6855 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6856 break;
6859 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6861 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6862 break;
6865 /* Fall through */
6867 case 0:
6868 if (x == NULL)
6870 output_operand_lossage ("missing operand");
6871 return;
6874 switch (GET_CODE (x))
6876 case REG:
6877 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6879 if (REG_NREGS (x) == 1)
6880 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6881 else
6883 char suffix
6884 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6885 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6886 REGNO (x) - V0_REGNUM, suffix,
6887 END_REGNO (x) - V0_REGNUM - 1, suffix);
6890 else
6891 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6892 break;
6894 case MEM:
6895 output_address (GET_MODE (x), XEXP (x, 0));
6896 break;
6898 case LABEL_REF:
6899 case SYMBOL_REF:
6900 output_addr_const (asm_out_file, x);
6901 break;
6903 case CONST_INT:
6904 asm_fprintf (f, "%wd", INTVAL (x));
6905 break;
6907 case CONST:
6908 if (!VECTOR_MODE_P (GET_MODE (x)))
6910 output_addr_const (asm_out_file, x);
6911 break;
6913 /* fall through */
6915 case CONST_VECTOR:
6916 if (!const_vec_duplicate_p (x, &elt))
6918 output_operand_lossage ("invalid vector constant");
6919 return;
6922 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6923 asm_fprintf (f, "%wd", INTVAL (elt));
6924 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6925 && aarch64_print_vector_float_operand (f, x, false))
6927 else
6929 output_operand_lossage ("invalid vector constant");
6930 return;
6932 break;
6934 case CONST_DOUBLE:
6935 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6936 be getting CONST_DOUBLEs holding integers. */
6937 gcc_assert (GET_MODE (x) != VOIDmode);
6938 if (aarch64_float_const_zero_rtx_p (x))
6940 fputc ('0', f);
6941 break;
6943 else if (aarch64_float_const_representable_p (x))
6945 #define buf_size 20
6946 char float_buf[buf_size] = {'\0'};
6947 real_to_decimal_for_mode (float_buf,
6948 CONST_DOUBLE_REAL_VALUE (x),
6949 buf_size, buf_size,
6950 1, GET_MODE (x));
6951 asm_fprintf (asm_out_file, "%s", float_buf);
6952 break;
6953 #undef buf_size
6955 output_operand_lossage ("invalid constant");
6956 return;
6957 default:
6958 output_operand_lossage ("invalid operand");
6959 return;
6961 break;
6963 case 'A':
6964 if (GET_CODE (x) == HIGH)
6965 x = XEXP (x, 0);
6967 switch (aarch64_classify_symbolic_expression (x))
6969 case SYMBOL_SMALL_GOT_4G:
6970 asm_fprintf (asm_out_file, ":got:");
6971 break;
6973 case SYMBOL_SMALL_TLSGD:
6974 asm_fprintf (asm_out_file, ":tlsgd:");
6975 break;
6977 case SYMBOL_SMALL_TLSDESC:
6978 asm_fprintf (asm_out_file, ":tlsdesc:");
6979 break;
6981 case SYMBOL_SMALL_TLSIE:
6982 asm_fprintf (asm_out_file, ":gottprel:");
6983 break;
6985 case SYMBOL_TLSLE24:
6986 asm_fprintf (asm_out_file, ":tprel:");
6987 break;
6989 case SYMBOL_TINY_GOT:
6990 gcc_unreachable ();
6991 break;
6993 default:
6994 break;
6996 output_addr_const (asm_out_file, x);
6997 break;
6999 case 'L':
7000 switch (aarch64_classify_symbolic_expression (x))
7002 case SYMBOL_SMALL_GOT_4G:
7003 asm_fprintf (asm_out_file, ":lo12:");
7004 break;
7006 case SYMBOL_SMALL_TLSGD:
7007 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7008 break;
7010 case SYMBOL_SMALL_TLSDESC:
7011 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7012 break;
7014 case SYMBOL_SMALL_TLSIE:
7015 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7016 break;
7018 case SYMBOL_TLSLE12:
7019 asm_fprintf (asm_out_file, ":tprel_lo12:");
7020 break;
7022 case SYMBOL_TLSLE24:
7023 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7024 break;
7026 case SYMBOL_TINY_GOT:
7027 asm_fprintf (asm_out_file, ":got:");
7028 break;
7030 case SYMBOL_TINY_TLSIE:
7031 asm_fprintf (asm_out_file, ":gottprel:");
7032 break;
7034 default:
7035 break;
7037 output_addr_const (asm_out_file, x);
7038 break;
7040 case 'G':
7041 switch (aarch64_classify_symbolic_expression (x))
7043 case SYMBOL_TLSLE24:
7044 asm_fprintf (asm_out_file, ":tprel_hi12:");
7045 break;
7046 default:
7047 break;
7049 output_addr_const (asm_out_file, x);
7050 break;
7052 case 'k':
7054 HOST_WIDE_INT cond_code;
7056 if (!CONST_INT_P (x))
7058 output_operand_lossage ("invalid operand for '%%%c'", code);
7059 return;
7062 cond_code = INTVAL (x);
7063 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7064 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7066 break;
7068 case 'y':
7069 case 'z':
7071 machine_mode mode = GET_MODE (x);
7073 if (GET_CODE (x) != MEM
7074 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7076 output_operand_lossage ("invalid operand for '%%%c'", code);
7077 return;
7080 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7081 code == 'y'
7082 ? ADDR_QUERY_LDP_STP_N
7083 : ADDR_QUERY_LDP_STP))
7084 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7086 break;
7088 default:
7089 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7090 return;
7094 /* Print address 'x' of a memory access with mode 'mode'.
7095 'type' is the aarch64_addr_query_type context required by
7096 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access. */
7097 static bool
7098 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7099 aarch64_addr_query_type type)
7101 struct aarch64_address_info addr;
7102 unsigned int size;
7104 /* Check all addresses are Pmode - including ILP32. */
7105 if (GET_MODE (x) != Pmode)
7106 output_operand_lossage ("invalid address mode");
7108 if (aarch64_classify_address (&addr, x, mode, true, type))
7109 switch (addr.type)
7111 case ADDRESS_REG_IMM:
7112 if (known_eq (addr.const_offset, 0))
7113 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7114 else if (aarch64_sve_data_mode_p (mode))
7116 HOST_WIDE_INT vnum
7117 = exact_div (addr.const_offset,
7118 BYTES_PER_SVE_VECTOR).to_constant ();
7119 asm_fprintf (f, "[%s, #%wd, mul vl]",
7120 reg_names[REGNO (addr.base)], vnum);
7122 else if (aarch64_sve_pred_mode_p (mode))
7124 HOST_WIDE_INT vnum
7125 = exact_div (addr.const_offset,
7126 BYTES_PER_SVE_PRED).to_constant ();
7127 asm_fprintf (f, "[%s, #%wd, mul vl]",
7128 reg_names[REGNO (addr.base)], vnum);
7130 else
7131 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7132 INTVAL (addr.offset));
7133 return true;
7135 case ADDRESS_REG_REG:
7136 if (addr.shift == 0)
7137 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7138 reg_names [REGNO (addr.offset)]);
7139 else
7140 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7141 reg_names [REGNO (addr.offset)], addr.shift);
7142 return true;
7144 case ADDRESS_REG_UXTW:
7145 if (addr.shift == 0)
7146 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7147 REGNO (addr.offset) - R0_REGNUM);
7148 else
7149 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7150 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7151 return true;
7153 case ADDRESS_REG_SXTW:
7154 if (addr.shift == 0)
7155 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7156 REGNO (addr.offset) - R0_REGNUM);
7157 else
7158 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7159 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7160 return true;
7162 case ADDRESS_REG_WB:
7163 /* Writeback is only supported for fixed-width modes. */
7164 size = GET_MODE_SIZE (mode).to_constant ();
7165 switch (GET_CODE (x))
7167 case PRE_INC:
7168 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7169 return true;
7170 case POST_INC:
7171 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7172 return true;
7173 case PRE_DEC:
7174 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7175 return true;
7176 case POST_DEC:
7177 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7178 return true;
7179 case PRE_MODIFY:
7180 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7181 INTVAL (addr.offset));
7182 return true;
7183 case POST_MODIFY:
7184 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7185 INTVAL (addr.offset));
7186 return true;
7187 default:
7188 break;
7190 break;
7192 case ADDRESS_LO_SUM:
7193 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7194 output_addr_const (f, addr.offset);
7195 asm_fprintf (f, "]");
7196 return true;
7198 case ADDRESS_SYMBOLIC:
7199 output_addr_const (f, x);
7200 return true;
7203 return false;
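/* Printing sketch (illustrative only): an SVE data-mode address whose
   constant offset is twice BYTES_PER_SVE_VECTOR comes out above as
   "[x0, #2, mul vl]", while a DImode pre-increment writeback address
   prints as "[x0, 8]!".  */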
7206 /* Print address 'x' of a memory access with mode 'mode'. */
7207 static void
7208 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7210 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7211 output_addr_const (f, x);
7214 bool
7215 aarch64_label_mentioned_p (rtx x)
7217 const char *fmt;
7218 int i;
7220 if (GET_CODE (x) == LABEL_REF)
7221 return true;
7223 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7224 referencing instruction, but they are constant offsets, not
7225 symbols. */
7226 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7227 return false;
7229 fmt = GET_RTX_FORMAT (GET_CODE (x));
7230 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7232 if (fmt[i] == 'E')
7234 int j;
7236 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7237 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7238 return 1;
7240 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7241 return 1;
7244 return 0;
7247 /* Implement REGNO_REG_CLASS. */
7249 enum reg_class
7250 aarch64_regno_regclass (unsigned regno)
7252 if (GP_REGNUM_P (regno))
7253 return GENERAL_REGS;
7255 if (regno == SP_REGNUM)
7256 return STACK_REG;
7258 if (regno == FRAME_POINTER_REGNUM
7259 || regno == ARG_POINTER_REGNUM)
7260 return POINTER_REGS;
7262 if (FP_REGNUM_P (regno))
7263 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7265 if (PR_REGNUM_P (regno))
7266 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7268 return NO_REGS;
7271 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7272 If OFFSET is out of range, return an offset of an anchor point
7273 that is in range. Return 0 otherwise. */
7275 static HOST_WIDE_INT
7276 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7277 machine_mode mode)
7279 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7280 if (size > 16)
7281 return (offset + 0x400) & ~0x7f0;
7283 /* For offsets that aren't a multiple of the access size, the limit is
7284 -256...255. */
7285 if (offset & (size - 1))
7287 /* BLKmode typically uses LDP of X-registers. */
7288 if (mode == BLKmode)
7289 return (offset + 512) & ~0x3ff;
7290 return (offset + 0x100) & ~0x1ff;
7293 /* Small negative offsets are supported. */
7294 if (IN_RANGE (offset, -256, 0))
7295 return 0;
7297 if (mode == TImode || mode == TFmode)
7298 return (offset + 0x100) & ~0x1ff;
7300 /* Use a 12-bit offset scaled by the access size. */
7301 return offset & (~0xfff * size);
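/* Worked example (illustrative only, not part of the original source):
   a DImode access (size 8) at offset 70000 falls through to the final
   case; ~0xfff * 8 masks the offset down to the 32768-byte boundary
   65536, leaving a residual of 4464 that a 12-bit unsigned offset
   scaled by 8 can still reach.  */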
7304 static rtx
7305 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7307 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7308 where mask is selected by alignment and size of the offset.
7309 We try to pick as large a range for the offset as possible to
7310 maximize the chance of a CSE. However, for aligned addresses
7311 we limit the range to 4k so that structures with different sized
7312 elements are likely to use the same base. We need to be careful
7313 not to split a CONST for some forms of address expression, otherwise
7314 it will generate sub-optimal code. */
7316 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7318 rtx base = XEXP (x, 0);
7319 rtx offset_rtx = XEXP (x, 1);
7320 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7322 if (GET_CODE (base) == PLUS)
7324 rtx op0 = XEXP (base, 0);
7325 rtx op1 = XEXP (base, 1);
7327 /* Force any scaling into a temp for CSE. */
7328 op0 = force_reg (Pmode, op0);
7329 op1 = force_reg (Pmode, op1);
7331 /* Let the pointer register be in op0. */
7332 if (REG_POINTER (op1))
7333 std::swap (op0, op1);
7335 /* If the pointer is virtual or frame related, then we know that
7336 virtual register instantiation or register elimination is going
7337 to apply a second constant. We want the two constants folded
7338 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7339 if (virt_or_elim_regno_p (REGNO (op0)))
7341 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7342 NULL_RTX, true, OPTAB_DIRECT);
7343 return gen_rtx_PLUS (Pmode, base, op1);
7346 /* Otherwise, in order to encourage CSE (and thereby loop strength
7347 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
7348 base = expand_binop (Pmode, add_optab, op0, op1,
7349 NULL_RTX, true, OPTAB_DIRECT);
7350 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7353 HOST_WIDE_INT size;
7354 if (GET_MODE_SIZE (mode).is_constant (&size))
7356 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7357 mode);
7358 if (base_offset != 0)
7360 base = plus_constant (Pmode, base, base_offset);
7361 base = force_operand (base, NULL_RTX);
7362 return plus_constant (Pmode, base, offset - base_offset);
7367 return x;
7370 static reg_class_t
7371 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7372 reg_class_t rclass,
7373 machine_mode mode,
7374 secondary_reload_info *sri)
7376 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7377 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7378 comment at the head of aarch64-sve.md for more details about the
7379 big-endian handling. */
7380 if (BYTES_BIG_ENDIAN
7381 && reg_class_subset_p (rclass, FP_REGS)
7382 && !((REG_P (x) && HARD_REGISTER_P (x))
7383 || aarch64_simd_valid_immediate (x, NULL))
7384 && aarch64_sve_data_mode_p (mode))
7386 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7387 return NO_REGS;
7390 /* If we have to disable direct literal pool loads and stores because the
7391 function is too big, then we need a scratch register. */
7392 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7393 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7394 || targetm.vector_mode_supported_p (GET_MODE (x)))
7395 && !aarch64_pcrelative_literal_loads)
7397 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
7398 return NO_REGS;
7401 /* Without the TARGET_SIMD instructions we cannot move a Q register
7402 to a Q register directly. We need a scratch. */
7403 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7404 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7405 && reg_class_subset_p (rclass, FP_REGS))
7407 sri->icode = code_for_aarch64_reload_mov (mode);
7408 return NO_REGS;
7411 /* A TFmode or TImode memory access should be handled via an FP register
7412 because AArch64 has richer addressing modes for LDR/STR instructions
7413 than for LDP/STP instructions. */
7414 if (TARGET_FLOAT && rclass == GENERAL_REGS
7415 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7416 return FP_REGS;
7418 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7419 return GENERAL_REGS;
7421 return NO_REGS;
7424 static bool
7425 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7427 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7429 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7430 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7431 if (frame_pointer_needed)
7432 return to == HARD_FRAME_POINTER_REGNUM;
7433 return true;
7436 poly_int64
7437 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7439 aarch64_layout_frame ();
7441 if (to == HARD_FRAME_POINTER_REGNUM)
7443 if (from == ARG_POINTER_REGNUM)
7444 return cfun->machine->frame.hard_fp_offset;
7446 if (from == FRAME_POINTER_REGNUM)
7447 return cfun->machine->frame.hard_fp_offset
7448 - cfun->machine->frame.locals_offset;
7451 if (to == STACK_POINTER_REGNUM)
7453 if (from == FRAME_POINTER_REGNUM)
7454 return cfun->machine->frame.frame_size
7455 - cfun->machine->frame.locals_offset;
7458 return cfun->machine->frame.frame_size;
7461 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7462 previous frame. */
7465 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7467 if (count != 0)
7468 return const0_rtx;
7469 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7473 static void
7474 aarch64_asm_trampoline_template (FILE *f)
7476 if (TARGET_ILP32)
7478 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7479 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7481 else
7483 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7484 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7486 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7487 assemble_aligned_integer (4, const0_rtx);
7488 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7489 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7492 static void
7493 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7495 rtx fnaddr, mem, a_tramp;
7496 const int tramp_code_sz = 16;
7498 /* We don't need to copy the trailing D-words; we fill those in below. */
7499 emit_block_move (m_tramp, assemble_trampoline_template (),
7500 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7501 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7502 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7503 if (GET_MODE (fnaddr) != ptr_mode)
7504 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7505 emit_move_insn (mem, fnaddr);
7507 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7508 emit_move_insn (mem, chain_value);
7510 /* XXX We should really define a "clear_cache" pattern and use
7511 gen_clear_cache(). */
7512 a_tramp = XEXP (m_tramp, 0);
7513 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7514 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7515 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7516 ptr_mode);
7519 static unsigned char
7520 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7522 /* ??? Logically we should only need to provide a value when
7523 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7524 can hold MODE, but at the moment we need to handle all modes.
7525 Just ignore any runtime parts for registers that can't store them. */
7526 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7527 unsigned int nregs;
7528 switch (regclass)
7530 case TAILCALL_ADDR_REGS:
7531 case POINTER_REGS:
7532 case GENERAL_REGS:
7533 case ALL_REGS:
7534 case POINTER_AND_FP_REGS:
7535 case FP_REGS:
7536 case FP_LO_REGS:
7537 if (aarch64_sve_data_mode_p (mode)
7538 && constant_multiple_p (GET_MODE_SIZE (mode),
7539 BYTES_PER_SVE_VECTOR, &nregs))
7540 return nregs;
7541 return (aarch64_vector_data_mode_p (mode)
7542 ? CEIL (lowest_size, UNITS_PER_VREG)
7543 : CEIL (lowest_size, UNITS_PER_WORD));
7544 case STACK_REG:
7545 case PR_REGS:
7546 case PR_LO_REGS:
7547 case PR_HI_REGS:
7548 return 1;
7550 case NO_REGS:
7551 return 0;
7553 default:
7554 break;
7556 gcc_unreachable ();
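/* Worked examples, for illustration only: with UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16, DImode in GENERAL_REGS needs CEIL (8, 8) == 1
   register, TImode needs CEIL (16, 8) == 2, and an Advanced SIMD mode
   such as V4SImode needs CEIL (16, 16) == 1 vector register.  An SVE
   data mode whose size is a compile-time multiple of
   BYTES_PER_SVE_VECTOR returns that multiple, while the predicate and
   stack-pointer classes always report a single register.  */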
7559 static reg_class_t
7560 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7562 if (regclass == POINTER_REGS)
7563 return GENERAL_REGS;
7565 if (regclass == STACK_REG)
7567 if (REG_P (x)
7568 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7569 return regclass;
7571 return NO_REGS;
7574 /* Register elimination can result in a request for
7575 SP+constant->FP_REGS. We cannot support such operations, which
7576 use SP as source and an FP_REG as destination, so reject them
7577 outright. */
7578 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7580 rtx lhs = XEXP (x, 0);
7582 /* Look through a possible SUBREG introduced by ILP32. */
7583 if (GET_CODE (lhs) == SUBREG)
7584 lhs = SUBREG_REG (lhs);
7586 gcc_assert (REG_P (lhs));
7587 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7588 POINTER_REGS));
7589 return NO_REGS;
7592 return regclass;
7595 void
7596 aarch64_asm_output_labelref (FILE* f, const char *name)
7598 asm_fprintf (f, "%U%s", name);
7601 static void
7602 aarch64_elf_asm_constructor (rtx symbol, int priority)
7604 if (priority == DEFAULT_INIT_PRIORITY)
7605 default_ctor_section_asm_out_constructor (symbol, priority);
7606 else
7608 section *s;
7609 /* While priority is known to be in the range [0, 65535], so that 18 bytes
7610 would be enough, the compiler might not know that. To avoid a
7611 -Wformat-truncation false positive, use a larger size. */
7612 char buf[23];
7613 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7614 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7615 switch_to_section (s);
7616 assemble_align (POINTER_SIZE);
7617 assemble_aligned_integer (POINTER_BYTES, symbol);
7621 static void
7622 aarch64_elf_asm_destructor (rtx symbol, int priority)
7624 if (priority == DEFAULT_INIT_PRIORITY)
7625 default_dtor_section_asm_out_destructor (symbol, priority);
7626 else
7628 section *s;
7629 /* While priority is known to be in the range [0, 65535], so that 18 bytes
7630 would be enough, the compiler might not know that. To avoid a
7631 -Wformat-truncation false positive, use a larger size. */
7632 char buf[23];
7633 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7634 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7635 switch_to_section (s);
7636 assemble_align (POINTER_SIZE);
7637 assemble_aligned_integer (POINTER_BYTES, symbol);
7641 const char*
7642 aarch64_output_casesi (rtx *operands)
7644 char buf[100];
7645 char label[100];
7646 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7647 int index;
7648 static const char *const patterns[4][2] =
7651 "ldrb\t%w3, [%0,%w1,uxtw]",
7652 "add\t%3, %4, %w3, sxtb #2"
7655 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7656 "add\t%3, %4, %w3, sxth #2"
7659 "ldr\t%w3, [%0,%w1,uxtw #2]",
7660 "add\t%3, %4, %w3, sxtw #2"
7662 /* We assume that DImode is only generated when not optimizing and
7663 that we don't really need 64-bit address offsets. That would
7664 imply an object file with 8GB of code in a single function! */
7666 "ldr\t%w3, [%0,%w1,uxtw #2]",
7667 "add\t%3, %4, %w3, sxtw #2"
7671 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7673 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7674 index = exact_log2 (GET_MODE_SIZE (mode));
7676 gcc_assert (index >= 0 && index <= 3);
7678 /* Need to implement table size reduction by changing the code below. */
7679 output_asm_insn (patterns[index][0], operands);
7680 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7681 snprintf (buf, sizeof (buf),
7682 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7683 output_asm_insn (buf, operands);
7684 output_asm_insn (patterns[index][1], operands);
7685 output_asm_insn ("br\t%3", operands);
7686 assemble_label (asm_out_file, label);
7687 return "";
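/* A rough example of the output for a 4-byte table entry (patterns[2]),
   with operand 0 in x0, operand 1 in w1 and scratch operands 3 and 4 in
   x3 and x4, all chosen purely for illustration:

       ldr  w3, [x0, w1, uxtw #2]
       adr  x4, .Lrtx<N>
       add  x3, x4, w3, sxtw #2
       br   x3
     .Lrtx<N>:

   where each entry of the dispatch table that follows the label holds
   the distance from the label to the case target, pre-scaled to match
   the #2 shift applied by the ADD.  */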
7691 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7692 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7693 operator. */
7696 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7698 if (shift >= 0 && shift <= 3)
7700 int size;
7701 for (size = 8; size <= 32; size *= 2)
7703 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7704 if (mask == bits << shift)
7705 return size;
7708 return 0;
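/* Worked example: aarch64_uxt_size (2, 0x3fc) returns 8, because
   0xff << 2 == 0x3fc, i.e. the operand behaves like a UXTB followed by a
   left shift of 2, as in "add x0, x1, w2, uxtb #2".  Likewise
   aarch64_uxt_size (0, 0xffff) returns 16 (UXTH), while any shift larger
   than 3, or a mask that is not a shifted 8/16/32-bit block, gives 0.  */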
7711 /* Constant pools are per-function only when PC-relative
7712 literal loads are enabled or we are in the large memory
7713 model. */
7715 static inline bool
7716 aarch64_can_use_per_function_literal_pools_p (void)
7718 return (aarch64_pcrelative_literal_loads
7719 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7722 static bool
7723 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7725 /* We can't use blocks for constants when we're using a per-function
7726 constant pool. */
7727 return !aarch64_can_use_per_function_literal_pools_p ();
7730 /* Select appropriate section for constants depending
7731 on where we place literal pools. */
7733 static section *
7734 aarch64_select_rtx_section (machine_mode mode,
7735 rtx x,
7736 unsigned HOST_WIDE_INT align)
7738 if (aarch64_can_use_per_function_literal_pools_p ())
7739 return function_section (current_function_decl);
7741 return default_elf_select_rtx_section (mode, x, align);
7744 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7745 void
7746 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7747 HOST_WIDE_INT offset)
7749 /* When using per-function literal pools, we must ensure that any code
7750 section is aligned to the minimal instruction length, lest we get
7751 errors from the assembler re "unaligned instructions". */
7752 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7753 ASM_OUTPUT_ALIGN (f, 2);
7756 /* Costs. */
7758 /* Helper function for rtx cost calculation. Strip a shift expression
7759 from X. Returns the inner operand if successful, or the original
7760 expression on failure. */
7761 static rtx
7762 aarch64_strip_shift (rtx x)
7764 rtx op = x;
7766 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7767 we can convert both to ROR during final output. */
7768 if ((GET_CODE (op) == ASHIFT
7769 || GET_CODE (op) == ASHIFTRT
7770 || GET_CODE (op) == LSHIFTRT
7771 || GET_CODE (op) == ROTATERT
7772 || GET_CODE (op) == ROTATE)
7773 && CONST_INT_P (XEXP (op, 1)))
7774 return XEXP (op, 0);
7776 if (GET_CODE (op) == MULT
7777 && CONST_INT_P (XEXP (op, 1))
7778 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7779 return XEXP (op, 0);
7781 return x;
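/* For example, (ashift (reg X) (const_int 3)) and
   (mult (reg X) (const_int 8)) both strip down to (reg X), since a
   multiply by a power of two is just a shift in disguise, whereas
   (ashift (reg X) (reg Y)) is returned unchanged because the shift
   amount is not constant.  */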
7784 /* Helper function for rtx cost calculation. Strip an extend
7785 expression from X. Returns the inner operand if successful, or the
7786 original expression on failure. We deal with a number of possible
7787 canonicalization variations here. If STRIP_SHIFT is true, then
7788 we can strip off a shift also. */
7789 static rtx
7790 aarch64_strip_extend (rtx x, bool strip_shift)
7792 scalar_int_mode mode;
7793 rtx op = x;
7795 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7796 return op;
7798 /* Zero and sign extraction of a widened value. */
7799 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7800 && XEXP (op, 2) == const0_rtx
7801 && GET_CODE (XEXP (op, 0)) == MULT
7802 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7803 XEXP (op, 1)))
7804 return XEXP (XEXP (op, 0), 0);
7806 /* It can also be represented (for zero-extend) as an AND with an
7807 immediate. */
7808 if (GET_CODE (op) == AND
7809 && GET_CODE (XEXP (op, 0)) == MULT
7810 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7811 && CONST_INT_P (XEXP (op, 1))
7812 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7813 INTVAL (XEXP (op, 1))) != 0)
7814 return XEXP (XEXP (op, 0), 0);
7816 /* Now handle extended register, as this may also have an optional
7817 left shift by 1..4. */
7818 if (strip_shift
7819 && GET_CODE (op) == ASHIFT
7820 && CONST_INT_P (XEXP (op, 1))
7821 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7822 op = XEXP (op, 0);
7824 if (GET_CODE (op) == ZERO_EXTEND
7825 || GET_CODE (op) == SIGN_EXTEND)
7826 op = XEXP (op, 0);
7828 if (op != x)
7829 return op;
7831 return x;
7834 /* Return true iff CODE is a shift supported in combination
7835 with arithmetic instructions. */
7837 static bool
7838 aarch64_shift_p (enum rtx_code code)
7840 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7844 /* Return true iff X is a cheap shift without a sign extend. */
7846 static bool
7847 aarch64_cheap_mult_shift_p (rtx x)
7849 rtx op0, op1;
7851 op0 = XEXP (x, 0);
7852 op1 = XEXP (x, 1);
7854 if (!(aarch64_tune_params.extra_tuning_flags
7855 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7856 return false;
7858 if (GET_CODE (op0) == SIGN_EXTEND)
7859 return false;
7861 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7862 && UINTVAL (op1) <= 4)
7863 return true;
7865 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7866 return false;
7868 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7870 if (l2 > 0 && l2 <= 4)
7871 return true;
7873 return false;
7876 /* Helper function for rtx cost calculation. Calculate the cost of
7877 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7878 Return the calculated cost of the expression, recursing manually into
7879 operands where needed. */
7881 static int
7882 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7884 rtx op0, op1;
7885 const struct cpu_cost_table *extra_cost
7886 = aarch64_tune_params.insn_extra_cost;
7887 int cost = 0;
7888 bool compound_p = (outer == PLUS || outer == MINUS);
7889 machine_mode mode = GET_MODE (x);
7891 gcc_checking_assert (code == MULT);
7893 op0 = XEXP (x, 0);
7894 op1 = XEXP (x, 1);
7896 if (VECTOR_MODE_P (mode))
7897 mode = GET_MODE_INNER (mode);
7899 /* Integer multiply/fma. */
7900 if (GET_MODE_CLASS (mode) == MODE_INT)
7902 /* The multiply will be canonicalized as a shift, cost it as such. */
7903 if (aarch64_shift_p (GET_CODE (x))
7904 || (CONST_INT_P (op1)
7905 && exact_log2 (INTVAL (op1)) > 0))
7907 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7908 || GET_CODE (op0) == SIGN_EXTEND;
7909 if (speed)
7911 if (compound_p)
7913 /* If the shift is considered cheap,
7914 then don't add any cost. */
7915 if (aarch64_cheap_mult_shift_p (x))
7917 else if (REG_P (op1))
7918 /* ARITH + shift-by-register. */
7919 cost += extra_cost->alu.arith_shift_reg;
7920 else if (is_extend)
7921 /* ARITH + extended register. We don't have a cost field
7922 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7923 cost += extra_cost->alu.extend_arith;
7924 else
7925 /* ARITH + shift-by-immediate. */
7926 cost += extra_cost->alu.arith_shift;
7928 else
7929 /* LSL (immediate). */
7930 cost += extra_cost->alu.shift;
7933 /* Strip extends as we will have costed them in the case above. */
7934 if (is_extend)
7935 op0 = aarch64_strip_extend (op0, true);
7937 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7939 return cost;
7942 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7943 compound and let the below cases handle it. After all, MNEG is a
7944 special-case alias of MSUB. */
7945 if (GET_CODE (op0) == NEG)
7947 op0 = XEXP (op0, 0);
7948 compound_p = true;
7951 /* Integer multiplies or FMAs have zero/sign extending variants. */
7952 if ((GET_CODE (op0) == ZERO_EXTEND
7953 && GET_CODE (op1) == ZERO_EXTEND)
7954 || (GET_CODE (op0) == SIGN_EXTEND
7955 && GET_CODE (op1) == SIGN_EXTEND))
7957 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7958 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7960 if (speed)
7962 if (compound_p)
7963 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7964 cost += extra_cost->mult[0].extend_add;
7965 else
7966 /* MUL/SMULL/UMULL. */
7967 cost += extra_cost->mult[0].extend;
7970 return cost;
7973 /* This is either an integer multiply or a MADD. In both cases
7974 we want to recurse and cost the operands. */
7975 cost += rtx_cost (op0, mode, MULT, 0, speed);
7976 cost += rtx_cost (op1, mode, MULT, 1, speed);
7978 if (speed)
7980 if (compound_p)
7981 /* MADD/MSUB. */
7982 cost += extra_cost->mult[mode == DImode].add;
7983 else
7984 /* MUL. */
7985 cost += extra_cost->mult[mode == DImode].simple;
7988 return cost;
7990 else
7992 if (speed)
7994 /* Floating-point FMA/FMUL can also support negations of the
7995 operands, unless the rounding mode is upward or downward in
7996 which case FNMUL is different from FMUL with operand negation. */
7997 bool neg0 = GET_CODE (op0) == NEG;
7998 bool neg1 = GET_CODE (op1) == NEG;
7999 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8001 if (neg0)
8002 op0 = XEXP (op0, 0);
8003 if (neg1)
8004 op1 = XEXP (op1, 0);
8007 if (compound_p)
8008 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8009 cost += extra_cost->fp[mode == DFmode].fma;
8010 else
8011 /* FMUL/FNMUL. */
8012 cost += extra_cost->fp[mode == DFmode].mult;
8015 cost += rtx_cost (op0, mode, MULT, 0, speed);
8016 cost += rtx_cost (op1, mode, MULT, 1, speed);
8017 return cost;
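/* Illustrative example (the cost fields are the ones used above, the
   rtx shape is hypothetical): costing
   (plus (mult (reg x) (const_int 4)) (reg y)) with OUTER == PLUS treats
   the multiply as a shift by 2, so a speed costing adds
   extra_cost->alu.arith_shift for the combined ADD-with-LSL, or nothing
   at all if the tuning sets AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, plus
   the recursive cost of (reg x).  The (reg y) operand is costed by the
   caller.  */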
8021 static int
8022 aarch64_address_cost (rtx x,
8023 machine_mode mode,
8024 addr_space_t as ATTRIBUTE_UNUSED,
8025 bool speed)
8027 enum rtx_code c = GET_CODE (x);
8028 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8029 struct aarch64_address_info info;
8030 int cost = 0;
8031 info.shift = 0;
8033 if (!aarch64_classify_address (&info, x, mode, false))
8035 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8037 /* This is a CONST or SYMBOL ref which will be split
8038 in a different way depending on the code model in use.
8039 Cost it through the generic infrastructure. */
8040 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8041 /* Divide through by the cost of one instruction to
8042 bring it to the same units as the address costs. */
8043 cost_symbol_ref /= COSTS_N_INSNS (1);
8044 /* The cost is then the cost of preparing the address,
8045 followed by an immediate (possibly 0) offset. */
8046 return cost_symbol_ref + addr_cost->imm_offset;
8048 else
8050 /* This is most likely a jump table from a case
8051 statement. */
8052 return addr_cost->register_offset;
8056 switch (info.type)
8058 case ADDRESS_LO_SUM:
8059 case ADDRESS_SYMBOLIC:
8060 case ADDRESS_REG_IMM:
8061 cost += addr_cost->imm_offset;
8062 break;
8064 case ADDRESS_REG_WB:
8065 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8066 cost += addr_cost->pre_modify;
8067 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8068 cost += addr_cost->post_modify;
8069 else
8070 gcc_unreachable ();
8072 break;
8074 case ADDRESS_REG_REG:
8075 cost += addr_cost->register_offset;
8076 break;
8078 case ADDRESS_REG_SXTW:
8079 cost += addr_cost->register_sextend;
8080 break;
8082 case ADDRESS_REG_UXTW:
8083 cost += addr_cost->register_zextend;
8084 break;
8086 default:
8087 gcc_unreachable ();
8091 if (info.shift > 0)
8093 /* For the sake of calculating the cost of the shifted register
8094 component, we can treat same sized modes in the same way. */
8095 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8096 cost += addr_cost->addr_scale_costs.hi;
8097 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8098 cost += addr_cost->addr_scale_costs.si;
8099 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8100 cost += addr_cost->addr_scale_costs.di;
8101 else
8102 /* We can't tell, or this is a 128-bit vector. */
8103 cost += addr_cost->addr_scale_costs.ti;
8106 return cost;
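/* Rough mapping from address forms to the buckets used above, with the
   registers chosen purely for illustration:
     [x0, #16]                -> imm_offset
     [x0, x1]                 -> register_offset
     [x0, x1, lsl #3]         -> register_offset plus a scale cost keyed
                                 on the access size
     [x0, w1, sxtw #2]        -> register_sextend
     [x0, w1, uxtw #2]        -> register_zextend
     pre/post inc, dec and modify forms -> pre_modify or post_modify.  */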
8109 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8110 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8111 to be taken. */
8114 aarch64_branch_cost (bool speed_p, bool predictable_p)
8116 /* When optimizing for speed, use the cost of unpredictable branches. */
8117 const struct cpu_branch_cost *branch_costs =
8118 aarch64_tune_params.branch_costs;
8120 if (!speed_p || predictable_p)
8121 return branch_costs->predictable;
8122 else
8123 return branch_costs->unpredictable;
8126 /* Return true if the RTX X in mode MODE is a zero or sign extract
8127 usable in an ADD or SUB (extended register) instruction. */
8128 static bool
8129 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8131 /* Catch add with a sign extract.
8132 This is add_<optab><mode>_multp2. */
8133 if (GET_CODE (x) == SIGN_EXTRACT
8134 || GET_CODE (x) == ZERO_EXTRACT)
8136 rtx op0 = XEXP (x, 0);
8137 rtx op1 = XEXP (x, 1);
8138 rtx op2 = XEXP (x, 2);
8140 if (GET_CODE (op0) == MULT
8141 && CONST_INT_P (op1)
8142 && op2 == const0_rtx
8143 && CONST_INT_P (XEXP (op0, 1))
8144 && aarch64_is_extend_from_extract (mode,
8145 XEXP (op0, 1),
8146 op1))
8148 return true;
8151 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8152 No shift. */
8153 else if (GET_CODE (x) == SIGN_EXTEND
8154 || GET_CODE (x) == ZERO_EXTEND)
8155 return REG_P (XEXP (x, 0));
8157 return false;
8160 static bool
8161 aarch64_frint_unspec_p (unsigned int u)
8163 switch (u)
8165 case UNSPEC_FRINTZ:
8166 case UNSPEC_FRINTP:
8167 case UNSPEC_FRINTM:
8168 case UNSPEC_FRINTA:
8169 case UNSPEC_FRINTN:
8170 case UNSPEC_FRINTX:
8171 case UNSPEC_FRINTI:
8172 return true;
8174 default:
8175 return false;
8179 /* Return true iff X is an rtx that will match an extr instruction
8180 i.e. as described in the *extr<mode>5_insn family of patterns.
8181 OP0 and OP1 will be set to the operands of the shifts involved
8182 on success and will be NULL_RTX otherwise. */
8184 static bool
8185 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8187 rtx op0, op1;
8188 scalar_int_mode mode;
8189 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8190 return false;
8192 *res_op0 = NULL_RTX;
8193 *res_op1 = NULL_RTX;
8195 if (GET_CODE (x) != IOR)
8196 return false;
8198 op0 = XEXP (x, 0);
8199 op1 = XEXP (x, 1);
8201 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8202 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8204 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8205 if (GET_CODE (op1) == ASHIFT)
8206 std::swap (op0, op1);
8208 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8209 return false;
8211 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8212 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8214 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8215 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8217 *res_op0 = XEXP (op0, 0);
8218 *res_op1 = XEXP (op1, 0);
8219 return true;
8223 return false;
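/* Example of the shape this recognizes, in DImode:
     (ior (ashift (reg a) (const_int 16)) (lshiftrt (reg b) (const_int 48)))
   matches because 16 + 48 == 64 and corresponds to
   "extr x0, xa, xb, #48", which computes ((a << 16) | (b >> 48)).
   The register names here are illustrative only.  */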
8226 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8227 storing it in *COST. Result is true if the total cost of the operation
8228 has now been calculated. */
8229 static bool
8230 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8232 rtx inner;
8233 rtx comparator;
8234 enum rtx_code cmpcode;
8236 if (COMPARISON_P (op0))
8238 inner = XEXP (op0, 0);
8239 comparator = XEXP (op0, 1);
8240 cmpcode = GET_CODE (op0);
8242 else
8244 inner = op0;
8245 comparator = const0_rtx;
8246 cmpcode = NE;
8249 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8251 /* Conditional branch. */
8252 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8253 return true;
8254 else
8256 if (cmpcode == NE || cmpcode == EQ)
8258 if (comparator == const0_rtx)
8260 /* TBZ/TBNZ/CBZ/CBNZ. */
8261 if (GET_CODE (inner) == ZERO_EXTRACT)
8262 /* TBZ/TBNZ. */
8263 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8264 ZERO_EXTRACT, 0, speed);
8265 else
8266 /* CBZ/CBNZ. */
8267 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8269 return true;
8272 else if (cmpcode == LT || cmpcode == GE)
8274 /* TBZ/TBNZ. */
8275 if (comparator == const0_rtx)
8276 return true;
8280 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8282 /* CCMP. */
8283 if (GET_CODE (op1) == COMPARE)
8285 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8286 if (XEXP (op1, 1) == const0_rtx)
8287 *cost += 1;
8288 if (speed)
8290 machine_mode mode = GET_MODE (XEXP (op1, 0));
8291 const struct cpu_cost_table *extra_cost
8292 = aarch64_tune_params.insn_extra_cost;
8294 if (GET_MODE_CLASS (mode) == MODE_INT)
8295 *cost += extra_cost->alu.arith;
8296 else
8297 *cost += extra_cost->fp[mode == DFmode].compare;
8299 return true;
8302 /* It's a conditional operation based on the status flags,
8303 so it must be some flavor of CSEL. */
8305 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8306 if (GET_CODE (op1) == NEG
8307 || GET_CODE (op1) == NOT
8308 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8309 op1 = XEXP (op1, 0);
8310 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8312 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8313 op1 = XEXP (op1, 0);
8314 op2 = XEXP (op2, 0);
8317 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8318 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8319 return true;
8322 /* We don't know what this is, cost all operands. */
8323 return false;
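/* For example, a conditional branch such as
     (if_then_else (eq (reg x) (const_int 0)) (label_ref L) (pc))
   is treated as a CBZ above, so only the compared register is costed,
   while a branch on (lt (reg x) (const_int 0)) against zero is treated
   as a TBZ/TBNZ on the sign bit and adds no extra cost at all.  */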
8326 /* Check whether X is a bitfield operation of the form shift + extend that
8327 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8328 operand to which the bitfield operation is applied. Otherwise return
8329 NULL_RTX. */
8331 static rtx
8332 aarch64_extend_bitfield_pattern_p (rtx x)
8334 rtx_code outer_code = GET_CODE (x);
8335 machine_mode outer_mode = GET_MODE (x);
8337 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8338 && outer_mode != SImode && outer_mode != DImode)
8339 return NULL_RTX;
8341 rtx inner = XEXP (x, 0);
8342 rtx_code inner_code = GET_CODE (inner);
8343 machine_mode inner_mode = GET_MODE (inner);
8344 rtx op = NULL_RTX;
8346 switch (inner_code)
8348 case ASHIFT:
8349 if (CONST_INT_P (XEXP (inner, 1))
8350 && (inner_mode == QImode || inner_mode == HImode))
8351 op = XEXP (inner, 0);
8352 break;
8353 case LSHIFTRT:
8354 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8355 && (inner_mode == QImode || inner_mode == HImode))
8356 op = XEXP (inner, 0);
8357 break;
8358 case ASHIFTRT:
8359 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8360 && (inner_mode == QImode || inner_mode == HImode))
8361 op = XEXP (inner, 0);
8362 break;
8363 default:
8364 break;
8367 return op;
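/* For instance, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   returns the inner (reg:HI x); that shape corresponds to a UBFX, while
   the sign-extend/arithmetic-shift variant corresponds to an SBFX and
   the extend-of-ashift variant to UBFIZ/SBFIZ.  */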
8370 /* Return true if the mask and a shift amount from an RTX of the form
8371 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8372 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8374 bool
8375 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8376 rtx shft_amnt)
8378 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8379 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8380 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8381 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
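/* Worked example in SImode: for (x << 8) & 0xff00, the mask shifted
   right by 8 is 0xff, 0xff + 1 is a power of two, and the low 8 bits of
   the mask are clear, so the combination is accepted and maps to
   "ubfiz w0, w1, #8, #8" (deposit the low 8 bits of w1 at bit 8).
   By contrast (x << 8) & 0xff0 is rejected, since bits below the shift
   amount are set in the mask.  The register numbers are illustrative.  */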
8384 /* Calculate the cost of calculating X, storing it in *COST. Result
8385 is true if the total cost of the operation has now been calculated. */
8386 static bool
8387 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8388 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8390 rtx op0, op1, op2;
8391 const struct cpu_cost_table *extra_cost
8392 = aarch64_tune_params.insn_extra_cost;
8393 int code = GET_CODE (x);
8394 scalar_int_mode int_mode;
8396 /* By default, assume that everything has equivalent cost to the
8397 cheapest instruction. Any additional costs are applied as a delta
8398 above this default. */
8399 *cost = COSTS_N_INSNS (1);
8401 switch (code)
8403 case SET:
8404 /* The cost depends entirely on the operands to SET. */
8405 *cost = 0;
8406 op0 = SET_DEST (x);
8407 op1 = SET_SRC (x);
8409 switch (GET_CODE (op0))
8411 case MEM:
8412 if (speed)
8414 rtx address = XEXP (op0, 0);
8415 if (VECTOR_MODE_P (mode))
8416 *cost += extra_cost->ldst.storev;
8417 else if (GET_MODE_CLASS (mode) == MODE_INT)
8418 *cost += extra_cost->ldst.store;
8419 else if (mode == SFmode)
8420 *cost += extra_cost->ldst.storef;
8421 else if (mode == DFmode)
8422 *cost += extra_cost->ldst.stored;
8424 *cost +=
8425 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8426 0, speed));
8429 *cost += rtx_cost (op1, mode, SET, 1, speed);
8430 return true;
8432 case SUBREG:
8433 if (! REG_P (SUBREG_REG (op0)))
8434 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8436 /* Fall through. */
8437 case REG:
8438 /* The cost is one per vector-register copied. */
8439 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8441 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8442 *cost = COSTS_N_INSNS (nregs);
8444 /* const0_rtx is in general free, but we will use an
8445 instruction to set a register to 0. */
8446 else if (REG_P (op1) || op1 == const0_rtx)
8448 /* The cost is 1 per register copied. */
8449 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8450 *cost = COSTS_N_INSNS (nregs);
8452 else
8453 /* Cost is just the cost of the RHS of the set. */
8454 *cost += rtx_cost (op1, mode, SET, 1, speed);
8455 return true;
8457 case ZERO_EXTRACT:
8458 case SIGN_EXTRACT:
8459 /* Bit-field insertion. Strip any redundant widening of
8460 the RHS to meet the width of the target. */
8461 if (GET_CODE (op1) == SUBREG)
8462 op1 = SUBREG_REG (op1);
8463 if ((GET_CODE (op1) == ZERO_EXTEND
8464 || GET_CODE (op1) == SIGN_EXTEND)
8465 && CONST_INT_P (XEXP (op0, 1))
8466 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8467 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8468 op1 = XEXP (op1, 0);
8470 if (CONST_INT_P (op1))
8472 /* MOV immediate is assumed to always be cheap. */
8473 *cost = COSTS_N_INSNS (1);
8475 else
8477 /* BFM. */
8478 if (speed)
8479 *cost += extra_cost->alu.bfi;
8480 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8483 return true;
8485 default:
8486 /* We can't make sense of this, assume default cost. */
8487 *cost = COSTS_N_INSNS (1);
8488 return false;
8490 return false;
8492 case CONST_INT:
8493 /* If an instruction can incorporate a constant within the
8494 instruction, the instruction's expression avoids calling
8495 rtx_cost() on the constant. If rtx_cost() is called on a
8496 constant, then it is usually because the constant must be
8497 moved into a register by one or more instructions.
8499 The exception is constant 0, which can be expressed
8500 as XZR/WZR and is therefore free. The exception to this is
8501 if we have (set (reg) (const0_rtx)) in which case we must cost
8502 the move. However, we can catch that when we cost the SET, so
8503 we don't need to consider that here. */
8504 if (x == const0_rtx)
8505 *cost = 0;
8506 else
8508 /* To an approximation, building any other constant is
8509 proportionally expensive to the number of instructions
8510 required to build that constant. This is true whether we
8511 are compiling for SPEED or otherwise. */
8512 if (!is_a <scalar_int_mode> (mode, &int_mode))
8513 int_mode = word_mode;
8514 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8515 (NULL_RTX, x, false, int_mode));
8517 return true;
8519 case CONST_DOUBLE:
8521 /* First determine number of instructions to do the move
8522 as an integer constant. */
8523 if (!aarch64_float_const_representable_p (x)
8524 && !aarch64_can_const_movi_rtx_p (x, mode)
8525 && aarch64_float_const_rtx_p (x))
8527 unsigned HOST_WIDE_INT ival;
8528 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8529 gcc_assert (succeed);
8531 scalar_int_mode imode = (mode == HFmode
8532 ? SImode
8533 : int_mode_for_mode (mode).require ());
8534 int ncost = aarch64_internal_mov_immediate
8535 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8536 *cost += COSTS_N_INSNS (ncost);
8537 return true;
8540 if (speed)
8542 /* mov[df,sf]_aarch64. */
8543 if (aarch64_float_const_representable_p (x))
8544 /* FMOV (scalar immediate). */
8545 *cost += extra_cost->fp[mode == DFmode].fpconst;
8546 else if (!aarch64_float_const_zero_rtx_p (x))
8548 /* This will be a load from memory. */
8549 if (mode == DFmode)
8550 *cost += extra_cost->ldst.loadd;
8551 else
8552 *cost += extra_cost->ldst.loadf;
8554 else
8555 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8556 or MOV v0.s[0], wzr - neither of which is modeled by the
8557 cost tables. Just use the default cost. */
8562 return true;
8564 case MEM:
8565 if (speed)
8567 /* For loads we want the base cost of a load, plus an
8568 approximation for the additional cost of the addressing
8569 mode. */
8570 rtx address = XEXP (x, 0);
8571 if (VECTOR_MODE_P (mode))
8572 *cost += extra_cost->ldst.loadv;
8573 else if (GET_MODE_CLASS (mode) == MODE_INT)
8574 *cost += extra_cost->ldst.load;
8575 else if (mode == SFmode)
8576 *cost += extra_cost->ldst.loadf;
8577 else if (mode == DFmode)
8578 *cost += extra_cost->ldst.loadd;
8580 *cost +=
8581 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8582 0, speed));
8585 return true;
8587 case NEG:
8588 op0 = XEXP (x, 0);
8590 if (VECTOR_MODE_P (mode))
8592 if (speed)
8594 /* FNEG. */
8595 *cost += extra_cost->vect.alu;
8597 return false;
8600 if (GET_MODE_CLASS (mode) == MODE_INT)
8602 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8603 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8605 /* CSETM. */
8606 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8607 return true;
8610 /* Cost this as SUB wzr, X. */
8611 op0 = CONST0_RTX (mode);
8612 op1 = XEXP (x, 0);
8613 goto cost_minus;
8616 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8618 /* Support (neg(fma...)) as a single instruction only if
8619 sign of zeros is unimportant. This matches the decision
8620 making in aarch64.md. */
8621 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8623 /* FNMADD. */
8624 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8625 return true;
8627 if (GET_CODE (op0) == MULT)
8629 /* FNMUL. */
8630 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8631 return true;
8633 if (speed)
8634 /* FNEG. */
8635 *cost += extra_cost->fp[mode == DFmode].neg;
8636 return false;
8639 return false;
8641 case CLRSB:
8642 case CLZ:
8643 if (speed)
8645 if (VECTOR_MODE_P (mode))
8646 *cost += extra_cost->vect.alu;
8647 else
8648 *cost += extra_cost->alu.clz;
8651 return false;
8653 case COMPARE:
8654 op0 = XEXP (x, 0);
8655 op1 = XEXP (x, 1);
8657 if (op1 == const0_rtx
8658 && GET_CODE (op0) == AND)
8660 x = op0;
8661 mode = GET_MODE (op0);
8662 goto cost_logic;
8665 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8667 /* TODO: A write to the CC flags possibly costs extra; this
8668 needs encoding in the cost tables. */
8670 mode = GET_MODE (op0);
8671 /* ANDS. */
8672 if (GET_CODE (op0) == AND)
8674 x = op0;
8675 goto cost_logic;
8678 if (GET_CODE (op0) == PLUS)
8680 /* ADDS (and CMN alias). */
8681 x = op0;
8682 goto cost_plus;
8685 if (GET_CODE (op0) == MINUS)
8687 /* SUBS. */
8688 x = op0;
8689 goto cost_minus;
8692 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8693 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8694 && CONST_INT_P (XEXP (op0, 2)))
8696 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8697 Handle it here directly rather than going to cost_logic
8698 since we know the immediate generated for the TST is valid
8699 so we can avoid creating an intermediate rtx for it only
8700 for costing purposes. */
8701 if (speed)
8702 *cost += extra_cost->alu.logical;
8704 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8705 ZERO_EXTRACT, 0, speed);
8706 return true;
8709 if (GET_CODE (op1) == NEG)
8711 /* CMN. */
8712 if (speed)
8713 *cost += extra_cost->alu.arith;
8715 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8716 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8717 return true;
8720 /* CMP.
8722 Compare can freely swap the order of operands, and
8723 canonicalization puts the more complex operation first.
8724 But the integer MINUS logic expects the shift/extend
8725 operation in op1. */
8726 if (! (REG_P (op0)
8727 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8729 op0 = XEXP (x, 1);
8730 op1 = XEXP (x, 0);
8732 goto cost_minus;
8735 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8737 /* FCMP. */
8738 if (speed)
8739 *cost += extra_cost->fp[mode == DFmode].compare;
8741 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8743 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8744 /* FCMP supports constant 0.0 for no extra cost. */
8745 return true;
8747 return false;
8750 if (VECTOR_MODE_P (mode))
8752 /* Vector compare. */
8753 if (speed)
8754 *cost += extra_cost->vect.alu;
8756 if (aarch64_float_const_zero_rtx_p (op1))
8758 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8759 cost. */
8760 return true;
8762 return false;
8764 return false;
8766 case MINUS:
8768 op0 = XEXP (x, 0);
8769 op1 = XEXP (x, 1);
8771 cost_minus:
8772 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8774 /* Detect valid immediates. */
8775 if ((GET_MODE_CLASS (mode) == MODE_INT
8776 || (GET_MODE_CLASS (mode) == MODE_CC
8777 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8778 && CONST_INT_P (op1)
8779 && aarch64_uimm12_shift (INTVAL (op1)))
8781 if (speed)
8782 /* SUB(S) (immediate). */
8783 *cost += extra_cost->alu.arith;
8784 return true;
8787 /* Look for SUB (extended register). */
8788 if (is_a <scalar_int_mode> (mode, &int_mode)
8789 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8791 if (speed)
8792 *cost += extra_cost->alu.extend_arith;
8794 op1 = aarch64_strip_extend (op1, true);
8795 *cost += rtx_cost (op1, VOIDmode,
8796 (enum rtx_code) GET_CODE (op1), 0, speed);
8797 return true;
8800 rtx new_op1 = aarch64_strip_extend (op1, false);
8802 /* Cost this as an FMA-alike operation. */
8803 if ((GET_CODE (new_op1) == MULT
8804 || aarch64_shift_p (GET_CODE (new_op1)))
8805 && code != COMPARE)
8807 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8808 (enum rtx_code) code,
8809 speed);
8810 return true;
8813 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8815 if (speed)
8817 if (VECTOR_MODE_P (mode))
8819 /* Vector SUB. */
8820 *cost += extra_cost->vect.alu;
8822 else if (GET_MODE_CLASS (mode) == MODE_INT)
8824 /* SUB(S). */
8825 *cost += extra_cost->alu.arith;
8827 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8829 /* FSUB. */
8830 *cost += extra_cost->fp[mode == DFmode].addsub;
8833 return true;
8836 case PLUS:
8838 rtx new_op0;
8840 op0 = XEXP (x, 0);
8841 op1 = XEXP (x, 1);
8843 cost_plus:
8844 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8845 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8847 /* CSINC. */
8848 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8849 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8850 return true;
8853 if (GET_MODE_CLASS (mode) == MODE_INT
8854 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8855 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8857 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8859 if (speed)
8860 /* ADD (immediate). */
8861 *cost += extra_cost->alu.arith;
8862 return true;
8865 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8867 /* Look for ADD (extended register). */
8868 if (is_a <scalar_int_mode> (mode, &int_mode)
8869 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8871 if (speed)
8872 *cost += extra_cost->alu.extend_arith;
8874 op0 = aarch64_strip_extend (op0, true);
8875 *cost += rtx_cost (op0, VOIDmode,
8876 (enum rtx_code) GET_CODE (op0), 0, speed);
8877 return true;
8880 /* Strip any extend, leave shifts behind as we will
8881 cost them through mult_cost. */
8882 new_op0 = aarch64_strip_extend (op0, false);
8884 if (GET_CODE (new_op0) == MULT
8885 || aarch64_shift_p (GET_CODE (new_op0)))
8887 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8888 speed);
8889 return true;
8892 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8894 if (speed)
8896 if (VECTOR_MODE_P (mode))
8898 /* Vector ADD. */
8899 *cost += extra_cost->vect.alu;
8901 else if (GET_MODE_CLASS (mode) == MODE_INT)
8903 /* ADD. */
8904 *cost += extra_cost->alu.arith;
8906 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8908 /* FADD. */
8909 *cost += extra_cost->fp[mode == DFmode].addsub;
8912 return true;
8915 case BSWAP:
8916 *cost = COSTS_N_INSNS (1);
8918 if (speed)
8920 if (VECTOR_MODE_P (mode))
8921 *cost += extra_cost->vect.alu;
8922 else
8923 *cost += extra_cost->alu.rev;
8925 return false;
8927 case IOR:
8928 if (aarch_rev16_p (x))
8930 *cost = COSTS_N_INSNS (1);
8932 if (speed)
8934 if (VECTOR_MODE_P (mode))
8935 *cost += extra_cost->vect.alu;
8936 else
8937 *cost += extra_cost->alu.rev;
8939 return true;
8942 if (aarch64_extr_rtx_p (x, &op0, &op1))
8944 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8945 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8946 if (speed)
8947 *cost += extra_cost->alu.shift;
8949 return true;
8951 /* Fall through. */
8952 case XOR:
8953 case AND:
8954 cost_logic:
8955 op0 = XEXP (x, 0);
8956 op1 = XEXP (x, 1);
8958 if (VECTOR_MODE_P (mode))
8960 if (speed)
8961 *cost += extra_cost->vect.alu;
8962 return true;
8965 if (code == AND
8966 && GET_CODE (op0) == MULT
8967 && CONST_INT_P (XEXP (op0, 1))
8968 && CONST_INT_P (op1)
8969 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8970 INTVAL (op1)) != 0)
8972 /* This is a UBFM/SBFM. */
8973 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8974 if (speed)
8975 *cost += extra_cost->alu.bfx;
8976 return true;
8979 if (is_int_mode (mode, &int_mode))
8981 if (CONST_INT_P (op1))
8983 /* We have a mask + shift version of a UBFIZ
8984 i.e. the *andim_ashift<mode>_bfiz pattern. */
8985 if (GET_CODE (op0) == ASHIFT
8986 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
8987 XEXP (op0, 1)))
8989 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8990 (enum rtx_code) code, 0, speed);
8991 if (speed)
8992 *cost += extra_cost->alu.bfx;
8994 return true;
8996 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8998 /* We possibly get the immediate for free; this is not
8999 modelled. */
9000 *cost += rtx_cost (op0, int_mode,
9001 (enum rtx_code) code, 0, speed);
9002 if (speed)
9003 *cost += extra_cost->alu.logical;
9005 return true;
9008 else
9010 rtx new_op0 = op0;
9012 /* Handle ORN, EON, or BIC. */
9013 if (GET_CODE (op0) == NOT)
9014 op0 = XEXP (op0, 0);
9016 new_op0 = aarch64_strip_shift (op0);
9018 /* If we had a shift on op0 then this is a logical-shift-
9019 by-register/immediate operation. Otherwise, this is just
9020 a logical operation. */
9021 if (speed)
9023 if (new_op0 != op0)
9025 /* Shift by immediate. */
9026 if (CONST_INT_P (XEXP (op0, 1)))
9027 *cost += extra_cost->alu.log_shift;
9028 else
9029 *cost += extra_cost->alu.log_shift_reg;
9031 else
9032 *cost += extra_cost->alu.logical;
9035 /* In both cases we want to cost both operands. */
9036 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9037 0, speed);
9038 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9039 1, speed);
9041 return true;
9044 return false;
9046 case NOT:
9047 x = XEXP (x, 0);
9048 op0 = aarch64_strip_shift (x);
9050 if (VECTOR_MODE_P (mode))
9052 /* Vector NOT. */
9053 *cost += extra_cost->vect.alu;
9054 return false;
9057 /* MVN-shifted-reg. */
9058 if (op0 != x)
9060 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9062 if (speed)
9063 *cost += extra_cost->alu.log_shift;
9065 return true;
9067 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9068 Handle the second form here taking care that 'a' in the above can
9069 be a shift. */
9070 else if (GET_CODE (op0) == XOR)
9072 rtx newop0 = XEXP (op0, 0);
9073 rtx newop1 = XEXP (op0, 1);
9074 rtx op0_stripped = aarch64_strip_shift (newop0);
9076 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9077 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9079 if (speed)
9081 if (op0_stripped != newop0)
9082 *cost += extra_cost->alu.log_shift;
9083 else
9084 *cost += extra_cost->alu.logical;
9087 return true;
9089 /* MVN. */
9090 if (speed)
9091 *cost += extra_cost->alu.logical;
9093 return false;
9095 case ZERO_EXTEND:
9097 op0 = XEXP (x, 0);
9098 /* If a value is written in SI mode, then zero extended to DI
9099 mode, the operation will in general be free as a write to
9100 a 'w' register implicitly zeroes the upper bits of an 'x'
9101 register. However, if this is
9103 (set (reg) (zero_extend (reg)))
9105 we must cost the explicit register move. */
9106 if (mode == DImode
9107 && GET_MODE (op0) == SImode
9108 && outer == SET)
9110 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9112 /* If OP_COST is non-zero, then the cost of the zero extend
9113 is effectively the cost of the inner operation. Otherwise
9114 we have a MOV instruction and we take the cost from the MOV
9115 itself. This is true independently of whether we are
9116 optimizing for space or time. */
9117 if (op_cost)
9118 *cost = op_cost;
9120 return true;
9122 else if (MEM_P (op0))
9124 /* All loads can zero extend to any size for free. */
9125 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9126 return true;
9129 op0 = aarch64_extend_bitfield_pattern_p (x);
9130 if (op0)
9132 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9133 if (speed)
9134 *cost += extra_cost->alu.bfx;
9135 return true;
9138 if (speed)
9140 if (VECTOR_MODE_P (mode))
9142 /* UMOV. */
9143 *cost += extra_cost->vect.alu;
9145 else
9147 /* We generate an AND instead of UXTB/UXTH. */
9148 *cost += extra_cost->alu.logical;
9151 return false;
9153 case SIGN_EXTEND:
9154 if (MEM_P (XEXP (x, 0)))
9156 /* LDRSH. */
9157 if (speed)
9159 rtx address = XEXP (XEXP (x, 0), 0);
9160 *cost += extra_cost->ldst.load_sign_extend;
9162 *cost +=
9163 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9164 0, speed));
9166 return true;
9169 op0 = aarch64_extend_bitfield_pattern_p (x);
9170 if (op0)
9172 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9173 if (speed)
9174 *cost += extra_cost->alu.bfx;
9175 return true;
9178 if (speed)
9180 if (VECTOR_MODE_P (mode))
9181 *cost += extra_cost->vect.alu;
9182 else
9183 *cost += extra_cost->alu.extend;
9185 return false;
9187 case ASHIFT:
9188 op0 = XEXP (x, 0);
9189 op1 = XEXP (x, 1);
9191 if (CONST_INT_P (op1))
9193 if (speed)
9195 if (VECTOR_MODE_P (mode))
9197 /* Vector shift (immediate). */
9198 *cost += extra_cost->vect.alu;
9200 else
9202 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9203 aliases. */
9204 *cost += extra_cost->alu.shift;
9208 /* We can incorporate zero/sign extend for free. */
9209 if (GET_CODE (op0) == ZERO_EXTEND
9210 || GET_CODE (op0) == SIGN_EXTEND)
9211 op0 = XEXP (op0, 0);
9213 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9214 return true;
9216 else
9218 if (VECTOR_MODE_P (mode))
9220 if (speed)
9221 /* Vector shift (register). */
9222 *cost += extra_cost->vect.alu;
9224 else
9226 if (speed)
9227 /* LSLV. */
9228 *cost += extra_cost->alu.shift_reg;
9230 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9231 && CONST_INT_P (XEXP (op1, 1))
9232 && known_eq (INTVAL (XEXP (op1, 1)),
9233 GET_MODE_BITSIZE (mode) - 1))
9235 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9236 /* We already demanded XEXP (op1, 0) to be REG_P, so
9237 don't recurse into it. */
9238 return true;
9241 return false; /* All arguments need to be in registers. */
9244 case ROTATE:
9245 case ROTATERT:
9246 case LSHIFTRT:
9247 case ASHIFTRT:
9248 op0 = XEXP (x, 0);
9249 op1 = XEXP (x, 1);
9251 if (CONST_INT_P (op1))
9253 /* ASR (immediate) and friends. */
9254 if (speed)
9256 if (VECTOR_MODE_P (mode))
9257 *cost += extra_cost->vect.alu;
9258 else
9259 *cost += extra_cost->alu.shift;
9262 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9263 return true;
9265 else
9267 if (VECTOR_MODE_P (mode))
9269 if (speed)
9270 /* Vector shift (register). */
9271 *cost += extra_cost->vect.alu;
9273 else
9275 if (speed)
9276 /* ASR (register) and friends. */
9277 *cost += extra_cost->alu.shift_reg;
9279 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9280 && CONST_INT_P (XEXP (op1, 1))
9281 && known_eq (INTVAL (XEXP (op1, 1)),
9282 GET_MODE_BITSIZE (mode) - 1))
9284 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9285 /* We already demanded XEXP (op1, 0) to be REG_P, so
9286 don't recurse into it. */
9287 return true;
9290 return false; /* All arguments need to be in registers. */
9293 case SYMBOL_REF:
9295 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9296 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9298 /* LDR. */
9299 if (speed)
9300 *cost += extra_cost->ldst.load;
9302 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9303 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9305 /* ADRP, followed by ADD. */
9306 *cost += COSTS_N_INSNS (1);
9307 if (speed)
9308 *cost += 2 * extra_cost->alu.arith;
9310 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9311 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9313 /* ADR. */
9314 if (speed)
9315 *cost += extra_cost->alu.arith;
9318 if (flag_pic)
9320 /* One extra load instruction, after accessing the GOT. */
9321 *cost += COSTS_N_INSNS (1);
9322 if (speed)
9323 *cost += extra_cost->ldst.load;
9325 return true;
9327 case HIGH:
9328 case LO_SUM:
9329 /* ADRP/ADD (immediate). */
9330 if (speed)
9331 *cost += extra_cost->alu.arith;
9332 return true;
9334 case ZERO_EXTRACT:
9335 case SIGN_EXTRACT:
9336 /* UBFX/SBFX. */
9337 if (speed)
9339 if (VECTOR_MODE_P (mode))
9340 *cost += extra_cost->vect.alu;
9341 else
9342 *cost += extra_cost->alu.bfx;
9345 /* We can trust that the immediates used will be correct (there
9346 are no by-register forms), so we need only cost op0. */
9347 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9348 return true;
9350 case MULT:
9351 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9352 /* aarch64_rtx_mult_cost always handles recursion to its
9353 operands. */
9354 return true;
9356 case MOD:
9357 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
9358 ANDs and a CSNEG. Assume here that CSNEG costs the same as an
9359 unconditional negate. This case should only ever be reached through
9360 the set_smod_pow2_cheap check in expmed.c. */
9361 if (CONST_INT_P (XEXP (x, 1))
9362 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9363 && (mode == SImode || mode == DImode))
9365 /* We expand to 4 instructions. Reset the baseline. */
9366 *cost = COSTS_N_INSNS (4);
9368 if (speed)
9369 *cost += 2 * extra_cost->alu.logical
9370 + 2 * extra_cost->alu.arith;
9372 return true;
9375 /* Fall-through. */
9376 case UMOD:
9377 if (speed)
9379 /* Slightly prefer UMOD over SMOD. */
9380 if (VECTOR_MODE_P (mode))
9381 *cost += extra_cost->vect.alu;
9382 else if (GET_MODE_CLASS (mode) == MODE_INT)
9383 *cost += (extra_cost->mult[mode == DImode].add
9384 + extra_cost->mult[mode == DImode].idiv
9385 + (code == MOD ? 1 : 0));
9387 return false; /* All arguments need to be in registers. */
9389 case DIV:
9390 case UDIV:
9391 case SQRT:
9392 if (speed)
9394 if (VECTOR_MODE_P (mode))
9395 *cost += extra_cost->vect.alu;
9396 else if (GET_MODE_CLASS (mode) == MODE_INT)
9397 /* There is no integer SQRT, so only DIV and UDIV can get
9398 here. */
9399 *cost += (extra_cost->mult[mode == DImode].idiv
9400 /* Slightly prefer UDIV over SDIV. */
9401 + (code == DIV ? 1 : 0));
9402 else
9403 *cost += extra_cost->fp[mode == DFmode].div;
9405 return false; /* All arguments need to be in registers. */
9407 case IF_THEN_ELSE:
9408 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9409 XEXP (x, 2), cost, speed);
9411 case EQ:
9412 case NE:
9413 case GT:
9414 case GTU:
9415 case LT:
9416 case LTU:
9417 case GE:
9418 case GEU:
9419 case LE:
9420 case LEU:
9422 return false; /* All arguments must be in registers. */
9424 case FMA:
9425 op0 = XEXP (x, 0);
9426 op1 = XEXP (x, 1);
9427 op2 = XEXP (x, 2);
9429 if (speed)
9431 if (VECTOR_MODE_P (mode))
9432 *cost += extra_cost->vect.alu;
9433 else
9434 *cost += extra_cost->fp[mode == DFmode].fma;
9437 /* FMSUB, FNMADD, and FNMSUB are free. */
9438 if (GET_CODE (op0) == NEG)
9439 op0 = XEXP (op0, 0);
9441 if (GET_CODE (op2) == NEG)
9442 op2 = XEXP (op2, 0);
9444 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9445 and the by-element operand as operand 0. */
9446 if (GET_CODE (op1) == NEG)
9447 op1 = XEXP (op1, 0);
9449 /* Catch vector-by-element operations. The by-element operand can
9450 either be (vec_duplicate (vec_select (x))) or just
9451 (vec_select (x)), depending on whether we are multiplying by
9452 a vector or a scalar.
9454 Canonicalization is not very good in these cases: FMA4 will put the
9455 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9456 if (GET_CODE (op0) == VEC_DUPLICATE)
9457 op0 = XEXP (op0, 0);
9458 else if (GET_CODE (op1) == VEC_DUPLICATE)
9459 op1 = XEXP (op1, 0);
9461 if (GET_CODE (op0) == VEC_SELECT)
9462 op0 = XEXP (op0, 0);
9463 else if (GET_CODE (op1) == VEC_SELECT)
9464 op1 = XEXP (op1, 0);
9466 /* If the remaining parameters are not registers,
9467 get the cost to put them into registers. */
9468 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9469 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9470 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9471 return true;
9473 case FLOAT:
9474 case UNSIGNED_FLOAT:
9475 if (speed)
9476 *cost += extra_cost->fp[mode == DFmode].fromint;
9477 return false;
9479 case FLOAT_EXTEND:
9480 if (speed)
9482 if (VECTOR_MODE_P (mode))
9484 /* Vector widen. */
9485 *cost += extra_cost->vect.alu;
9487 else
9488 *cost += extra_cost->fp[mode == DFmode].widen;
9490 return false;
9492 case FLOAT_TRUNCATE:
9493 if (speed)
9495 if (VECTOR_MODE_P (mode))
9497 /* Vector conversion. */
9498 *cost += extra_cost->vect.alu;
9500 else
9501 *cost += extra_cost->fp[mode == DFmode].narrow;
9503 return false;
9505 case FIX:
9506 case UNSIGNED_FIX:
9507 x = XEXP (x, 0);
9508 /* Strip the rounding part. They will all be implemented
9509 by the fcvt* family of instructions anyway. */
9510 if (GET_CODE (x) == UNSPEC)
9512 unsigned int uns_code = XINT (x, 1);
9514 if (uns_code == UNSPEC_FRINTA
9515 || uns_code == UNSPEC_FRINTM
9516 || uns_code == UNSPEC_FRINTN
9517 || uns_code == UNSPEC_FRINTP
9518 || uns_code == UNSPEC_FRINTZ)
9519 x = XVECEXP (x, 0, 0);
9522 if (speed)
9524 if (VECTOR_MODE_P (mode))
9525 *cost += extra_cost->vect.alu;
9526 else
9527 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9530 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9531 fixed-point fcvt. */
9532 if (GET_CODE (x) == MULT
9533 && ((VECTOR_MODE_P (mode)
9534 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9535 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9537 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9538 0, speed);
9539 return true;
9542 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9543 return true;
9545 case ABS:
9546 if (VECTOR_MODE_P (mode))
9548 /* ABS (vector). */
9549 if (speed)
9550 *cost += extra_cost->vect.alu;
9552 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9554 op0 = XEXP (x, 0);
9556 /* FABD, which is analogous to FADD. */
9557 if (GET_CODE (op0) == MINUS)
9559 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9560 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9561 if (speed)
9562 *cost += extra_cost->fp[mode == DFmode].addsub;
9564 return true;
9566 /* Simple FABS is analogous to FNEG. */
9567 if (speed)
9568 *cost += extra_cost->fp[mode == DFmode].neg;
9570 else
9572 /* Integer ABS will either be split into
9573 two arithmetic instructions, or will be an ABS
9574 (scalar), which we don't model. */
9575 *cost = COSTS_N_INSNS (2);
9576 if (speed)
9577 *cost += 2 * extra_cost->alu.arith;
9579 return false;
9581 case SMAX:
9582 case SMIN:
9583 if (speed)
9585 if (VECTOR_MODE_P (mode))
9586 *cost += extra_cost->vect.alu;
9587 else
9589 /* FMAXNM/FMINNM/FMAX/FMIN.
9590 TODO: This may not be accurate for all implementations, but
9591 we do not model this in the cost tables. */
9592 *cost += extra_cost->fp[mode == DFmode].addsub;
9595 return false;
9597 case UNSPEC:
9598 /* The floating point round to integer frint* instructions. */
9599 if (aarch64_frint_unspec_p (XINT (x, 1)))
9601 if (speed)
9602 *cost += extra_cost->fp[mode == DFmode].roundint;
9604 return false;
9607 if (XINT (x, 1) == UNSPEC_RBIT)
9609 if (speed)
9610 *cost += extra_cost->alu.rev;
9612 return false;
9614 break;
9616 case TRUNCATE:
9618 /* Decompose <su>muldi3_highpart. */
9619 if (/* (truncate:DI */
9620 mode == DImode
9621 /* (lshiftrt:TI */
9622 && GET_MODE (XEXP (x, 0)) == TImode
9623 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9624 /* (mult:TI */
9625 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9626 /* (ANY_EXTEND:TI (reg:DI))
9627 (ANY_EXTEND:TI (reg:DI))) */
9628 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9629 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9630 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9631 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9632 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9633 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9634 /* (const_int 64) */
9635 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9636 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9638 /* UMULH/SMULH. */
9639 if (speed)
9640 *cost += extra_cost->mult[mode == DImode].extend;
9641 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9642 mode, MULT, 0, speed);
9643 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9644 mode, MULT, 1, speed);
9645 return true;
9648 /* Fall through. */
9649 default:
9650 break;
9653 if (dump_file
9654 && flag_aarch64_verbose_cost)
9655 fprintf (dump_file,
9656 "\nFailed to cost RTX. Assuming default cost.\n");
9658 return true;
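/* A small end-to-end example of how the pieces above combine (register
   and address choices are illustrative): costing
     (set (mem:DI (plus (reg x0) (const_int 16))) (reg x1))
   with SPEED set starts from *cost == 0 for the SET, adds
   extra_cost->ldst.store for the store itself, adds
   COSTS_N_INSNS (aarch64_address_cost (...)) for the reg+immediate
   address, and finally recurses into the stored register, which is
   free.  */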
9661 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9662 calculated for X. This cost is stored in *COST. Returns true
9663 if the total cost of X was calculated. */
9664 static bool
9665 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9666 int param, int *cost, bool speed)
9668 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9670 if (dump_file
9671 && flag_aarch64_verbose_cost)
9673 print_rtl_single (dump_file, x);
9674 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9675 speed ? "Hot" : "Cold",
9676 *cost, result ? "final" : "partial");
9679 return result;
9682 static int
9683 aarch64_register_move_cost (machine_mode mode,
9684 reg_class_t from_i, reg_class_t to_i)
9686 enum reg_class from = (enum reg_class) from_i;
9687 enum reg_class to = (enum reg_class) to_i;
9688 const struct cpu_regmove_cost *regmove_cost
9689 = aarch64_tune_params.regmove_cost;
9691 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9692 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9693 to = GENERAL_REGS;
9695 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9696 from = GENERAL_REGS;
9698 /* Moving between GPR and stack cost is the same as GP2GP. */
9699 if ((from == GENERAL_REGS && to == STACK_REG)
9700 || (to == GENERAL_REGS && from == STACK_REG))
9701 return regmove_cost->GP2GP;
9703 /* To/From the stack register, we move via the gprs. */
9704 if (to == STACK_REG || from == STACK_REG)
9705 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9706 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9708 if (known_eq (GET_MODE_SIZE (mode), 16))
9710 /* 128-bit operations on general registers require 2 instructions. */
9711 if (from == GENERAL_REGS && to == GENERAL_REGS)
9712 return regmove_cost->GP2GP * 2;
9713 else if (from == GENERAL_REGS)
9714 return regmove_cost->GP2FP * 2;
9715 else if (to == GENERAL_REGS)
9716 return regmove_cost->FP2GP * 2;
9718 /* When AdvSIMD instructions are disabled it is not possible to move
9719 a 128-bit value directly between Q registers. This is handled in
9720 secondary reload. A general register is used as a scratch to move
9721 the upper DI value and the lower DI value is moved directly,
9722 hence the cost is the sum of three moves. */
9723 if (! TARGET_SIMD)
9724 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9726 return regmove_cost->FP2FP;
9729 if (from == GENERAL_REGS && to == GENERAL_REGS)
9730 return regmove_cost->GP2GP;
9731 else if (from == GENERAL_REGS)
9732 return regmove_cost->GP2FP;
9733 else if (to == GENERAL_REGS)
9734 return regmove_cost->FP2GP;
9736 return regmove_cost->FP2FP;
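/* Worked example with made-up table values (GP2GP == 1, GP2FP == 5,
   FP2GP == 6, FP2FP == 2): a TImode copy between general registers
   costs 2 * 1 == 2, a general-to-FP copy costs 2 * 5 == 10, and, when
   !TARGET_SIMD, a 128-bit FP-to-FP copy goes through a general register
   and costs 5 + 6 + 2 == 13.  Smaller modes simply use the single-move
   entries.  */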
9739 static int
9740 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9741 reg_class_t rclass ATTRIBUTE_UNUSED,
9742 bool in ATTRIBUTE_UNUSED)
9744 return aarch64_tune_params.memmov_cost;
9747 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9748 to optimize 1.0/sqrt. */
9750 static bool
9751 use_rsqrt_p (machine_mode mode)
9753 return (!flag_trapping_math
9754 && flag_unsafe_math_optimizations
9755 && ((aarch64_tune_params.approx_modes->recip_sqrt
9756 & AARCH64_APPROX_MODE (mode))
9757 || flag_mrecip_low_precision_sqrt));
9760 /* Function to decide when to use the approximate reciprocal square root
9761 builtin. */
9763 static tree
9764 aarch64_builtin_reciprocal (tree fndecl)
9766 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9768 if (!use_rsqrt_p (mode))
9769 return NULL_TREE;
9770 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9773 /* Emit instruction sequence to compute either the approximate square root
9774 or its approximate reciprocal, depending on the flag RECP, and return
9775 whether the sequence was emitted or not. */
9777 bool
9778 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9780 machine_mode mode = GET_MODE (dst);
9782 if (GET_MODE_INNER (mode) == HFmode)
9784 gcc_assert (!recp);
9785 return false;
9788 if (!recp)
9790 if (!(flag_mlow_precision_sqrt
9791 || (aarch64_tune_params.approx_modes->sqrt
9792 & AARCH64_APPROX_MODE (mode))))
9793 return false;
9795 if (flag_finite_math_only
9796 || flag_trapping_math
9797 || !flag_unsafe_math_optimizations
9798 || optimize_function_for_size_p (cfun))
9799 return false;
9801 else
9802 /* Caller assumes we cannot fail. */
9803 gcc_assert (use_rsqrt_p (mode));
9805 machine_mode mmsk = mode_for_int_vector (mode).require ();
9806 rtx xmsk = gen_reg_rtx (mmsk);
9807 if (!recp)
9808 /* When calculating the approximate square root, compare the
9809 argument with 0.0 and create a mask. */
9810 emit_insn (gen_rtx_SET (xmsk,
9811 gen_rtx_NEG (mmsk,
9812 gen_rtx_EQ (mmsk, src,
9813 CONST0_RTX (mode)))));
9815 /* Estimate the approximate reciprocal square root. */
9816 rtx xdst = gen_reg_rtx (mode);
9817 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
9819 /* Iterate over the series twice for SF and thrice for DF. */
9820 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9822 /* Optionally perform one iteration fewer, trading some accuracy for
9823 faster performance. */
9824 if ((recp && flag_mrecip_low_precision_sqrt)
9825 || (!recp && flag_mlow_precision_sqrt))
9826 iterations--;
9828 /* Iterate over the series to calculate the approximate reciprocal square
9829 root. */
9830 rtx x1 = gen_reg_rtx (mode);
9831 while (iterations--)
9833 rtx x2 = gen_reg_rtx (mode);
9834 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9836 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
9838 if (iterations > 0)
9839 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9842 if (!recp)
9844 /* Qualify the approximate reciprocal square root when the argument is
9845 0.0 by squashing the intermediate result to 0.0. */
9846 rtx xtmp = gen_reg_rtx (mmsk);
9847 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9848 gen_rtx_SUBREG (mmsk, xdst, 0)));
9849 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9851 /* Calculate the approximate square root. */
9852 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9855 /* Finalize the approximation. */
9856 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9858 return true;
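/* A note on the math (a sketch of the standard analysis, not something this
   file states explicitly): FRSQRTE supplies an initial estimate x0 of
   1/sqrt (SRC), and each refinement step computes

       x_{n+1} = x_n * (3 - SRC * x_n * x_n) / 2

   since FRSQRTS (a, b) computes (3 - a * b) / 2; the multiply for the last
   step is folded into the finalizing multiplication above.  Each step
   roughly doubles the number of correct bits, hence two iterations for SF
   and three for DF.  For the non-reciprocal case the estimate is also
   multiplied by SRC, because sqrt (x) == x * (1 / sqrt (x)).  */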
9861 /* Emit the instruction sequence to compute the approximation for the division
9862 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9864 bool
9865 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9867 machine_mode mode = GET_MODE (quo);
9869 if (GET_MODE_INNER (mode) == HFmode)
9870 return false;
9872 bool use_approx_division_p = (flag_mlow_precision_div
9873 || (aarch64_tune_params.approx_modes->division
9874 & AARCH64_APPROX_MODE (mode)));
9876 if (!flag_finite_math_only
9877 || flag_trapping_math
9878 || !flag_unsafe_math_optimizations
9879 || optimize_function_for_size_p (cfun)
9880 || !use_approx_division_p)
9881 return false;
9883 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9884 return false;
9886 /* Estimate the approximate reciprocal. */
9887 rtx xrcp = gen_reg_rtx (mode);
9888 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
9890 /* Iterate over the series twice for SF and thrice for DF. */
9891 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9893 /* Optionally perform one iteration fewer, trading some accuracy for
9894 faster performance. */
9895 if (flag_mlow_precision_div)
9896 iterations--;
9898 /* Iterate over the series to calculate the approximate reciprocal. */
9899 rtx xtmp = gen_reg_rtx (mode);
9900 while (iterations--)
9902 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
9904 if (iterations > 0)
9905 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9908 if (num != CONST1_RTX (mode))
9910 /* As the approximate reciprocal of DEN is already calculated, only
9911 calculate the approximate division when NUM is not 1.0. */
9912 rtx xnum = force_reg (mode, num);
9913 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
9916 /* Finalize the approximation. */
9917 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
9918 return true;
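/* A note on the math, with a small model for illustration (not built, and
   not part of the original sequence): FRECPE supplies an initial estimate x0
   of 1/DEN, FRECPS (a, b) computes 2 - a * b, and each refinement step is
   the Newton-Raphson iteration x_{n+1} = x_n * (2 - DEN * x_n).  The
   multiply of the last step is folded into the multiplication by NUM above.
   A plain C sketch of the SF case (two iterations), with X0 standing in for
   the FRECPE result:  */
#if 0
static double
aarch64_approx_div_model (double num, double den, double x0)
{
  double x = x0;
  double step = 2.0 - x * den;	/* First FRECPS.  */
  x = x * step;			/* Refine the reciprocal estimate.  */
  step = 2.0 - x * den;		/* Second FRECPS.  */
  /* The final refinement multiply is fused with the multiply by NUM.  */
  return (num * x) * step;
}
#endif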
9921 /* Return the number of instructions that can be issued per cycle. */
9922 static int
9923 aarch64_sched_issue_rate (void)
9925 return aarch64_tune_params.issue_rate;
9928 static int
9929 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9931 int issue_rate = aarch64_sched_issue_rate ();
9933 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
9937 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9938 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9939 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9941 static int
9942 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
9943 int ready_index)
9945 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
9949 /* Vectorizer cost model target hooks. */
9951 /* Implement targetm.vectorize.builtin_vectorization_cost. */
9952 static int
9953 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
9954 tree vectype,
9955 int misalign ATTRIBUTE_UNUSED)
9957 unsigned elements;
9958 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
9959 bool fp = false;
9961 if (vectype != NULL)
9962 fp = FLOAT_TYPE_P (vectype);
9964 switch (type_of_cost)
9966 case scalar_stmt:
9967 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
9969 case scalar_load:
9970 return costs->scalar_load_cost;
9972 case scalar_store:
9973 return costs->scalar_store_cost;
9975 case vector_stmt:
9976 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9978 case vector_load:
9979 return costs->vec_align_load_cost;
9981 case vector_store:
9982 return costs->vec_store_cost;
9984 case vec_to_scalar:
9985 return costs->vec_to_scalar_cost;
9987 case scalar_to_vec:
9988 return costs->scalar_to_vec_cost;
9990 case unaligned_load:
9991 case vector_gather_load:
9992 return costs->vec_unalign_load_cost;
9994 case unaligned_store:
9995 case vector_scatter_store:
9996 return costs->vec_unalign_store_cost;
9998 case cond_branch_taken:
9999 return costs->cond_taken_branch_cost;
10001 case cond_branch_not_taken:
10002 return costs->cond_not_taken_branch_cost;
10004 case vec_perm:
10005 return costs->vec_permute_cost;
10007 case vec_promote_demote:
10008 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10010 case vec_construct:
10011 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10012 return elements / 2 + 1;
10014 default:
10015 gcc_unreachable ();
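/* For example (reading off the code above), a four-element vec_construct is
   costed as 4 / 2 + 1 = 3 units, while every other kind of statement simply
   returns the per-CPU figure recorded in aarch64_tune_params.vec_costs.  */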
10019 /* Implement targetm.vectorize.add_stmt_cost. */
10020 static unsigned
10021 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10022 struct _stmt_vec_info *stmt_info, int misalign,
10023 enum vect_cost_model_location where)
10025 unsigned *cost = (unsigned *) data;
10026 unsigned retval = 0;
10028 if (flag_vect_cost_model)
10030 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10031 int stmt_cost =
10032 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10034 /* Statements in an inner loop relative to the loop being
10035 vectorized are weighted more heavily. The value here is
10036 arbitrary and could potentially be improved with analysis. */
10037 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10038 count *= 50; /* FIXME */
10040 retval = (unsigned) (count * stmt_cost);
10041 cost[where] += retval;
10044 return retval;
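/* As a worked example of the accumulation above: a vector statement with
   COUNT == 2 whose aarch64_builtin_vectorization_cost is 1 contributes 2 to
   the vect_body bucket, or 100 if it sits in an inner loop relative to the
   loop being vectorized (the provisional 50x weighting).  */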
10047 static void initialize_aarch64_code_model (struct gcc_options *);
10049 /* Parse the TO_PARSE string and put the architecture struct that it
10050 selects into RES and the architectural features into ISA_FLAGS.
10051 Return an aarch64_parse_opt_result describing the parse result.
10052 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10054 static enum aarch64_parse_opt_result
10055 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10056 unsigned long *isa_flags)
10058 char *ext;
10059 const struct processor *arch;
10060 char *str = (char *) alloca (strlen (to_parse) + 1);
10061 size_t len;
10063 strcpy (str, to_parse);
10065 ext = strchr (str, '+');
10067 if (ext != NULL)
10068 len = ext - str;
10069 else
10070 len = strlen (str);
10072 if (len == 0)
10073 return AARCH64_PARSE_MISSING_ARG;
10076 /* Loop through the list of supported ARCHes to find a match. */
10077 for (arch = all_architectures; arch->name != NULL; arch++)
10079 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10081 unsigned long isa_temp = arch->flags;
10083 if (ext != NULL)
10085 /* TO_PARSE string contains at least one extension. */
10086 enum aarch64_parse_opt_result ext_res
10087 = aarch64_parse_extension (ext, &isa_temp);
10089 if (ext_res != AARCH64_PARSE_OK)
10090 return ext_res;
10092 /* Extension parsing was successful. Confirm the result
10093 arch and ISA flags. */
10094 *res = arch;
10095 *isa_flags = isa_temp;
10096 return AARCH64_PARSE_OK;
10100 /* ARCH name not found in list. */
10101 return AARCH64_PARSE_INVALID_ARG;
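/* For example, "-march=armv8-a+crc" (an illustrative architecture and
   extension; the authoritative lists are all_architectures and the table
   consulted by aarch64_parse_extension): the text before the first '+'
   selects the "armv8-a" entry, and the remaining "+crc" string is handed to
   aarch64_parse_extension to adjust the ISA flag set.  */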
10104 /* Parse the TO_PARSE string and put the CPU it selects into RES and the
10105 architectural feature flags into ISA_FLAGS. Return an aarch64_parse_opt_result
10106 describing the parse result. If there is an error parsing, RES and
10107 ISA_FLAGS are left unchanged. */
10109 static enum aarch64_parse_opt_result
10110 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10111 unsigned long *isa_flags)
10113 char *ext;
10114 const struct processor *cpu;
10115 char *str = (char *) alloca (strlen (to_parse) + 1);
10116 size_t len;
10118 strcpy (str, to_parse);
10120 ext = strchr (str, '+');
10122 if (ext != NULL)
10123 len = ext - str;
10124 else
10125 len = strlen (str);
10127 if (len == 0)
10128 return AARCH64_PARSE_MISSING_ARG;
10131 /* Loop through the list of supported CPUs to find a match. */
10132 for (cpu = all_cores; cpu->name != NULL; cpu++)
10134 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10136 unsigned long isa_temp = cpu->flags;
10139 if (ext != NULL)
10141 /* TO_PARSE string contains at least one extension. */
10142 enum aarch64_parse_opt_result ext_res
10143 = aarch64_parse_extension (ext, &isa_temp);
10145 if (ext_res != AARCH64_PARSE_OK)
10146 return ext_res;
10148 /* Extension parsing was successful. Confirm the result
10149 cpu and ISA flags. */
10150 *res = cpu;
10151 *isa_flags = isa_temp;
10152 return AARCH64_PARSE_OK;
10156 /* CPU name not found in list. */
10157 return AARCH64_PARSE_INVALID_ARG;
10160 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10161 Return an aarch64_parse_opt_result describing the parse result.
10162 If parsing fails, RES is left unchanged. */
10164 static enum aarch64_parse_opt_result
10165 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10167 const struct processor *cpu;
10168 char *str = (char *) alloca (strlen (to_parse) + 1);
10170 strcpy (str, to_parse);
10172 /* Loop through the list of supported CPUs to find a match. */
10173 for (cpu = all_cores; cpu->name != NULL; cpu++)
10175 if (strcmp (cpu->name, str) == 0)
10177 *res = cpu;
10178 return AARCH64_PARSE_OK;
10182 /* CPU name not found in list. */
10183 return AARCH64_PARSE_INVALID_ARG;
10186 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10187 described in FLAG. If it is, return the index bit for that fusion type.
10188 If not, error (printing OPTION_NAME) and return zero. */
10190 static unsigned int
10191 aarch64_parse_one_option_token (const char *token,
10192 size_t length,
10193 const struct aarch64_flag_desc *flag,
10194 const char *option_name)
10196 for (; flag->name != NULL; flag++)
10198 if (length == strlen (flag->name)
10199 && !strncmp (flag->name, token, length))
10200 return flag->flag;
10203 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10204 return 0;
10207 /* Parse OPTION, which is a '.'-separated list of flags to enable.
10208 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10209 default state we inherit from the CPU tuning structures. OPTION_NAME
10210 gives the top-level option we are parsing in the -moverride string,
10211 for use in error messages. */
10213 static unsigned int
10214 aarch64_parse_boolean_options (const char *option,
10215 const struct aarch64_flag_desc *flags,
10216 unsigned int initial_state,
10217 const char *option_name)
10219 const char separator = '.';
10220 const char* specs = option;
10221 const char* ntoken = option;
10222 unsigned int found_flags = initial_state;
10224 while ((ntoken = strchr (specs, separator)))
10226 size_t token_length = ntoken - specs;
10227 unsigned token_ops = aarch64_parse_one_option_token (specs,
10228 token_length,
10229 flags,
10230 option_name);
10231 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10232 in the token stream, reset the supported operations. So:
10234 adrp+add.cmp+branch.none.adrp+add
10236 would result in turning on only adrp+add fusion. */
10237 if (!token_ops)
10238 found_flags = 0;
10240 found_flags |= token_ops;
10241 specs = ++ntoken;
10244 /* The string ended with a trailing separator; report it as ill-formed. */
10245 if (!(*specs))
10247 error ("%s string ill-formed\n", option_name);
10248 return 0;
10251 /* We still have one more token to parse. */
10252 size_t token_length = strlen (specs);
10253 unsigned token_ops = aarch64_parse_one_option_token (specs,
10254 token_length,
10255 flags,
10256 option_name);
10257 if (!token_ops)
10258 found_flags = 0;
10260 found_flags |= token_ops;
10261 return found_flags;
10264 /* Support for overriding instruction fusion. */
10266 static void
10267 aarch64_parse_fuse_string (const char *fuse_string,
10268 struct tune_params *tune)
10270 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10271 aarch64_fusible_pairs,
10272 tune->fusible_ops,
10273 "fuse=");
10276 /* Support for overriding other tuning flags. */
10278 static void
10279 aarch64_parse_tune_string (const char *tune_string,
10280 struct tune_params *tune)
10282 tune->extra_tuning_flags
10283 = aarch64_parse_boolean_options (tune_string,
10284 aarch64_tuning_flags,
10285 tune->extra_tuning_flags,
10286 "tune=");
10289 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10290 we understand. If it is, extract the option string and hand it off to
10291 the appropriate function. */
10293 void
10294 aarch64_parse_one_override_token (const char* token,
10295 size_t length,
10296 struct tune_params *tune)
10298 const struct aarch64_tuning_override_function *fn
10299 = aarch64_tuning_override_functions;
10301 const char *option_part = strchr (token, '=');
10302 if (!option_part)
10304 error ("tuning string missing in option (%s)", token);
10305 return;
10308 /* Get the length of the option name. */
10309 length = option_part - token;
10310 /* Skip the '=' to get to the option string. */
10311 option_part++;
10313 for (; fn->name != NULL; fn++)
10315 if (!strncmp (fn->name, token, length))
10317 fn->parse_override (option_part, tune);
10318 return;
10322 error ("unknown tuning option (%s)", token);
10323 return;
10326 /* Validate and clamp the TLS size according to the selected code model. */
10328 static void
10329 initialize_aarch64_tls_size (struct gcc_options *opts)
10331 if (aarch64_tls_size == 0)
10332 aarch64_tls_size = 24;
10334 switch (opts->x_aarch64_cmodel_var)
10336 case AARCH64_CMODEL_TINY:
10337 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10338 needs two instructions to address, so we clamp the size to 24 bits. */
10339 if (aarch64_tls_size > 24)
10340 aarch64_tls_size = 24;
10341 break;
10342 case AARCH64_CMODEL_SMALL:
10343 /* The maximum TLS size allowed under small is 4G. */
10344 if (aarch64_tls_size > 32)
10345 aarch64_tls_size = 32;
10346 break;
10347 case AARCH64_CMODEL_LARGE:
10348 /* The maximum TLS size allowed under large is 16E.
10349 FIXME: 16E implies a 64-bit offset, but we only support 48-bit offsets for now. */
10350 if (aarch64_tls_size > 48)
10351 aarch64_tls_size = 48;
10352 break;
10353 default:
10354 gcc_unreachable ();
10357 return;
10360 /* Parse STRING looking for options in the format:
10361 string :: option:string
10362 option :: name=substring
10363 name :: {a-z}
10364 substring :: defined by option. */
10366 static void
10367 aarch64_parse_override_string (const char* input_string,
10368 struct tune_params* tune)
10370 const char separator = ':';
10371 size_t string_length = strlen (input_string) + 1;
10372 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10373 char *string = string_root;
10374 strncpy (string, input_string, string_length);
10375 string[string_length - 1] = '\0';
10377 char* ntoken = string;
10379 while ((ntoken = strchr (string, separator)))
10381 size_t token_length = ntoken - string;
10382 /* Make this substring look like a string. */
10383 *ntoken = '\0';
10384 aarch64_parse_one_override_token (string, token_length, tune);
10385 string = ++ntoken;
10388 /* One last option to parse. */
10389 aarch64_parse_one_override_token (string, strlen (string), tune);
10390 free (string_root);
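/* As an illustration (using only the sub-option names visible in this file),
   a string such as

       -moverride=fuse=adrp+add.cmp+branch

   is split on ':' into one token, "fuse=adrp+add.cmp+branch";
   aarch64_parse_one_override_token recognizes the "fuse" option and passes
   "adrp+add.cmp+branch" to aarch64_parse_fuse_string, which ORs the two
   fusion bits into the copied tuning structure.  A second sub-option would
   simply be appended after a ':'.  */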
10394 static void
10395 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10397 /* PR 70044: We have to be careful about being called multiple times for the
10398 same function. This means all changes should be repeatable. */
10400 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10401 Disable the frame pointer flag so the mid-end will not use a frame
10402 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10403 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10404 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10405 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10406 if (opts->x_flag_omit_frame_pointer == 0)
10407 opts->x_flag_omit_frame_pointer = 2;
10409 /* If not optimizing for size, set the default
10410 alignment to what the target wants. */
10411 if (!opts->x_optimize_size)
10413 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10414 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10415 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10416 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10417 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10418 opts->x_str_align_functions = aarch64_tune_params.function_align;
10421 /* We default to no pc-relative literal loads. */
10423 aarch64_pcrelative_literal_loads = false;
10425 /* If -mpc-relative-literal-loads is set on the command line, this
10426 implies that the user asked for PC relative literal loads. */
10427 if (opts->x_pcrelative_literal_loads == 1)
10428 aarch64_pcrelative_literal_loads = true;
10430 /* In the tiny memory model it makes no sense to disallow PC relative
10431 literal pool loads. */
10432 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10433 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10434 aarch64_pcrelative_literal_loads = true;
10436 /* When enabling the lower precision Newton series for the square root, also
10437 enable it for the reciprocal square root, since the latter is an
10438 intermediary step for the former. */
10439 if (flag_mlow_precision_sqrt)
10440 flag_mrecip_low_precision_sqrt = true;
10443 /* 'Unpack' the internal tuning structs and update the options
10444 in OPTS. The caller must have set up selected_tune and selected_arch
10445 as all the other target-specific codegen decisions are
10446 derived from them. */
10448 void
10449 aarch64_override_options_internal (struct gcc_options *opts)
10451 aarch64_tune_flags = selected_tune->flags;
10452 aarch64_tune = selected_tune->sched_core;
10453 /* Make a copy of the tuning parameters attached to the core, which
10454 we may later overwrite. */
10455 aarch64_tune_params = *(selected_tune->tune);
10456 aarch64_architecture_version = selected_arch->architecture_version;
10458 if (opts->x_aarch64_override_tune_string)
10459 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10460 &aarch64_tune_params);
10462 /* This target defaults to strict volatile bitfields. */
10463 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10464 opts->x_flag_strict_volatile_bitfields = 1;
10466 initialize_aarch64_code_model (opts);
10467 initialize_aarch64_tls_size (opts);
10469 int queue_depth = 0;
10470 switch (aarch64_tune_params.autoprefetcher_model)
10472 case tune_params::AUTOPREFETCHER_OFF:
10473 queue_depth = -1;
10474 break;
10475 case tune_params::AUTOPREFETCHER_WEAK:
10476 queue_depth = 0;
10477 break;
10478 case tune_params::AUTOPREFETCHER_STRONG:
10479 queue_depth = max_insn_queue_index + 1;
10480 break;
10481 default:
10482 gcc_unreachable ();
10485 /* We don't mind passing in global_options_set here as we don't use
10486 the *options_set structs anyway. */
10487 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10488 queue_depth,
10489 opts->x_param_values,
10490 global_options_set.x_param_values);
10492 /* Set up parameters to be used in prefetching algorithm. Do not
10493 override the defaults unless we are tuning for a core we have
10494 researched values for. */
10495 if (aarch64_tune_params.prefetch->num_slots > 0)
10496 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10497 aarch64_tune_params.prefetch->num_slots,
10498 opts->x_param_values,
10499 global_options_set.x_param_values);
10500 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10501 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10502 aarch64_tune_params.prefetch->l1_cache_size,
10503 opts->x_param_values,
10504 global_options_set.x_param_values);
10505 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10506 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10507 aarch64_tune_params.prefetch->l1_cache_line_size,
10508 opts->x_param_values,
10509 global_options_set.x_param_values);
10510 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10511 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10512 aarch64_tune_params.prefetch->l2_cache_size,
10513 opts->x_param_values,
10514 global_options_set.x_param_values);
10515 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10516 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10518 opts->x_param_values,
10519 global_options_set.x_param_values);
10520 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10521 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10522 aarch64_tune_params.prefetch->minimum_stride,
10523 opts->x_param_values,
10524 global_options_set.x_param_values);
10526 /* Use the alternative scheduling-pressure algorithm by default. */
10527 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10528 opts->x_param_values,
10529 global_options_set.x_param_values);
10531 /* Enable software prefetching at the specified optimization level for
10532 CPUs that have prefetch. Lower optimization level threshold by 1
10533 when profiling is enabled. */
10534 if (opts->x_flag_prefetch_loop_arrays < 0
10535 && !opts->x_optimize_size
10536 && aarch64_tune_params.prefetch->default_opt_level >= 0
10537 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10538 opts->x_flag_prefetch_loop_arrays = 1;
10540 aarch64_override_options_after_change_1 (opts);
10543 /* Print a hint with a suggestion for a core or architecture name that
10544 most closely resembles what the user passed in STR. ARCH is true if
10545 the user is asking for an architecture name. ARCH is false if the user
10546 is asking for a core name. */
10548 static void
10549 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10551 auto_vec<const char *> candidates;
10552 const struct processor *entry = arch ? all_architectures : all_cores;
10553 for (; entry->name != NULL; entry++)
10554 candidates.safe_push (entry->name);
10556 #ifdef HAVE_LOCAL_CPU_DETECT
10557 /* Also add "native" as a possible value. */
10558 if (arch)
10559 candidates.safe_push ("native");
10560 #endif
10562 char *s;
10563 const char *hint = candidates_list_and_hint (str, s, candidates);
10564 if (hint)
10565 inform (input_location, "valid arguments are: %s;"
10566 " did you mean %qs?", s, hint);
10567 else
10568 inform (input_location, "valid arguments are: %s", s);
10570 XDELETEVEC (s);
10573 /* Print a hint with a suggestion for a core name that most closely resembles
10574 what the user passed in STR. */
10576 inline static void
10577 aarch64_print_hint_for_core (const char *str)
10579 aarch64_print_hint_for_core_or_arch (str, false);
10582 /* Print a hint with a suggestion for an architecture name that most closely
10583 resembles what the user passed in STR. */
10585 inline static void
10586 aarch64_print_hint_for_arch (const char *str)
10588 aarch64_print_hint_for_core_or_arch (str, true);
10591 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10592 specified in STR and throw errors if appropriate. Put the results if
10593 they are valid in RES and ISA_FLAGS. Return whether the option is
10594 valid. */
10596 static bool
10597 aarch64_validate_mcpu (const char *str, const struct processor **res,
10598 unsigned long *isa_flags)
10600 enum aarch64_parse_opt_result parse_res
10601 = aarch64_parse_cpu (str, res, isa_flags);
10603 if (parse_res == AARCH64_PARSE_OK)
10604 return true;
10606 switch (parse_res)
10608 case AARCH64_PARSE_MISSING_ARG:
10609 error ("missing cpu name in %<-mcpu=%s%>", str);
10610 break;
10611 case AARCH64_PARSE_INVALID_ARG:
10612 error ("unknown value %qs for -mcpu", str);
10613 aarch64_print_hint_for_core (str);
10614 break;
10615 case AARCH64_PARSE_INVALID_FEATURE:
10616 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10617 break;
10618 default:
10619 gcc_unreachable ();
10622 return false;
10625 /* Validate a command-line -march option. Parse the arch and extensions
10626 (if any) specified in STR and throw errors if appropriate. Put the
10627 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10628 option is valid. */
10630 static bool
10631 aarch64_validate_march (const char *str, const struct processor **res,
10632 unsigned long *isa_flags)
10634 enum aarch64_parse_opt_result parse_res
10635 = aarch64_parse_arch (str, res, isa_flags);
10637 if (parse_res == AARCH64_PARSE_OK)
10638 return true;
10640 switch (parse_res)
10642 case AARCH64_PARSE_MISSING_ARG:
10643 error ("missing arch name in %<-march=%s%>", str);
10644 break;
10645 case AARCH64_PARSE_INVALID_ARG:
10646 error ("unknown value %qs for -march", str);
10647 aarch64_print_hint_for_arch (str);
10648 break;
10649 case AARCH64_PARSE_INVALID_FEATURE:
10650 error ("invalid feature modifier in %<-march=%s%>", str);
10651 break;
10652 default:
10653 gcc_unreachable ();
10656 return false;
10659 /* Validate a command-line -mtune option. Parse the cpu
10660 specified in STR and throw errors if appropriate. Put the
10661 result, if it is valid, in RES. Return whether the option is
10662 valid. */
10664 static bool
10665 aarch64_validate_mtune (const char *str, const struct processor **res)
10667 enum aarch64_parse_opt_result parse_res
10668 = aarch64_parse_tune (str, res);
10670 if (parse_res == AARCH64_PARSE_OK)
10671 return true;
10673 switch (parse_res)
10675 case AARCH64_PARSE_MISSING_ARG:
10676 error ("missing cpu name in %<-mtune=%s%>", str);
10677 break;
10678 case AARCH64_PARSE_INVALID_ARG:
10679 error ("unknown value %qs for -mtune", str);
10680 aarch64_print_hint_for_core (str);
10681 break;
10682 default:
10683 gcc_unreachable ();
10685 return false;
10688 /* Return the CPU corresponding to the enum CPU.
10689 If it doesn't specify a cpu, return the default. */
10691 static const struct processor *
10692 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10694 if (cpu != aarch64_none)
10695 return &all_cores[cpu];
10697 /* The & 0x3f is to extract the bottom 6 bits that encode the
10698 default cpu as selected by the --with-cpu GCC configure option
10699 in config.gcc.
10700 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10701 flags mechanism should be reworked to make it more sane. */
10702 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10705 /* Return the architecture corresponding to the enum ARCH.
10706 If it doesn't specify a valid architecture, return the default. */
10708 static const struct processor *
10709 aarch64_get_arch (enum aarch64_arch arch)
10711 if (arch != aarch64_no_arch)
10712 return &all_architectures[arch];
10714 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10716 return &all_architectures[cpu->arch];
10719 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10721 static poly_uint16
10722 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10724 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10725 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10726 deciding which .md file patterns to use and when deciding whether
10727 something is a legitimate address or constant. */
10728 if (value == SVE_SCALABLE || value == SVE_128)
10729 return poly_uint16 (2, 2);
10730 else
10731 return (int) value / 64;
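/* Worked examples of the mapping above: -msve-vector-bits=256 yields the
   constant VG 256 / 64 = 4 (four 64-bit granules per vector), while both
   -msve-vector-bits=scalable and -msve-vector-bits=128 yield the
   indeterminate poly_uint16 (2, 2), i.e. 2 + 2 * x granules for some
   unknown x >= 0.  */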
10734 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10735 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
10736 tuning structs. In particular it must set selected_tune and
10737 aarch64_isa_flags that define the available ISA features and tuning
10738 decisions. It must also set selected_arch as this will be used to
10739 output the .arch asm tags for each function. */
10741 static void
10742 aarch64_override_options (void)
10744 unsigned long cpu_isa = 0;
10745 unsigned long arch_isa = 0;
10746 aarch64_isa_flags = 0;
10748 bool valid_cpu = true;
10749 bool valid_tune = true;
10750 bool valid_arch = true;
10752 selected_cpu = NULL;
10753 selected_arch = NULL;
10754 selected_tune = NULL;
10756 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10757 If either of -march or -mtune is given, they override their
10758 respective component of -mcpu. */
10759 if (aarch64_cpu_string)
10760 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10761 &cpu_isa);
10763 if (aarch64_arch_string)
10764 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10765 &arch_isa);
10767 if (aarch64_tune_string)
10768 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10770 /* If the user did not specify a processor, choose the default
10771 one for them. This will be the CPU set during configuration using
10772 --with-cpu, otherwise it is "generic". */
10773 if (!selected_cpu)
10775 if (selected_arch)
10777 selected_cpu = &all_cores[selected_arch->ident];
10778 aarch64_isa_flags = arch_isa;
10779 explicit_arch = selected_arch->arch;
10781 else
10783 /* Get default configure-time CPU. */
10784 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10785 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10788 if (selected_tune)
10789 explicit_tune_core = selected_tune->ident;
10791 /* If both -mcpu and -march are specified check that they are architecturally
10792 compatible, warn if they're not and prefer the -march ISA flags. */
10793 else if (selected_arch)
10795 if (selected_arch->arch != selected_cpu->arch)
10797 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10798 all_architectures[selected_cpu->arch].name,
10799 selected_arch->name);
10801 aarch64_isa_flags = arch_isa;
10802 explicit_arch = selected_arch->arch;
10803 explicit_tune_core = selected_tune ? selected_tune->ident
10804 : selected_cpu->ident;
10806 else
10808 /* -mcpu but no -march. */
10809 aarch64_isa_flags = cpu_isa;
10810 explicit_tune_core = selected_tune ? selected_tune->ident
10811 : selected_cpu->ident;
10812 gcc_assert (selected_cpu);
10813 selected_arch = &all_architectures[selected_cpu->arch];
10814 explicit_arch = selected_arch->arch;
10817 /* Set the arch as well, since we will need it when outputting
10818 the .arch directive in assembly. */
10819 if (!selected_arch)
10821 gcc_assert (selected_cpu);
10822 selected_arch = &all_architectures[selected_cpu->arch];
10825 if (!selected_tune)
10826 selected_tune = selected_cpu;
10828 #ifndef HAVE_AS_MABI_OPTION
10829 /* The compiler may have been configured with 2.23.* binutils, which does
10830 not have support for ILP32. */
10831 if (TARGET_ILP32)
10832 error ("assembler does not support -mabi=ilp32");
10833 #endif
10835 /* Convert -msve-vector-bits to a VG count. */
10836 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10838 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10839 sorry ("return address signing is only supported for -mabi=lp64");
10841 /* Make sure we properly set up the explicit options. */
10842 if ((aarch64_cpu_string && valid_cpu)
10843 || (aarch64_tune_string && valid_tune))
10844 gcc_assert (explicit_tune_core != aarch64_none);
10846 if ((aarch64_cpu_string && valid_cpu)
10847 || (aarch64_arch_string && valid_arch))
10848 gcc_assert (explicit_arch != aarch64_no_arch);
10850 aarch64_override_options_internal (&global_options);
10852 /* Save these options as the default ones in case we push and pop them later
10853 while processing functions with potential target attributes. */
10854 target_option_default_node = target_option_current_node
10855 = build_target_option_node (&global_options);
10858 /* Implement targetm.override_options_after_change. */
10860 static void
10861 aarch64_override_options_after_change (void)
10863 aarch64_override_options_after_change_1 (&global_options);
10866 static struct machine_function *
10867 aarch64_init_machine_status (void)
10869 struct machine_function *machine;
10870 machine = ggc_cleared_alloc<machine_function> ();
10871 return machine;
10874 void
10875 aarch64_init_expanders (void)
10877 init_machine_status = aarch64_init_machine_status;
10880 /* A checking mechanism for the implementation of the various code models. */
10881 static void
10882 initialize_aarch64_code_model (struct gcc_options *opts)
10884 if (opts->x_flag_pic)
10886 switch (opts->x_aarch64_cmodel_var)
10888 case AARCH64_CMODEL_TINY:
10889 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10890 break;
10891 case AARCH64_CMODEL_SMALL:
10892 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10893 aarch64_cmodel = (flag_pic == 2
10894 ? AARCH64_CMODEL_SMALL_PIC
10895 : AARCH64_CMODEL_SMALL_SPIC);
10896 #else
10897 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10898 #endif
10899 break;
10900 case AARCH64_CMODEL_LARGE:
10901 sorry ("code model %qs with -f%s", "large",
10902 opts->x_flag_pic > 1 ? "PIC" : "pic");
10903 break;
10904 default:
10905 gcc_unreachable ();
10908 else
10909 aarch64_cmodel = opts->x_aarch64_cmodel_var;
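/* So, for example: -mcmodel=small -fPIC (flag_pic == 2) selects
   AARCH64_CMODEL_SMALL_PIC; -mcmodel=small -fpic selects
   AARCH64_CMODEL_SMALL_SPIC when the assembler supports the small-PIC
   relocations, and AARCH64_CMODEL_SMALL_PIC otherwise; -mcmodel=tiny with
   either flag becomes AARCH64_CMODEL_TINY_PIC; and -mcmodel=large is
   rejected in combination with -fpic/-fPIC.  */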
10912 /* Implement TARGET_OPTION_SAVE. */
10914 static void
10915 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10917 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10920 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10921 using the information saved in PTR. */
10923 static void
10924 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
10926 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
10927 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10928 opts->x_explicit_arch = ptr->x_explicit_arch;
10929 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
10930 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
10932 aarch64_override_options_internal (opts);
10935 /* Implement TARGET_OPTION_PRINT. */
10937 static void
10938 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
10940 const struct processor *cpu
10941 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10942 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
10943 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
10944 std::string extension
10945 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
10947 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
10948 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
10949 arch->name, extension.c_str ());
10952 static GTY(()) tree aarch64_previous_fndecl;
10954 void
10955 aarch64_reset_previous_fndecl (void)
10957 aarch64_previous_fndecl = NULL;
10960 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
10961 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
10962 make sure optab availability predicates are recomputed when necessary. */
10964 void
10965 aarch64_save_restore_target_globals (tree new_tree)
10967 if (TREE_TARGET_GLOBALS (new_tree))
10968 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
10969 else if (new_tree == target_option_default_node)
10970 restore_target_globals (&default_target_globals);
10971 else
10972 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
10975 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
10976 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
10977 of the function, if such exists. This function may be called multiple
10978 times on a single function so use aarch64_previous_fndecl to avoid
10979 setting up identical state. */
10981 static void
10982 aarch64_set_current_function (tree fndecl)
10984 if (!fndecl || fndecl == aarch64_previous_fndecl)
10985 return;
10987 tree old_tree = (aarch64_previous_fndecl
10988 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
10989 : NULL_TREE);
10991 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10993 /* If current function has no attributes but the previous one did,
10994 use the default node. */
10995 if (!new_tree && old_tree)
10996 new_tree = target_option_default_node;
10998 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
10999 the default have been handled by aarch64_save_restore_target_globals from
11000 aarch64_pragma_target_parse. */
11001 if (old_tree == new_tree)
11002 return;
11004 aarch64_previous_fndecl = fndecl;
11006 /* First set the target options. */
11007 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11009 aarch64_save_restore_target_globals (new_tree);
11012 /* Enum describing the various ways we can handle attributes.
11013 In many cases we can reuse the generic option handling machinery. */
11015 enum aarch64_attr_opt_type
11017 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11018 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11019 aarch64_attr_enum, /* Attribute sets an enum variable. */
11020 aarch64_attr_custom /* Attribute requires a custom handling function. */
11023 /* All the information needed to handle a target attribute.
11024 NAME is the name of the attribute.
11025 ATTR_TYPE specifies the type of behavior of the attribute as described
11026 in the definition of enum aarch64_attr_opt_type.
11027 ALLOW_NEG is true if the attribute supports a "no-" form.
11028 HANDLER is the function that takes the attribute string as an argument.
11029 It is needed only when ATTR_TYPE is aarch64_attr_custom.
11030 OPT_NUM is the enum specifying the option that the attribute modifies.
11031 This is needed for attributes that mirror the behavior of a command-line
11032 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool
11033 or aarch64_attr_enum. */
11035 struct aarch64_attribute_info
11037 const char *name;
11038 enum aarch64_attr_opt_type attr_type;
11039 bool allow_neg;
11040 bool (*handler) (const char *);
11041 enum opt_code opt_num;
11044 /* Handle the ARCH_STR argument to the arch= target attribute. */
11046 static bool
11047 aarch64_handle_attr_arch (const char *str)
11049 const struct processor *tmp_arch = NULL;
11050 enum aarch64_parse_opt_result parse_res
11051 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11053 if (parse_res == AARCH64_PARSE_OK)
11055 gcc_assert (tmp_arch);
11056 selected_arch = tmp_arch;
11057 explicit_arch = selected_arch->arch;
11058 return true;
11061 switch (parse_res)
11063 case AARCH64_PARSE_MISSING_ARG:
11064 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11065 break;
11066 case AARCH64_PARSE_INVALID_ARG:
11067 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11068 aarch64_print_hint_for_arch (str);
11069 break;
11070 case AARCH64_PARSE_INVALID_FEATURE:
11071 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11072 break;
11073 default:
11074 gcc_unreachable ();
11077 return false;
11080 /* Handle the argument CPU_STR to the cpu= target attribute. */
11082 static bool
11083 aarch64_handle_attr_cpu (const char *str)
11085 const struct processor *tmp_cpu = NULL;
11086 enum aarch64_parse_opt_result parse_res
11087 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11089 if (parse_res == AARCH64_PARSE_OK)
11091 gcc_assert (tmp_cpu);
11092 selected_tune = tmp_cpu;
11093 explicit_tune_core = selected_tune->ident;
11095 selected_arch = &all_architectures[tmp_cpu->arch];
11096 explicit_arch = selected_arch->arch;
11097 return true;
11100 switch (parse_res)
11102 case AARCH64_PARSE_MISSING_ARG:
11103 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11104 break;
11105 case AARCH64_PARSE_INVALID_ARG:
11106 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11107 aarch64_print_hint_for_core (str);
11108 break;
11109 case AARCH64_PARSE_INVALID_FEATURE:
11110 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11111 break;
11112 default:
11113 gcc_unreachable ();
11116 return false;
11119 /* Handle the argument STR to the tune= target attribute. */
11121 static bool
11122 aarch64_handle_attr_tune (const char *str)
11124 const struct processor *tmp_tune = NULL;
11125 enum aarch64_parse_opt_result parse_res
11126 = aarch64_parse_tune (str, &tmp_tune);
11128 if (parse_res == AARCH64_PARSE_OK)
11130 gcc_assert (tmp_tune);
11131 selected_tune = tmp_tune;
11132 explicit_tune_core = selected_tune->ident;
11133 return true;
11136 switch (parse_res)
11138 case AARCH64_PARSE_INVALID_ARG:
11139 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11140 aarch64_print_hint_for_core (str);
11141 break;
11142 default:
11143 gcc_unreachable ();
11146 return false;
11149 /* Parse an architecture extensions target attribute string specified in STR.
11150 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11151 if successful. Update aarch64_isa_flags to reflect the ISA features
11152 modified. */
11154 static bool
11155 aarch64_handle_attr_isa_flags (char *str)
11157 enum aarch64_parse_opt_result parse_res;
11158 unsigned long isa_flags = aarch64_isa_flags;
11160 /* We allow "+nothing" in the beginning to clear out all architectural
11161 features if the user wants to handpick specific features. */
11162 if (strncmp ("+nothing", str, 8) == 0)
11164 isa_flags = 0;
11165 str += 8;
11168 parse_res = aarch64_parse_extension (str, &isa_flags);
11170 if (parse_res == AARCH64_PARSE_OK)
11172 aarch64_isa_flags = isa_flags;
11173 return true;
11176 switch (parse_res)
11178 case AARCH64_PARSE_MISSING_ARG:
11179 error ("missing value in %<target()%> pragma or attribute");
11180 break;
11182 case AARCH64_PARSE_INVALID_FEATURE:
11183 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11184 break;
11186 default:
11187 gcc_unreachable ();
11190 return false;
11193 /* The target attributes that we support. On top of these we also support just
11194 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11195 handled explicitly in aarch64_process_one_target_attr. */
11197 static const struct aarch64_attribute_info aarch64_attributes[] =
11199 { "general-regs-only", aarch64_attr_mask, false, NULL,
11200 OPT_mgeneral_regs_only },
11201 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11202 OPT_mfix_cortex_a53_835769 },
11203 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11204 OPT_mfix_cortex_a53_843419 },
11205 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11206 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11207 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11208 OPT_momit_leaf_frame_pointer },
11209 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11210 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11211 OPT_march_ },
11212 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11213 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11214 OPT_mtune_ },
11215 { "sign-return-address", aarch64_attr_enum, false, NULL,
11216 OPT_msign_return_address_ },
11217 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
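/* For illustration (attribute names from the table above; the arch and
   extension strings are the same illustrative ones used earlier), these all
   go through aarch64_process_one_target_attr below:

     __attribute__ ((target ("arch=armv8-a+crc"))) void f1 (void);
     __attribute__ ((target ("cmodel=small,no-fix-cortex-a53-835769")))
     void f2 (void);
     __attribute__ ((target ("+nothing+fp"))) void f3 (void);

   The first goes through the custom arch= handler, the third through the
   bare ISA-flags path, and the second shows an enum attribute combined with
   a negated boolean one.  */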
11220 /* Parse ARG_STR which contains the definition of one target attribute.
11221 Show appropriate errors if any or return true if the attribute is valid. */
11223 static bool
11224 aarch64_process_one_target_attr (char *arg_str)
11226 bool invert = false;
11228 size_t len = strlen (arg_str);
11230 if (len == 0)
11232 error ("malformed %<target()%> pragma or attribute");
11233 return false;
11236 char *str_to_check = (char *) alloca (len + 1);
11237 strcpy (str_to_check, arg_str);
11239 /* Skip leading whitespace. */
11240 while (*str_to_check == ' ' || *str_to_check == '\t')
11241 str_to_check++;
11243 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11244 It is easier to detect and handle it explicitly here rather than going
11245 through the machinery for the rest of the target attributes in this
11246 function. */
11247 if (*str_to_check == '+')
11248 return aarch64_handle_attr_isa_flags (str_to_check);
11250 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11252 invert = true;
11253 str_to_check += 3;
11255 char *arg = strchr (str_to_check, '=');
11257 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11258 and point ARG to "foo". */
11259 if (arg)
11261 *arg = '\0';
11262 arg++;
11264 const struct aarch64_attribute_info *p_attr;
11265 bool found = false;
11266 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11268 /* If the names don't match up, or the user has given an argument
11269 to an attribute that doesn't accept one, or didn't give an argument
11270 to an attribute that expects one, fail to match. */
11271 if (strcmp (str_to_check, p_attr->name) != 0)
11272 continue;
11274 found = true;
11275 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11276 || p_attr->attr_type == aarch64_attr_enum;
11278 if (attr_need_arg_p ^ (arg != NULL))
11280 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11281 return false;
11284 /* If the name matches but the attribute does not allow "no-" versions
11285 then we can't match. */
11286 if (invert && !p_attr->allow_neg)
11288 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11289 return false;
11292 switch (p_attr->attr_type)
11294 /* Has a custom handler registered.
11295 For example, cpu=, arch=, tune=. */
11296 case aarch64_attr_custom:
11297 gcc_assert (p_attr->handler);
11298 if (!p_attr->handler (arg))
11299 return false;
11300 break;
11302 /* Either set or unset a boolean option. */
11303 case aarch64_attr_bool:
11305 struct cl_decoded_option decoded;
11307 generate_option (p_attr->opt_num, NULL, !invert,
11308 CL_TARGET, &decoded);
11309 aarch64_handle_option (&global_options, &global_options_set,
11310 &decoded, input_location);
11311 break;
11313 /* Set or unset a bit in the target_flags. aarch64_handle_option
11314 should know what mask to apply given the option number. */
11315 case aarch64_attr_mask:
11317 struct cl_decoded_option decoded;
11318 /* We only need to specify the option number.
11319 aarch64_handle_option will know which mask to apply. */
11320 decoded.opt_index = p_attr->opt_num;
11321 decoded.value = !invert;
11322 aarch64_handle_option (&global_options, &global_options_set,
11323 &decoded, input_location);
11324 break;
11326 /* Use the option setting machinery to set an option to an enum. */
11327 case aarch64_attr_enum:
11329 gcc_assert (arg);
11330 bool valid;
11331 int value;
11332 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11333 &value, CL_TARGET);
11334 if (valid)
11336 set_option (&global_options, NULL, p_attr->opt_num, value,
11337 NULL, DK_UNSPECIFIED, input_location,
11338 global_dc);
11340 else
11342 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11344 break;
11346 default:
11347 gcc_unreachable ();
11351 /* If we reached here we either have found an attribute and validated
11352 it or didn't match any. If we matched an attribute but its arguments
11353 were malformed we will have returned false already. */
11354 return found;
11357 /* Count how many times the character C appears in
11358 NULL-terminated string STR. */
11360 static unsigned int
11361 num_occurences_in_str (char c, char *str)
11363 unsigned int res = 0;
11364 while (*str != '\0')
11366 if (*str == c)
11367 res++;
11369 str++;
11372 return res;
11375 /* Parse the tree in ARGS that contains the target attribute information
11376 and update the global target options space. */
11378 bool
11379 aarch64_process_target_attr (tree args)
11381 if (TREE_CODE (args) == TREE_LIST)
11385 tree head = TREE_VALUE (args);
11386 if (head)
11388 if (!aarch64_process_target_attr (head))
11389 return false;
11391 args = TREE_CHAIN (args);
11392 } while (args);
11394 return true;
11397 if (TREE_CODE (args) != STRING_CST)
11399 error ("attribute %<target%> argument not a string");
11400 return false;
11403 size_t len = strlen (TREE_STRING_POINTER (args));
11404 char *str_to_check = (char *) alloca (len + 1);
11405 strcpy (str_to_check, TREE_STRING_POINTER (args));
11407 if (len == 0)
11409 error ("malformed %<target()%> pragma or attribute");
11410 return false;
11413 /* Used to catch empty entries between commas, e.g.
11414 attribute ((target ("attr1,,attr2"))). */
11415 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11417 /* Handle multiple target attributes separated by ','. */
11418 char *token = strtok (str_to_check, ",");
11420 unsigned int num_attrs = 0;
11421 while (token)
11423 num_attrs++;
11424 if (!aarch64_process_one_target_attr (token))
11426 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11427 return false;
11430 token = strtok (NULL, ",");
11433 if (num_attrs != num_commas + 1)
11435 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11436 return false;
11439 return true;
11442 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11443 process attribute ((target ("..."))). */
11445 static bool
11446 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11448 struct cl_target_option cur_target;
11449 bool ret;
11450 tree old_optimize;
11451 tree new_target, new_optimize;
11452 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11454 /* If what we're processing is the current pragma string then the
11455 target option node is already stored in target_option_current_node
11456 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11457 having to re-parse the string. This is especially useful to keep
11458 arm_neon.h compile times down since that header contains a lot
11459 of intrinsics enclosed in pragmas. */
11460 if (!existing_target && args == current_target_pragma)
11462 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11463 return true;
11465 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11467 old_optimize = build_optimization_node (&global_options);
11468 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11470 /* If the function changed the optimization levels as well as setting
11471 target options, start with the optimizations specified. */
11472 if (func_optimize && func_optimize != old_optimize)
11473 cl_optimization_restore (&global_options,
11474 TREE_OPTIMIZATION (func_optimize));
11476 /* Save the current target options to restore at the end. */
11477 cl_target_option_save (&cur_target, &global_options);
11479 /* If fndecl already has some target attributes applied to it, unpack
11480 them so that we add this attribute on top of them, rather than
11481 overwriting them. */
11482 if (existing_target)
11484 struct cl_target_option *existing_options
11485 = TREE_TARGET_OPTION (existing_target);
11487 if (existing_options)
11488 cl_target_option_restore (&global_options, existing_options);
11490 else
11491 cl_target_option_restore (&global_options,
11492 TREE_TARGET_OPTION (target_option_current_node));
11494 ret = aarch64_process_target_attr (args);
11496 /* Set up any additional state. */
11497 if (ret)
11499 aarch64_override_options_internal (&global_options);
11500 /* Initialize SIMD builtins if we haven't already.
11501 Set current_target_pragma to NULL for the duration so that
11502 the builtin initialization code doesn't try to tag the functions
11503 being built with the attributes specified by any current pragma, thus
11504 going into an infinite recursion. */
11505 if (TARGET_SIMD)
11507 tree saved_current_target_pragma = current_target_pragma;
11508 current_target_pragma = NULL;
11509 aarch64_init_simd_builtins ();
11510 current_target_pragma = saved_current_target_pragma;
11512 new_target = build_target_option_node (&global_options);
11514 else
11515 new_target = NULL;
11517 new_optimize = build_optimization_node (&global_options);
11519 if (fndecl && ret)
11521 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11523 if (old_optimize != new_optimize)
11524 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11527 cl_target_option_restore (&global_options, &cur_target);
11529 if (old_optimize != new_optimize)
11530 cl_optimization_restore (&global_options,
11531 TREE_OPTIMIZATION (old_optimize));
11532 return ret;
11535 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11536 tri-bool options (yes, no, don't care) and the default value is
11537 DEF, determine whether to reject inlining. */
11539 static bool
11540 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11541 int dont_care, int def)
11543 /* If the callee doesn't care, always allow inlining. */
11544 if (callee == dont_care)
11545 return true;
11547 /* If the caller doesn't care, always allow inlining. */
11548 if (caller == dont_care)
11549 return true;
11551 /* Otherwise, allow inlining if either the callee and caller values
11552 agree, or if the callee is using the default value. */
11553 return (callee == caller || callee == def);
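/* For instance, with DONT_CARE == 2 and DEF == 1 (the values used for the
   -momit-leaf-frame-pointer check below), caller == 0 with callee == 1 still
   permits inlining because the callee merely uses the default, whereas
   caller == 1 with callee == 0 rejects it, since the callee explicitly asked
   for the non-default setting.  */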
11556 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11557 to inline CALLEE into CALLER based on target-specific info.
11558 Make sure that the caller and callee have compatible architectural
11559 features. Then go through the other possible target attributes
11560 and see if they can block inlining. Try not to reject always_inline
11561 callees unless they are incompatible architecturally. */
11563 static bool
11564 aarch64_can_inline_p (tree caller, tree callee)
11566 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11567 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11569 struct cl_target_option *caller_opts
11570 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11571 : target_option_default_node);
11573 struct cl_target_option *callee_opts
11574 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11575 : target_option_default_node);
11577 /* Callee's ISA flags should be a subset of the caller's. */
11578 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11579 != callee_opts->x_aarch64_isa_flags)
11580 return false;
11582 /* Allow non-strict-align functions to be inlined into strict-align
11583 ones, but not the reverse. */
11584 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11585 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11586 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11587 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11588 return false;
11590 bool always_inline = lookup_attribute ("always_inline",
11591 DECL_ATTRIBUTES (callee));
11593 /* If the architectural features match up and the callee is always_inline
11594 then the other attributes don't matter. */
11595 if (always_inline)
11596 return true;
11598 if (caller_opts->x_aarch64_cmodel_var
11599 != callee_opts->x_aarch64_cmodel_var)
11600 return false;
11602 if (caller_opts->x_aarch64_tls_dialect
11603 != callee_opts->x_aarch64_tls_dialect)
11604 return false;
11606 /* Honour explicit requests to workaround errata. */
11607 if (!aarch64_tribools_ok_for_inlining_p (
11608 caller_opts->x_aarch64_fix_a53_err835769,
11609 callee_opts->x_aarch64_fix_a53_err835769,
11610 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11611 return false;
11613 if (!aarch64_tribools_ok_for_inlining_p (
11614 caller_opts->x_aarch64_fix_a53_err843419,
11615 callee_opts->x_aarch64_fix_a53_err843419,
11616 2, TARGET_FIX_ERR_A53_843419))
11617 return false;
11619 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11620 caller and callee and they don't match up, reject inlining. */
11621 if (!aarch64_tribools_ok_for_inlining_p (
11622 caller_opts->x_flag_omit_leaf_frame_pointer,
11623 callee_opts->x_flag_omit_leaf_frame_pointer,
11624 2, 1))
11625 return false;
11627 /* If the callee has specific tuning overrides, respect them. */
11628 if (callee_opts->x_aarch64_override_tune_string != NULL
11629 && caller_opts->x_aarch64_override_tune_string == NULL)
11630 return false;
11632 /* If the user specified tuning override strings for the
11633 caller and callee and they don't match up, reject inlining.
11634 We just do a string compare here; we don't analyze the meaning
11635 of the string, as it would be too costly for little gain. */
11636 if (callee_opts->x_aarch64_override_tune_string
11637 && caller_opts->x_aarch64_override_tune_string
11638 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11639 caller_opts->x_aarch64_override_tune_string) != 0))
11640 return false;
11642 return true;
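/* Illustrative example only (not part of the build; function names and the
   chosen extension are hypothetical): with the hook above, the callee below
   cannot be inlined because its ISA flags are not a subset of the caller's:

     __attribute__ ((target ("+crypto")))
     static inline int callee (int x) { return x + 1; }

     __attribute__ ((target ("+nocrypto")))
     int caller (int x) { return callee (x); }

   When the callee is marked always_inline, only the ISA-flag and
   strict-alignment checks apply; the remaining attribute checks are
   skipped.  */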
11645 /* Return true if SYMBOL_REF X binds locally. */
11647 static bool
11648 aarch64_symbol_binds_local_p (const_rtx x)
11650 return (SYMBOL_REF_DECL (x)
11651 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11652 : SYMBOL_REF_LOCAL_P (x));
11655 /* Return true if SYMBOL_REF X is thread-local. */
11656 static bool
11657 aarch64_tls_symbol_p (rtx x)
11659 if (! TARGET_HAVE_TLS)
11660 return false;
11662 if (GET_CODE (x) != SYMBOL_REF)
11663 return false;
11665 return SYMBOL_REF_TLS_MODEL (x) != 0;
11668 /* Classify a TLS symbol into one of the TLS kinds. */
11669 enum aarch64_symbol_type
11670 aarch64_classify_tls_symbol (rtx x)
11672 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11674 switch (tls_kind)
11676 case TLS_MODEL_GLOBAL_DYNAMIC:
11677 case TLS_MODEL_LOCAL_DYNAMIC:
11678 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11680 case TLS_MODEL_INITIAL_EXEC:
11681 switch (aarch64_cmodel)
11683 case AARCH64_CMODEL_TINY:
11684 case AARCH64_CMODEL_TINY_PIC:
11685 return SYMBOL_TINY_TLSIE;
11686 default:
11687 return SYMBOL_SMALL_TLSIE;
11690 case TLS_MODEL_LOCAL_EXEC:
11691 if (aarch64_tls_size == 12)
11692 return SYMBOL_TLSLE12;
11693 else if (aarch64_tls_size == 24)
11694 return SYMBOL_TLSLE24;
11695 else if (aarch64_tls_size == 32)
11696 return SYMBOL_TLSLE32;
11697 else if (aarch64_tls_size == 48)
11698 return SYMBOL_TLSLE48;
11699 else
11700 gcc_unreachable ();
11702 case TLS_MODEL_EMULATED:
11703 case TLS_MODEL_NONE:
11704 return SYMBOL_FORCE_TO_MEM;
11706 default:
11707 gcc_unreachable ();
11711 /* Return the correct method for accessing X + OFFSET, where X is either
11712 a SYMBOL_REF or LABEL_REF. */
11714 enum aarch64_symbol_type
11715 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11717 if (GET_CODE (x) == LABEL_REF)
11719 switch (aarch64_cmodel)
11721 case AARCH64_CMODEL_LARGE:
11722 return SYMBOL_FORCE_TO_MEM;
11724 case AARCH64_CMODEL_TINY_PIC:
11725 case AARCH64_CMODEL_TINY:
11726 return SYMBOL_TINY_ABSOLUTE;
11728 case AARCH64_CMODEL_SMALL_SPIC:
11729 case AARCH64_CMODEL_SMALL_PIC:
11730 case AARCH64_CMODEL_SMALL:
11731 return SYMBOL_SMALL_ABSOLUTE;
11733 default:
11734 gcc_unreachable ();
11738 if (GET_CODE (x) == SYMBOL_REF)
11740 if (aarch64_tls_symbol_p (x))
11741 return aarch64_classify_tls_symbol (x);
11743 switch (aarch64_cmodel)
11745 case AARCH64_CMODEL_TINY:
11746 /* When we retrieve symbol + offset address, we have to make sure
11747 the offset does not cause overflow of the final address. But
11748 we have no way of knowing the address of the symbol at compile time,
11749 so we can't accurately say whether the distance between the PC and
11750 symbol + offset is outside the addressable range of +/-1M in the
11751 TINY code model. So we rely on images not being larger than 1M,
11752 cap the offset at 1M, and require anything beyond that to be
11753 loaded using an alternative mechanism. Furthermore, if the
11754 symbol is a weak reference to something that isn't known to
11755 resolve to a symbol in this module, then force to memory. */
11756 if ((SYMBOL_REF_WEAK (x)
11757 && !aarch64_symbol_binds_local_p (x))
11758 || !IN_RANGE (offset, -1048575, 1048575))
11759 return SYMBOL_FORCE_TO_MEM;
11760 return SYMBOL_TINY_ABSOLUTE;
11762 case AARCH64_CMODEL_SMALL:
11763 /* Same reasoning as the tiny code model, but the offset cap here is
11764 4G. */
11765 if ((SYMBOL_REF_WEAK (x)
11766 && !aarch64_symbol_binds_local_p (x))
11767 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11768 HOST_WIDE_INT_C (4294967264)))
11769 return SYMBOL_FORCE_TO_MEM;
11770 return SYMBOL_SMALL_ABSOLUTE;
11772 case AARCH64_CMODEL_TINY_PIC:
11773 if (!aarch64_symbol_binds_local_p (x))
11774 return SYMBOL_TINY_GOT;
11775 return SYMBOL_TINY_ABSOLUTE;
11777 case AARCH64_CMODEL_SMALL_SPIC:
11778 case AARCH64_CMODEL_SMALL_PIC:
11779 if (!aarch64_symbol_binds_local_p (x))
11780 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11781 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11782 return SYMBOL_SMALL_ABSOLUTE;
11784 case AARCH64_CMODEL_LARGE:
11785 /* This is alright even in PIC code as the constant
11786 pool reference is always PC relative and within
11787 the same translation unit. */
11788 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11789 return SYMBOL_SMALL_ABSOLUTE;
11790 else
11791 return SYMBOL_FORCE_TO_MEM;
11793 default:
11794 gcc_unreachable ();
11798 /* By default push everything into the constant pool. */
11799 return SYMBOL_FORCE_TO_MEM;
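/* For example (illustrative): in the small code model a symbol that binds
   locally and has an in-range offset is classified SYMBOL_SMALL_ABSOLUTE
   and can be materialized as

       adrp    x0, sym            // page address
       add     x0, x0, :lo12:sym  // low 12 bits

   whereas a weak reference that may resolve outside the module is
   SYMBOL_FORCE_TO_MEM and is loaded from the literal pool instead.  */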
11802 bool
11803 aarch64_constant_address_p (rtx x)
11805 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11808 bool
11809 aarch64_legitimate_pic_operand_p (rtx x)
11811 if (GET_CODE (x) == SYMBOL_REF
11812 || (GET_CODE (x) == CONST
11813 && GET_CODE (XEXP (x, 0)) == PLUS
11814 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11815 return false;
11817 return true;
11820 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11821 that should be rematerialized rather than spilled. */
11823 static bool
11824 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11826 /* Support CSE and rematerialization of common constants. */
11827 if (CONST_INT_P (x)
11828 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11829 || GET_CODE (x) == CONST_VECTOR)
11830 return true;
11832 /* Do not allow vector struct mode constants for Advanced SIMD.
11833 We could support 0 and -1 easily, but they need support in
11834 aarch64-simd.md. */
11835 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11836 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11837 return false;
11839 /* Only accept variable-length vector constants if they can be
11840 handled directly.
11842 ??? It would be possible to handle rematerialization of other
11843 constants via secondary reloads. */
11844 if (vec_flags & VEC_ANY_SVE)
11845 return aarch64_simd_valid_immediate (x, NULL);
11847 if (GET_CODE (x) == HIGH)
11848 x = XEXP (x, 0);
11850 /* Accept polynomial constants that can be calculated by using the
11851 destination of a move as the sole temporary. Constants that
11852 require a second temporary cannot be rematerialized (they can't be
11853 forced to memory and also aren't legitimate constants). */
11854 poly_int64 offset;
11855 if (poly_int_rtx_p (x, &offset))
11856 return aarch64_offset_temporaries (false, offset) <= 1;
11858 /* If an offset is being added to something else, we need to allow the
11859 base to be moved into the destination register, meaning that there
11860 are no free temporaries for the offset. */
11861 x = strip_offset (x, &offset);
11862 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11863 return false;
11865 /* Do not allow const (plus (anchor_symbol, const_int)). */
11866 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11867 return false;
11869 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11870 so spilling them is better than rematerialization. */
11871 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11872 return true;
11874 /* Label references are always constant. */
11875 if (GET_CODE (x) == LABEL_REF)
11876 return true;
11878 return false;
11881 static rtx
11882 aarch64_load_tp (rtx target)
11884 if (!target
11885 || GET_MODE (target) != Pmode
11886 || !register_operand (target, Pmode))
11887 target = gen_reg_rtx (Pmode);
11889 /* Can return in any reg. */
11890 emit_insn (gen_aarch64_load_tp_hard (target));
11891 return target;
11894 /* On AAPCS systems, this is the "struct __va_list". */
11895 static GTY(()) tree va_list_type;
11897 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11898 Return the type to use as __builtin_va_list.
11900 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11902 struct __va_list
11904 void *__stack;
11905 void *__gr_top;
11906 void *__vr_top;
11907 int __gr_offs;
11908 int __vr_offs;
11909 }; */
11911 static tree
11912 aarch64_build_builtin_va_list (void)
11914 tree va_list_name;
11915 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11917 /* Create the type. */
11918 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11919 /* Give it the required name. */
11920 va_list_name = build_decl (BUILTINS_LOCATION,
11921 TYPE_DECL,
11922 get_identifier ("__va_list"),
11923 va_list_type);
11924 DECL_ARTIFICIAL (va_list_name) = 1;
11925 TYPE_NAME (va_list_type) = va_list_name;
11926 TYPE_STUB_DECL (va_list_type) = va_list_name;
11928 /* Create the fields. */
11929 f_stack = build_decl (BUILTINS_LOCATION,
11930 FIELD_DECL, get_identifier ("__stack"),
11931 ptr_type_node);
11932 f_grtop = build_decl (BUILTINS_LOCATION,
11933 FIELD_DECL, get_identifier ("__gr_top"),
11934 ptr_type_node);
11935 f_vrtop = build_decl (BUILTINS_LOCATION,
11936 FIELD_DECL, get_identifier ("__vr_top"),
11937 ptr_type_node);
11938 f_groff = build_decl (BUILTINS_LOCATION,
11939 FIELD_DECL, get_identifier ("__gr_offs"),
11940 integer_type_node);
11941 f_vroff = build_decl (BUILTINS_LOCATION,
11942 FIELD_DECL, get_identifier ("__vr_offs"),
11943 integer_type_node);
11945 /* Tell tree-stdarg pass about our internal offset fields.
11946 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
11947 purposes, to identify whether the code updates the va_list internal
11948 offset fields in an irregular way. */
11949 va_list_gpr_counter_field = f_groff;
11950 va_list_fpr_counter_field = f_vroff;
11952 DECL_ARTIFICIAL (f_stack) = 1;
11953 DECL_ARTIFICIAL (f_grtop) = 1;
11954 DECL_ARTIFICIAL (f_vrtop) = 1;
11955 DECL_ARTIFICIAL (f_groff) = 1;
11956 DECL_ARTIFICIAL (f_vroff) = 1;
11958 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
11959 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
11960 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
11961 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
11962 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
11964 TYPE_FIELDS (va_list_type) = f_stack;
11965 DECL_CHAIN (f_stack) = f_grtop;
11966 DECL_CHAIN (f_grtop) = f_vrtop;
11967 DECL_CHAIN (f_vrtop) = f_groff;
11968 DECL_CHAIN (f_groff) = f_vroff;
11970 /* Compute its layout. */
11971 layout_type (va_list_type);
11973 return va_list_type;
11976 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
11977 static void
11978 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
11980 const CUMULATIVE_ARGS *cum;
11981 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11982 tree stack, grtop, vrtop, groff, vroff;
11983 tree t;
11984 int gr_save_area_size = cfun->va_list_gpr_size;
11985 int vr_save_area_size = cfun->va_list_fpr_size;
11986 int vr_offset;
11988 cum = &crtl->args.info;
11989 if (cfun->va_list_gpr_size)
11990 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
11991 cfun->va_list_gpr_size);
11992 if (cfun->va_list_fpr_size)
11993 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
11994 * UNITS_PER_VREG, cfun->va_list_fpr_size);
11996 if (!TARGET_FLOAT)
11998 gcc_assert (cum->aapcs_nvrn == 0);
11999 vr_save_area_size = 0;
12002 f_stack = TYPE_FIELDS (va_list_type_node);
12003 f_grtop = DECL_CHAIN (f_stack);
12004 f_vrtop = DECL_CHAIN (f_grtop);
12005 f_groff = DECL_CHAIN (f_vrtop);
12006 f_vroff = DECL_CHAIN (f_groff);
12008 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12009 NULL_TREE);
12010 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12011 NULL_TREE);
12012 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12013 NULL_TREE);
12014 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12015 NULL_TREE);
12016 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12017 NULL_TREE);
12019 /* Emit code to initialize STACK, which points to the next varargs stack
12020 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12021 by named arguments. STACK is 8-byte aligned. */
12022 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12023 if (cum->aapcs_stack_size > 0)
12024 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12025 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12026 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12028 /* Emit code to initialize GRTOP, the top of the GR save area.
12029 virtual_incoming_args_rtx should have been 16 byte aligned. */
12030 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12031 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12032 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12034 /* Emit code to initialize VRTOP, the top of the VR save area.
12035 This address is gr_save_area_bytes below GRTOP, rounded
12036 down to the next 16-byte boundary. */
12037 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12038 vr_offset = ROUND_UP (gr_save_area_size,
12039 STACK_BOUNDARY / BITS_PER_UNIT);
12041 if (vr_offset)
12042 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12043 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12044 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12046 /* Emit code to initialize GROFF, the offset from GRTOP of the
12047 next GPR argument. */
12048 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12049 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12050 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12052 /* Likewise emit code to initialize VROFF, the offset from VRTOP
12053 of the next VR argument. */
12054 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12055 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12056 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
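/* A worked example (illustrative, assuming the full save areas are live and
   no named arguments were passed on the stack): for a hypothetical
   "int f (int n, ...)" with one named GP argument, NUM_ARG_REGS == 8,
   UNITS_PER_WORD == 8, NUM_FP_ARG_REGS == 8 and UNITS_PER_VREG == 16 give

     __stack   = incoming args pointer
     __gr_top  = incoming args pointer
     __vr_top  = __gr_top - ROUND_UP (7 * 8, 16)  = __gr_top - 64
     __gr_offs = -(7 * 8)   = -56
     __vr_offs = -(8 * 16)  = -128.  */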
12059 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12061 static tree
12062 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12063 gimple_seq *post_p ATTRIBUTE_UNUSED)
12065 tree addr;
12066 bool indirect_p;
12067 bool is_ha; /* is HFA or HVA. */
12068 bool dw_align; /* double-word align. */
12069 machine_mode ag_mode = VOIDmode;
12070 int nregs;
12071 machine_mode mode;
12073 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12074 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12075 HOST_WIDE_INT size, rsize, adjust, align;
12076 tree t, u, cond1, cond2;
12078 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12079 if (indirect_p)
12080 type = build_pointer_type (type);
12082 mode = TYPE_MODE (type);
12084 f_stack = TYPE_FIELDS (va_list_type_node);
12085 f_grtop = DECL_CHAIN (f_stack);
12086 f_vrtop = DECL_CHAIN (f_grtop);
12087 f_groff = DECL_CHAIN (f_vrtop);
12088 f_vroff = DECL_CHAIN (f_groff);
12090 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12091 f_stack, NULL_TREE);
12092 size = int_size_in_bytes (type);
12093 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12095 dw_align = false;
12096 adjust = 0;
12097 if (aarch64_vfp_is_call_or_return_candidate (mode,
12098 type,
12099 &ag_mode,
12100 &nregs,
12101 &is_ha))
12103 /* No frontends can create types with variable-sized modes, so we
12104 shouldn't be asked to pass or return them. */
12105 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12107 /* TYPE passed in fp/simd registers. */
12108 if (!TARGET_FLOAT)
12109 aarch64_err_no_fpadvsimd (mode);
12111 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12112 unshare_expr (valist), f_vrtop, NULL_TREE);
12113 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12114 unshare_expr (valist), f_vroff, NULL_TREE);
12116 rsize = nregs * UNITS_PER_VREG;
12118 if (is_ha)
12120 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12121 adjust = UNITS_PER_VREG - ag_size;
12123 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12124 && size < UNITS_PER_VREG)
12126 adjust = UNITS_PER_VREG - size;
12129 else
12131 /* TYPE passed in general registers. */
12132 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12133 unshare_expr (valist), f_grtop, NULL_TREE);
12134 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12135 unshare_expr (valist), f_groff, NULL_TREE);
12136 rsize = ROUND_UP (size, UNITS_PER_WORD);
12137 nregs = rsize / UNITS_PER_WORD;
12139 if (align > 8)
12140 dw_align = true;
12142 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12143 && size < UNITS_PER_WORD)
12145 adjust = UNITS_PER_WORD - size;
12149 /* Get a local temporary for the field value. */
12150 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12152 /* Emit code to branch if off >= 0. */
12153 t = build2 (GE_EXPR, boolean_type_node, off,
12154 build_int_cst (TREE_TYPE (off), 0));
12155 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12157 if (dw_align)
12159 /* Emit: offs = (offs + 15) & -16. */
12160 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12161 build_int_cst (TREE_TYPE (off), 15));
12162 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12163 build_int_cst (TREE_TYPE (off), -16));
12164 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12166 else
12167 roundup = NULL;
12169 /* Update ap.__[g|v]r_offs */
12170 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12171 build_int_cst (TREE_TYPE (off), rsize));
12172 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12174 /* String up. */
12175 if (roundup)
12176 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12178 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12179 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12180 build_int_cst (TREE_TYPE (f_off), 0));
12181 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12183 /* String up: make sure the assignment happens before the use. */
12184 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12185 COND_EXPR_ELSE (cond1) = t;
12187 /* Prepare the trees handling the argument that is passed on the stack;
12188 the top level node will store in ON_STACK. */
12189 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12190 if (align > 8)
12192 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12193 t = fold_build_pointer_plus_hwi (arg, 15);
12194 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12195 build_int_cst (TREE_TYPE (t), -16));
12196 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12198 else
12199 roundup = NULL;
12200 /* Advance ap.__stack */
12201 t = fold_build_pointer_plus_hwi (arg, size + 7);
12202 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12203 build_int_cst (TREE_TYPE (t), -8));
12204 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12205 /* String up roundup and advance. */
12206 if (roundup)
12207 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12208 /* String up with arg */
12209 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12210 /* Big-endianness related address adjustment. */
12211 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12212 && size < UNITS_PER_WORD)
12214 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12215 size_int (UNITS_PER_WORD - size));
12216 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12219 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12220 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12222 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12223 t = off;
12224 if (adjust)
12225 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12226 build_int_cst (TREE_TYPE (off), adjust));
12228 t = fold_convert (sizetype, t);
12229 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12231 if (is_ha)
12233 /* type ha; // treat as "struct {ftype field[n];}"
12234 ... [computing offs]
12235 for (i = 0; i <nregs; ++i, offs += 16)
12236 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12237 return ha; */
12238 int i;
12239 tree tmp_ha, field_t, field_ptr_t;
12241 /* Declare a local variable. */
12242 tmp_ha = create_tmp_var_raw (type, "ha");
12243 gimple_add_tmp_var (tmp_ha);
12245 /* Establish the base type. */
12246 switch (ag_mode)
12248 case E_SFmode:
12249 field_t = float_type_node;
12250 field_ptr_t = float_ptr_type_node;
12251 break;
12252 case E_DFmode:
12253 field_t = double_type_node;
12254 field_ptr_t = double_ptr_type_node;
12255 break;
12256 case E_TFmode:
12257 field_t = long_double_type_node;
12258 field_ptr_t = long_double_ptr_type_node;
12259 break;
12260 case E_HFmode:
12261 field_t = aarch64_fp16_type_node;
12262 field_ptr_t = aarch64_fp16_ptr_type_node;
12263 break;
12264 case E_V2SImode:
12265 case E_V4SImode:
12267 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12268 field_t = build_vector_type_for_mode (innertype, ag_mode);
12269 field_ptr_t = build_pointer_type (field_t);
12271 break;
12272 default:
12273 gcc_assert (0);
12276 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12277 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12278 addr = t;
12279 t = fold_convert (field_ptr_t, addr);
12280 t = build2 (MODIFY_EXPR, field_t,
12281 build1 (INDIRECT_REF, field_t, tmp_ha),
12282 build1 (INDIRECT_REF, field_t, t));
12284 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12285 for (i = 1; i < nregs; ++i)
12287 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12288 u = fold_convert (field_ptr_t, addr);
12289 u = build2 (MODIFY_EXPR, field_t,
12290 build2 (MEM_REF, field_t, tmp_ha,
12291 build_int_cst (field_ptr_t,
12292 (i *
12293 int_size_in_bytes (field_t)))),
12294 build1 (INDIRECT_REF, field_t, u));
12295 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12298 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12299 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12302 COND_EXPR_ELSE (cond2) = t;
12303 addr = fold_convert (build_pointer_type (type), cond1);
12304 addr = build_va_arg_indirect_ref (addr);
12306 if (indirect_p)
12307 addr = build_va_arg_indirect_ref (addr);
12309 return addr;
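/* A sketch of the sequence gimplified above for an argument passed in
   general registers (pseudo C; accesses go through the __va_list fields):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                   // register save area already used up
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;                   // this argument didn't fit either
     addr = ap.__gr_top + off;          // within the register save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (addr + size + 7) & -8;
   done:
     result = *(type *) addr;

   with extra rounding steps when the argument needs 16-byte alignment and
   a big-endian address adjustment for small arguments.  */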
12312 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12314 static void
12315 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12316 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12317 int no_rtl)
12319 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12320 CUMULATIVE_ARGS local_cum;
12321 int gr_saved = cfun->va_list_gpr_size;
12322 int vr_saved = cfun->va_list_fpr_size;
12324 /* The caller has advanced CUM up to, but not beyond, the last named
12325 argument. Advance a local copy of CUM past the last "real" named
12326 argument, to find out how many registers are left over. */
12327 local_cum = *cum;
12328 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12330 /* Find out how many registers we need to save.
12331 Honour the tree-stdarg analysis results. */
12332 if (cfun->va_list_gpr_size)
12333 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12334 cfun->va_list_gpr_size / UNITS_PER_WORD);
12335 if (cfun->va_list_fpr_size)
12336 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12337 cfun->va_list_fpr_size / UNITS_PER_VREG);
12339 if (!TARGET_FLOAT)
12341 gcc_assert (local_cum.aapcs_nvrn == 0);
12342 vr_saved = 0;
12345 if (!no_rtl)
12347 if (gr_saved > 0)
12349 rtx ptr, mem;
12351 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12352 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12353 - gr_saved * UNITS_PER_WORD);
12354 mem = gen_frame_mem (BLKmode, ptr);
12355 set_mem_alias_set (mem, get_varargs_alias_set ());
12357 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12358 mem, gr_saved);
12360 if (vr_saved > 0)
12362 /* We can't use move_block_from_reg, because it will use
12363 the wrong mode, storing D regs only. */
12364 machine_mode mode = TImode;
12365 int off, i, vr_start;
12367 /* Set OFF to the offset from virtual_incoming_args_rtx of
12368 the first vector register. The VR save area lies below
12369 the GR one, and is aligned to 16 bytes. */
12370 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12371 STACK_BOUNDARY / BITS_PER_UNIT);
12372 off -= vr_saved * UNITS_PER_VREG;
12374 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12375 for (i = 0; i < vr_saved; ++i)
12377 rtx ptr, mem;
12379 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12380 mem = gen_frame_mem (mode, ptr);
12381 set_mem_alias_set (mem, get_varargs_alias_set ());
12382 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12383 off += UNITS_PER_VREG;
12388 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12389 any complication of having crtl->args.pretend_args_size changed. */
12390 cfun->machine->frame.saved_varargs_size
12391 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12392 STACK_BOUNDARY / BITS_PER_UNIT)
12393 + vr_saved * UNITS_PER_VREG);
12396 static void
12397 aarch64_conditional_register_usage (void)
12399 int i;
12400 if (!TARGET_FLOAT)
12402 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12404 fixed_regs[i] = 1;
12405 call_used_regs[i] = 1;
12408 if (!TARGET_SVE)
12409 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12411 fixed_regs[i] = 1;
12412 call_used_regs[i] = 1;
12415 /* When tracking speculation, we need a couple of call-clobbered registers
12416 to track the speculation state. It would be nice to just use
12417 IP0 and IP1, but currently there are numerous places that just
12418 assume these registers are free for other uses (e.g. pointer
12419 authentication). */
12420 if (aarch64_track_speculation)
12422 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
12423 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
12424 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12425 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12429 /* Walk down the type tree of TYPE counting consecutive base elements.
12430 If *MODEP is VOIDmode, then set it to the first valid floating point
12431 type. If a non-floating point type is found, or if a floating point
12432 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12433 otherwise return the count in the sub-tree. */
12434 static int
12435 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12437 machine_mode mode;
12438 HOST_WIDE_INT size;
12440 switch (TREE_CODE (type))
12442 case REAL_TYPE:
12443 mode = TYPE_MODE (type);
12444 if (mode != DFmode && mode != SFmode
12445 && mode != TFmode && mode != HFmode)
12446 return -1;
12448 if (*modep == VOIDmode)
12449 *modep = mode;
12451 if (*modep == mode)
12452 return 1;
12454 break;
12456 case COMPLEX_TYPE:
12457 mode = TYPE_MODE (TREE_TYPE (type));
12458 if (mode != DFmode && mode != SFmode
12459 && mode != TFmode && mode != HFmode)
12460 return -1;
12462 if (*modep == VOIDmode)
12463 *modep = mode;
12465 if (*modep == mode)
12466 return 2;
12468 break;
12470 case VECTOR_TYPE:
12471 /* Use V2SImode and V4SImode as representatives of all 64-bit
12472 and 128-bit vector types. */
12473 size = int_size_in_bytes (type);
12474 switch (size)
12476 case 8:
12477 mode = V2SImode;
12478 break;
12479 case 16:
12480 mode = V4SImode;
12481 break;
12482 default:
12483 return -1;
12486 if (*modep == VOIDmode)
12487 *modep = mode;
12489 /* Vector modes are considered to be opaque: two vectors are
12490 equivalent for the purposes of being homogeneous aggregates
12491 if they are the same size. */
12492 if (*modep == mode)
12493 return 1;
12495 break;
12497 case ARRAY_TYPE:
12499 int count;
12500 tree index = TYPE_DOMAIN (type);
12502 /* Can't handle incomplete types nor sizes that are not
12503 fixed. */
12504 if (!COMPLETE_TYPE_P (type)
12505 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12506 return -1;
12508 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12509 if (count == -1
12510 || !index
12511 || !TYPE_MAX_VALUE (index)
12512 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12513 || !TYPE_MIN_VALUE (index)
12514 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12515 || count < 0)
12516 return -1;
12518 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12519 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12521 /* There must be no padding. */
12522 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12523 count * GET_MODE_BITSIZE (*modep)))
12524 return -1;
12526 return count;
12529 case RECORD_TYPE:
12531 int count = 0;
12532 int sub_count;
12533 tree field;
12535 /* Can't handle incomplete types nor sizes that are not
12536 fixed. */
12537 if (!COMPLETE_TYPE_P (type)
12538 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12539 return -1;
12541 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12543 if (TREE_CODE (field) != FIELD_DECL)
12544 continue;
12546 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12547 if (sub_count < 0)
12548 return -1;
12549 count += sub_count;
12552 /* There must be no padding. */
12553 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12554 count * GET_MODE_BITSIZE (*modep)))
12555 return -1;
12557 return count;
12560 case UNION_TYPE:
12561 case QUAL_UNION_TYPE:
12563 /* These aren't very interesting except in a degenerate case. */
12564 int count = 0;
12565 int sub_count;
12566 tree field;
12568 /* Can't handle incomplete types nor sizes that are not
12569 fixed. */
12570 if (!COMPLETE_TYPE_P (type)
12571 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12572 return -1;
12574 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12576 if (TREE_CODE (field) != FIELD_DECL)
12577 continue;
12579 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12580 if (sub_count < 0)
12581 return -1;
12582 count = count > sub_count ? count : sub_count;
12585 /* There must be no padding. */
12586 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12587 count * GET_MODE_BITSIZE (*modep)))
12588 return -1;
12590 return count;
12593 default:
12594 break;
12597 return -1;
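/* Some illustrative results of the walk above:

     struct { float x, y, z; }      ->  3, *MODEP == SFmode   (an HFA)
     double a[4]                    ->  4, *MODEP == DFmode   (an HFA)
     struct { float x; double y; }  -> -1                     (mixed base types)
     struct { float x[5]; }         ->  5, but rejected by the caller
                                         because it exceeds HA_MAX_NUM_FLDS.  */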
12600 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12601 type as described in AAPCS64 \S 4.1.2.
12603 See the comment above aarch64_composite_type_p for the notes on MODE. */
12605 static bool
12606 aarch64_short_vector_p (const_tree type,
12607 machine_mode mode)
12609 poly_int64 size = -1;
12611 if (type && TREE_CODE (type) == VECTOR_TYPE)
12612 size = int_size_in_bytes (type);
12613 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12614 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12615 size = GET_MODE_SIZE (mode);
12617 return known_eq (size, 8) || known_eq (size, 16);
12620 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12621 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12622 array types. The C99 floating-point complex types are also considered
12623 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12624 types, which are GCC extensions and out of the scope of AAPCS64, are
12625 treated as composite types here as well.
12627 Note that MODE itself is not sufficient in determining whether a type
12628 is such a composite type or not. This is because
12629 stor-layout.c:compute_record_mode may have already changed the MODE
12630 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12631 structure with only one field may have its MODE set to the mode of the
12632 field. Also an integer mode whose size matches the size of the
12633 RECORD_TYPE type may be used to substitute the original mode
12634 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12635 solely relied on. */
12637 static bool
12638 aarch64_composite_type_p (const_tree type,
12639 machine_mode mode)
12641 if (aarch64_short_vector_p (type, mode))
12642 return false;
12644 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12645 return true;
12647 if (mode == BLKmode
12648 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12649 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12650 return true;
12652 return false;
12655 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12656 shall be passed or returned in simd/fp register(s) (providing these
12657 parameter passing registers are available).
12659 Upon successful return, *COUNT returns the number of needed registers,
12660 *BASE_MODE returns the mode of the individual register and, when IS_HA
12661 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12662 floating-point aggregate or a homogeneous short-vector aggregate. */
12664 static bool
12665 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12666 const_tree type,
12667 machine_mode *base_mode,
12668 int *count,
12669 bool *is_ha)
12671 machine_mode new_mode = VOIDmode;
12672 bool composite_p = aarch64_composite_type_p (type, mode);
12674 if (is_ha != NULL) *is_ha = false;
12676 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12677 || aarch64_short_vector_p (type, mode))
12679 *count = 1;
12680 new_mode = mode;
12682 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12684 if (is_ha != NULL) *is_ha = true;
12685 *count = 2;
12686 new_mode = GET_MODE_INNER (mode);
12688 else if (type && composite_p)
12690 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12692 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12694 if (is_ha != NULL) *is_ha = true;
12695 *count = ag_count;
12697 else
12698 return false;
12700 else
12701 return false;
12703 *base_mode = new_mode;
12704 return true;
12707 /* Implement TARGET_STRUCT_VALUE_RTX. */
12709 static rtx
12710 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12711 int incoming ATTRIBUTE_UNUSED)
12713 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12716 /* Implements target hook vector_mode_supported_p. */
12717 static bool
12718 aarch64_vector_mode_supported_p (machine_mode mode)
12720 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12721 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12724 /* Return appropriate SIMD container
12725 for MODE within a vector of WIDTH bits. */
12726 static machine_mode
12727 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12729 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12730 switch (mode)
12732 case E_DFmode:
12733 return VNx2DFmode;
12734 case E_SFmode:
12735 return VNx4SFmode;
12736 case E_HFmode:
12737 return VNx8HFmode;
12738 case E_DImode:
12739 return VNx2DImode;
12740 case E_SImode:
12741 return VNx4SImode;
12742 case E_HImode:
12743 return VNx8HImode;
12744 case E_QImode:
12745 return VNx16QImode;
12746 default:
12747 return word_mode;
12750 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12751 if (TARGET_SIMD)
12753 if (known_eq (width, 128))
12754 switch (mode)
12756 case E_DFmode:
12757 return V2DFmode;
12758 case E_SFmode:
12759 return V4SFmode;
12760 case E_HFmode:
12761 return V8HFmode;
12762 case E_SImode:
12763 return V4SImode;
12764 case E_HImode:
12765 return V8HImode;
12766 case E_QImode:
12767 return V16QImode;
12768 case E_DImode:
12769 return V2DImode;
12770 default:
12771 break;
12773 else
12774 switch (mode)
12776 case E_SFmode:
12777 return V2SFmode;
12778 case E_HFmode:
12779 return V4HFmode;
12780 case E_SImode:
12781 return V2SImode;
12782 case E_HImode:
12783 return V4HImode;
12784 case E_QImode:
12785 return V8QImode;
12786 default:
12787 break;
12790 return word_mode;
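/* For example (illustrative): SFmode in a 128-bit vector gives V4SFmode,
   SFmode in a 64-bit vector gives V2SFmode, and SFmode at the full SVE
   vector width gives VNx4SFmode when SVE is enabled; modes with no
   suitable container fall back to word_mode.  */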
12793 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12794 static machine_mode
12795 aarch64_preferred_simd_mode (scalar_mode mode)
12797 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12798 return aarch64_simd_container_mode (mode, bits);
12801 /* Return a list of possible vector sizes for the vectorizer
12802 to iterate over. */
12803 static void
12804 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12806 if (TARGET_SVE)
12807 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12808 sizes->safe_push (16);
12809 sizes->safe_push (8);
12812 /* Implement TARGET_MANGLE_TYPE. */
12814 static const char *
12815 aarch64_mangle_type (const_tree type)
12817 /* The AArch64 ABI documents say that "__va_list" has to be
12818 mangled as if it is in the "std" namespace. */
12819 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12820 return "St9__va_list";
12822 /* Half-precision float. */
12823 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12824 return "Dh";
12826 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12827 builtin types. */
12828 if (TYPE_NAME (type) != NULL)
12829 return aarch64_mangle_builtin_type (type);
12831 /* Use the default mangling. */
12832 return NULL;
12835 /* Find the first rtx_insn before insn that will generate an assembly
12836 instruction. */
12838 static rtx_insn *
12839 aarch64_prev_real_insn (rtx_insn *insn)
12841 if (!insn)
12842 return NULL;
12844 do
12846 insn = prev_real_insn (insn);
12848 while (insn && recog_memoized (insn) < 0);
12850 return insn;
12853 static bool
12854 is_madd_op (enum attr_type t1)
12856 unsigned int i;
12857 /* A number of these may be AArch32 only. */
12858 enum attr_type mlatypes[] = {
12859 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12860 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12861 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12864 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12866 if (t1 == mlatypes[i])
12867 return true;
12870 return false;
12873 /* Check if there is a register dependency between a load and the insn
12874 for which we hold recog_data. */
12876 static bool
12877 dep_between_memop_and_curr (rtx memop)
12879 rtx load_reg;
12880 int opno;
12882 gcc_assert (GET_CODE (memop) == SET);
12884 if (!REG_P (SET_DEST (memop)))
12885 return false;
12887 load_reg = SET_DEST (memop);
12888 for (opno = 1; opno < recog_data.n_operands; opno++)
12890 rtx operand = recog_data.operand[opno];
12891 if (REG_P (operand)
12892 && reg_overlap_mentioned_p (load_reg, operand))
12893 return true;
12896 return false;
12900 /* When working around the Cortex-A53 erratum 835769,
12901 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12902 instruction and has a preceding memory instruction such that a NOP
12903 should be inserted between them. */
12905 bool
12906 aarch64_madd_needs_nop (rtx_insn* insn)
12908 enum attr_type attr_type;
12909 rtx_insn *prev;
12910 rtx body;
12912 if (!TARGET_FIX_ERR_A53_835769)
12913 return false;
12915 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12916 return false;
12918 attr_type = get_attr_type (insn);
12919 if (!is_madd_op (attr_type))
12920 return false;
12922 prev = aarch64_prev_real_insn (insn);
12923 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12924 Restore recog state to INSN to avoid state corruption. */
12925 extract_constrain_insn_cached (insn);
12927 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12928 return false;
12930 body = single_set (prev);
12932 /* If the previous insn is a memory op and there is no dependency between
12933 it and the DImode madd, emit a NOP between them. If body is NULL then we
12934 have a complex memory operation, probably a load/store pair.
12935 Be conservative for now and emit a NOP. */
12936 if (GET_MODE (recog_data.operand[0]) == DImode
12937 && (!body || !dep_between_memop_and_curr (body)))
12938 return true;
12940 return false;
12945 /* Implement FINAL_PRESCAN_INSN. */
12947 void
12948 aarch64_final_prescan_insn (rtx_insn *insn)
12950 if (aarch64_madd_needs_nop (insn))
12951 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
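/* For example, with -mfix-cortex-a53-835769 a 64-bit multiply-accumulate
   that directly follows a memory operation gets a NOP inserted between
   them when the conditions above hold (illustrative assembly):

       ldr   x1, [x2]
       nop   // between mem op and mult-accumulate
       madd  x0, x3, x4, x5                                              */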
12955 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
12956 instruction. */
12958 bool
12959 aarch64_sve_index_immediate_p (rtx base_or_step)
12961 return (CONST_INT_P (base_or_step)
12962 && IN_RANGE (INTVAL (base_or_step), -16, 15));
12965 /* Return true if X is a valid immediate for the SVE ADD and SUB
12966 instructions. Negate X first if NEGATE_P is true. */
12968 bool
12969 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
12971 rtx elt;
12973 if (!const_vec_duplicate_p (x, &elt)
12974 || !CONST_INT_P (elt))
12975 return false;
12977 HOST_WIDE_INT val = INTVAL (elt);
12978 if (negate_p)
12979 val = -val;
12980 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
12982 if (val & 0xff)
12983 return IN_RANGE (val, 0, 0xff);
12984 return IN_RANGE (val, 0, 0xff00);
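/* Accepted values (illustrative): 0-255, and multiples of 256 up to
   0xff00 (which map to the immediate form with LSL #8); a value such as
   0x123, which needs both a low and a high byte, is rejected.  */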
12987 /* Return true if X is a valid immediate operand for an SVE logical
12988 instruction such as AND. */
12990 bool
12991 aarch64_sve_bitmask_immediate_p (rtx x)
12993 rtx elt;
12995 return (const_vec_duplicate_p (x, &elt)
12996 && CONST_INT_P (elt)
12997 && aarch64_bitmask_imm (INTVAL (elt),
12998 GET_MODE_INNER (GET_MODE (x))));
13001 /* Return true if X is a valid immediate for the SVE DUP and CPY
13002 instructions. */
13004 bool
13005 aarch64_sve_dup_immediate_p (rtx x)
13007 rtx elt;
13009 if (!const_vec_duplicate_p (x, &elt)
13010 || !CONST_INT_P (elt))
13011 return false;
13013 HOST_WIDE_INT val = INTVAL (elt);
13014 if (val & 0xff)
13015 return IN_RANGE (val, -0x80, 0x7f);
13016 return IN_RANGE (val, -0x8000, 0x7f00);
13019 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13020 SIGNED_P says whether the operand is signed rather than unsigned. */
13022 bool
13023 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13025 rtx elt;
13027 return (const_vec_duplicate_p (x, &elt)
13028 && CONST_INT_P (elt)
13029 && (signed_p
13030 ? IN_RANGE (INTVAL (elt), -16, 15)
13031 : IN_RANGE (INTVAL (elt), 0, 127)));
13034 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13035 instruction. Negate X first if NEGATE_P is true. */
13037 bool
13038 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13040 rtx elt;
13041 REAL_VALUE_TYPE r;
13043 if (!const_vec_duplicate_p (x, &elt)
13044 || GET_CODE (elt) != CONST_DOUBLE)
13045 return false;
13047 r = *CONST_DOUBLE_REAL_VALUE (elt);
13049 if (negate_p)
13050 r = real_value_negate (&r);
13052 if (real_equal (&r, &dconst1))
13053 return true;
13054 if (real_equal (&r, &dconsthalf))
13055 return true;
13056 return false;
13059 /* Return true if X is a valid immediate operand for an SVE FMUL
13060 instruction. */
13062 bool
13063 aarch64_sve_float_mul_immediate_p (rtx x)
13065 rtx elt;
13067 /* GCC will never generate a multiply with an immediate of 2, so there is no
13068 point testing for it (even though it is a valid constant). */
13069 return (const_vec_duplicate_p (x, &elt)
13070 && GET_CODE (elt) == CONST_DOUBLE
13071 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13074 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13075 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13076 is nonnull, use it to describe valid immediates. */
13077 static bool
13078 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13079 simd_immediate_info *info,
13080 enum simd_immediate_check which,
13081 simd_immediate_info::insn_type insn)
13083 /* Try a 4-byte immediate with LSL. */
13084 for (unsigned int shift = 0; shift < 32; shift += 8)
13085 if ((val32 & (0xff << shift)) == val32)
13087 if (info)
13088 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13089 simd_immediate_info::LSL, shift);
13090 return true;
13093 /* Try a 2-byte immediate with LSL. */
13094 unsigned int imm16 = val32 & 0xffff;
13095 if (imm16 == (val32 >> 16))
13096 for (unsigned int shift = 0; shift < 16; shift += 8)
13097 if ((imm16 & (0xff << shift)) == imm16)
13099 if (info)
13100 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13101 simd_immediate_info::LSL, shift);
13102 return true;
13105 /* Try a 4-byte immediate with MSL, except for cases that MVN
13106 can handle. */
13107 if (which == AARCH64_CHECK_MOV)
13108 for (unsigned int shift = 8; shift < 24; shift += 8)
13110 unsigned int low = (1 << shift) - 1;
13111 if (((val32 & (0xff << shift)) | low) == val32)
13113 if (info)
13114 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13115 simd_immediate_info::MSL, shift);
13116 return true;
13120 return false;
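/* Some illustrative encodings of replicated 32-bit values:

     0x00ab0000  ->  MOVI Vd.4S, #0xab, LSL #16
     0x0012ffff  ->  MOVI Vd.4S, #0x12, MSL #16
     0x00012300  ->  not encodable by this routine.  */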
13123 /* Return true if replicating VAL64 is a valid immediate for the
13124 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13125 use it to describe valid immediates. */
13126 static bool
13127 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13128 simd_immediate_info *info,
13129 enum simd_immediate_check which)
13131 unsigned int val32 = val64 & 0xffffffff;
13132 unsigned int val16 = val64 & 0xffff;
13133 unsigned int val8 = val64 & 0xff;
13135 if (val32 == (val64 >> 32))
13137 if ((which & AARCH64_CHECK_ORR) != 0
13138 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13139 simd_immediate_info::MOV))
13140 return true;
13142 if ((which & AARCH64_CHECK_BIC) != 0
13143 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13144 simd_immediate_info::MVN))
13145 return true;
13147 /* Try using a replicated byte. */
13148 if (which == AARCH64_CHECK_MOV
13149 && val16 == (val32 >> 16)
13150 && val8 == (val16 >> 8))
13152 if (info)
13153 *info = simd_immediate_info (QImode, val8);
13154 return true;
13158 /* Try using a bit-to-bytemask. */
13159 if (which == AARCH64_CHECK_MOV)
13161 unsigned int i;
13162 for (i = 0; i < 64; i += 8)
13164 unsigned char byte = (val64 >> i) & 0xff;
13165 if (byte != 0 && byte != 0xff)
13166 break;
13168 if (i == 64)
13170 if (info)
13171 *info = simd_immediate_info (DImode, val64);
13172 return true;
13175 return false;
13178 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13179 instruction. If INFO is nonnull, use it to describe valid immediates. */
13181 static bool
13182 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13183 simd_immediate_info *info)
13185 scalar_int_mode mode = DImode;
13186 unsigned int val32 = val64 & 0xffffffff;
13187 if (val32 == (val64 >> 32))
13189 mode = SImode;
13190 unsigned int val16 = val32 & 0xffff;
13191 if (val16 == (val32 >> 16))
13193 mode = HImode;
13194 unsigned int val8 = val16 & 0xff;
13195 if (val8 == (val16 >> 8))
13196 mode = QImode;
13199 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13200 if (IN_RANGE (val, -0x80, 0x7f))
13202 /* DUP with no shift. */
13203 if (info)
13204 *info = simd_immediate_info (mode, val);
13205 return true;
13207 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13209 /* DUP with LSL #8. */
13210 if (info)
13211 *info = simd_immediate_info (mode, val);
13212 return true;
13214 if (aarch64_bitmask_imm (val64, mode))
13216 /* DUPM. */
13217 if (info)
13218 *info = simd_immediate_info (mode, val);
13219 return true;
13221 return false;
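/* Some illustrative examples: replicating 0xfe into every byte gives a
   QImode value of -2, matching "DUP Zd.B, #-2"; replicating 0x0100 into
   every halfword matches "DUP Zd.H, #1, LSL #8"; other replicated values
   may still be representable as a DUPM bitmask immediate.  */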
13224 /* Return true if OP is a valid SIMD immediate for the operation
13225 described by WHICH. If INFO is nonnull, use it to describe valid
13226 immediates. */
13227 bool
13228 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13229 enum simd_immediate_check which)
13231 machine_mode mode = GET_MODE (op);
13232 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13233 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13234 return false;
13236 scalar_mode elt_mode = GET_MODE_INNER (mode);
13237 rtx base, step;
13238 unsigned int n_elts;
13239 if (GET_CODE (op) == CONST_VECTOR
13240 && CONST_VECTOR_DUPLICATE_P (op))
13241 n_elts = CONST_VECTOR_NPATTERNS (op);
13242 else if ((vec_flags & VEC_SVE_DATA)
13243 && const_vec_series_p (op, &base, &step))
13245 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13246 if (!aarch64_sve_index_immediate_p (base)
13247 || !aarch64_sve_index_immediate_p (step))
13248 return false;
13250 if (info)
13251 *info = simd_immediate_info (elt_mode, base, step);
13252 return true;
13254 else if (GET_CODE (op) == CONST_VECTOR
13255 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13256 /* N_ELTS set above. */;
13257 else
13258 return false;
13260 /* Handle PFALSE and PTRUE. */
13261 if (vec_flags & VEC_SVE_PRED)
13262 return (op == CONST0_RTX (mode)
13263 || op == CONSTM1_RTX (mode));
13265 scalar_float_mode elt_float_mode;
13266 if (n_elts == 1
13267 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13269 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13270 if (aarch64_float_const_zero_rtx_p (elt)
13271 || aarch64_float_const_representable_p (elt))
13273 if (info)
13274 *info = simd_immediate_info (elt_float_mode, elt);
13275 return true;
13279 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13280 if (elt_size > 8)
13281 return false;
13283 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13285 /* Expand the vector constant out into a byte vector, with the least
13286 significant byte of the register first. */
13287 auto_vec<unsigned char, 16> bytes;
13288 bytes.reserve (n_elts * elt_size);
13289 for (unsigned int i = 0; i < n_elts; i++)
13291 /* The vector is provided in gcc endian-neutral fashion.
13292 For aarch64_be Advanced SIMD, it must be laid out in the vector
13293 register in reverse order. */
13294 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13295 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13297 if (elt_mode != elt_int_mode)
13298 elt = gen_lowpart (elt_int_mode, elt);
13300 if (!CONST_INT_P (elt))
13301 return false;
13303 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13304 for (unsigned int byte = 0; byte < elt_size; byte++)
13306 bytes.quick_push (elt_val & 0xff);
13307 elt_val >>= BITS_PER_UNIT;
13311 /* The immediate must repeat every eight bytes. */
13312 unsigned int nbytes = bytes.length ();
13313 for (unsigned i = 8; i < nbytes; ++i)
13314 if (bytes[i] != bytes[i - 8])
13315 return false;
13317 /* Get the repeating 8-byte value as an integer. No endian correction
13318 is needed here because bytes is already in lsb-first order. */
13319 unsigned HOST_WIDE_INT val64 = 0;
13320 for (unsigned int i = 0; i < 8; i++)
13321 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13322 << (i * BITS_PER_UNIT));
13324 if (vec_flags & VEC_SVE_DATA)
13325 return aarch64_sve_valid_immediate (val64, info);
13326 else
13327 return aarch64_advsimd_valid_immediate (val64, info, which);
13330 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13331 has a step in the range of INDEX. Return the index expression if so,
13332 otherwise return null. */
13333 rtx
13334 aarch64_check_zero_based_sve_index_immediate (rtx x)
13336 rtx base, step;
13337 if (const_vec_series_p (x, &base, &step)
13338 && base == const0_rtx
13339 && aarch64_sve_index_immediate_p (step))
13340 return step;
13341 return NULL_RTX;
13344 /* Check whether immediate shift constants are within range. */
13345 bool
13346 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13348 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13349 if (left)
13350 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13351 else
13352 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13355 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13356 operation of width WIDTH at bit position POS. */
13358 rtx
13359 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13361 gcc_assert (CONST_INT_P (width));
13362 gcc_assert (CONST_INT_P (pos));
13364 unsigned HOST_WIDE_INT mask
13365 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13366 return GEN_INT (mask << UINTVAL (pos));
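/* For example, WIDTH == 8 and POS == 16 give ((1 << 8) - 1) << 16,
   i.e. the mask 0x00ff0000.  */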
13369 bool
13370 aarch64_mov_operand_p (rtx x, machine_mode mode)
13372 if (GET_CODE (x) == HIGH
13373 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13374 return true;
13376 if (CONST_INT_P (x))
13377 return true;
13379 if (VECTOR_MODE_P (GET_MODE (x)))
13380 return aarch64_simd_valid_immediate (x, NULL);
13382 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13383 return true;
13385 if (aarch64_sve_cnt_immediate_p (x))
13386 return true;
13388 return aarch64_classify_symbolic_expression (x)
13389 == SYMBOL_TINY_ABSOLUTE;
13392 /* Return a const_int vector of VAL. */
13393 rtx
13394 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13396 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13397 return gen_const_vec_duplicate (mode, c);
13400 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13402 bool
13403 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13405 machine_mode vmode;
13407 vmode = aarch64_simd_container_mode (mode, 64);
13408 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13409 return aarch64_simd_valid_immediate (op_v, NULL);
13412 /* Construct and return a PARALLEL RTX vector with elements numbering the
13413 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13414 the vector - from the perspective of the architecture. This does not
13415 line up with GCC's perspective on lane numbers, so we end up with
13416 different masks depending on our target endian-ness. The diagram
13417 below may help. We must draw the distinction when building masks
13418 which select one half of the vector. An instruction selecting
13419 architectural low-lanes for a big-endian target must be described using
13420 a mask selecting GCC high-lanes.
13422 Big-Endian Little-Endian
13424 GCC 0 1 2 3 3 2 1 0
13425 | x | x | x | x | | x | x | x | x |
13426 Architecture 3 2 1 0 3 2 1 0
13428 Low Mask: { 2, 3 } { 0, 1 }
13429 High Mask: { 0, 1 } { 2, 3 }
13431 MODE is the mode of the vector and NUNITS is the number of units in it. */
13433 rtx
13434 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13436 rtvec v = rtvec_alloc (nunits / 2);
13437 int high_base = nunits / 2;
13438 int low_base = 0;
13439 int base;
13440 rtx t1;
13441 int i;
13443 if (BYTES_BIG_ENDIAN)
13444 base = high ? low_base : high_base;
13445 else
13446 base = high ? high_base : low_base;
13448 for (i = 0; i < nunits / 2; i++)
13449 RTVEC_ELT (v, i) = GEN_INT (base + i);
13451 t1 = gen_rtx_PARALLEL (mode, v);
13452 return t1;
13455 /* Check OP for validity as a PARALLEL RTX vector with elements
13456 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13457 from the perspective of the architecture. See the diagram above
13458 aarch64_simd_vect_par_cnst_half for more details. */
13460 bool
13461 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13462 bool high)
13464 int nelts;
13465 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13466 return false;
13468 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13469 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13470 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13471 int i = 0;
13473 if (count_op != count_ideal)
13474 return false;
13476 for (i = 0; i < count_ideal; i++)
13478 rtx elt_op = XVECEXP (op, 0, i);
13479 rtx elt_ideal = XVECEXP (ideal, 0, i);
13481 if (!CONST_INT_P (elt_op)
13482 || INTVAL (elt_ideal) != INTVAL (elt_op))
13483 return false;
13485 return true;
13488 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13489 HIGH (exclusive). */
13490 void
13491 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13492 const_tree exp)
13494 HOST_WIDE_INT lane;
13495 gcc_assert (CONST_INT_P (operand));
13496 lane = INTVAL (operand);
13498 if (lane < low || lane >= high)
13500 if (exp)
13501 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13502 else
13503 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13507 /* Perform endian correction on lane number N, which indexes a vector
13508 of mode MODE, and return the result as an SImode rtx. */
13511 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13513 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
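/* For illustration, assuming the usual definition of ENDIAN_LANE_N
   (N on little-endian, NUNITS - 1 - N on big-endian): lane 1 of a
   V4SImode vector becomes (const_int 1) on little-endian and
   (const_int 2) on big-endian.  */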
13516 /* Return TRUE if OP is a valid vector addressing mode. */
13518 bool
13519 aarch64_simd_mem_operand_p (rtx op)
13521 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13522 || REG_P (XEXP (op, 0)));
13525 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13527 bool
13528 aarch64_sve_ld1r_operand_p (rtx op)
13530 struct aarch64_address_info addr;
13531 scalar_mode mode;
13533 return (MEM_P (op)
13534 && is_a <scalar_mode> (GET_MODE (op), &mode)
13535 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13536 && addr.type == ADDRESS_REG_IMM
13537 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
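/* Rough guide to the accepted range: offset_6bit_unsigned_scaled_p
   requires a non-negative multiple of the element size that fits in a
   scaled 6-bit field, so for a DImode LD1RD that means byte offsets
   0, 8, ..., 63 * 8 == 504.  */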
13540 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13541 The conditions for STR are the same. */
13542 bool
13543 aarch64_sve_ldr_operand_p (rtx op)
13545 struct aarch64_address_info addr;
13547 return (MEM_P (op)
13548 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13549 false, ADDR_QUERY_ANY)
13550 && addr.type == ADDRESS_REG_IMM);
13553 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13554 We need to be able to access the individual pieces, so the range
13555 is different from LD[234] and ST[234]. */
13556 bool
13557 aarch64_sve_struct_memory_operand_p (rtx op)
13559 if (!MEM_P (op))
13560 return false;
13562 machine_mode mode = GET_MODE (op);
13563 struct aarch64_address_info addr;
13564 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13565 ADDR_QUERY_ANY)
13566 || addr.type != ADDRESS_REG_IMM)
13567 return false;
13569 poly_int64 first = addr.const_offset;
13570 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13571 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13572 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13575 /* Emit a register copy from operand to operand, taking care not to
13576 early-clobber source registers in the process.
13578 COUNT is the number of components into which the copy needs to be
13579 decomposed. */
13580 void
13581 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13582 unsigned int count)
13584 unsigned int i;
13585 int rdest = REGNO (operands[0]);
13586 int rsrc = REGNO (operands[1]);
13588 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13589 || rdest < rsrc)
13590 for (i = 0; i < count; i++)
13591 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13592 gen_rtx_REG (mode, rsrc + i));
13593 else
13594 for (i = 0; i < count; i++)
13595 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13596 gen_rtx_REG (mode, rsrc + count - i - 1));
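/* Example of the ordering above (illustrative): copying an overlapping
   two-register group from registers {1, 2} to {0, 1} has rdest < rsrc, so
   the forward loop copies reg 1 -> 0 before reg 2 -> 1; copying {0, 1} to
   {1, 2} takes the backward loop, copying reg 1 -> 2 before reg 0 -> 1.
   Either way each source register is read before it is overwritten.  */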
13599 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13600 one of VSTRUCT modes: OI, CI, or XI. */
13602 aarch64_simd_attr_length_rglist (machine_mode mode)
13604 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13605 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13608 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13609 alignment of a vector to 128 bits. SVE predicates have an alignment of
13610 16 bits. */
13611 static HOST_WIDE_INT
13612 aarch64_simd_vector_alignment (const_tree type)
13614 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13615 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13616 be set for non-predicate vectors of booleans. Modes are the most
13617 direct way we have of identifying real SVE predicate types. */
13618 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13619 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13620 return MIN (align, 128);
13623 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13624 static HOST_WIDE_INT
13625 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13627 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13629 /* If the length of the vector is fixed, try to align to that length,
13630 otherwise don't try to align at all. */
13631 HOST_WIDE_INT result;
13632 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13633 result = TYPE_ALIGN (TREE_TYPE (type));
13634 return result;
13636 return TYPE_ALIGN (type);
13639 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13640 static bool
13641 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13643 if (is_packed)
13644 return false;
13646 /* For fixed-length vectors, check that the vectorizer will aim for
13647 full-vector alignment. This isn't true for generic GCC vectors
13648 that are wider than the ABI maximum of 128 bits. */
13649 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13650 && (wi::to_widest (TYPE_SIZE (type))
13651 != aarch64_vectorize_preferred_vector_alignment (type)))
13652 return false;
13654 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13655 return true;
13658 /* Return true if the vector misalignment factor is supported by the
13659 target. */
13660 static bool
13661 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13662 const_tree type, int misalignment,
13663 bool is_packed)
13665 if (TARGET_SIMD && STRICT_ALIGNMENT)
13667 /* Return false if the movmisalign pattern is not supported for this mode. */
13668 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13669 return false;
13671 /* Misalignment factor is unknown at compile time. */
13672 if (misalignment == -1)
13673 return false;
13675 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13676 is_packed);
13679 /* If VALS is a vector constant that can be loaded into a register
13680 using DUP, generate instructions to do so and return an RTX to
13681 assign to the register. Otherwise return NULL_RTX. */
13682 static rtx
13683 aarch64_simd_dup_constant (rtx vals)
13685 machine_mode mode = GET_MODE (vals);
13686 machine_mode inner_mode = GET_MODE_INNER (mode);
13687 rtx x;
13689 if (!const_vec_duplicate_p (vals, &x))
13690 return NULL_RTX;
13692 /* We can load this constant by using DUP and a constant in a
13693 single ARM register. This will be cheaper than a vector
13694 load. */
13695 x = copy_to_mode_reg (inner_mode, x);
13696 return gen_vec_duplicate (mode, x);
13700 /* Generate code to load VALS, which is a PARALLEL containing only
13701 constants (for vec_init) or CONST_VECTOR, efficiently into a
13702 register. Returns an RTX to copy into the register, or NULL_RTX
13703 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13704 static rtx
13705 aarch64_simd_make_constant (rtx vals)
13707 machine_mode mode = GET_MODE (vals);
13708 rtx const_dup;
13709 rtx const_vec = NULL_RTX;
13710 int n_const = 0;
13711 int i;
13713 if (GET_CODE (vals) == CONST_VECTOR)
13714 const_vec = vals;
13715 else if (GET_CODE (vals) == PARALLEL)
13717 /* A CONST_VECTOR must contain only CONST_INTs and
13718 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13719 Only store valid constants in a CONST_VECTOR. */
13720 int n_elts = XVECLEN (vals, 0);
13721 for (i = 0; i < n_elts; ++i)
13723 rtx x = XVECEXP (vals, 0, i);
13724 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13725 n_const++;
13727 if (n_const == n_elts)
13728 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13730 else
13731 gcc_unreachable ();
13733 if (const_vec != NULL_RTX
13734 && aarch64_simd_valid_immediate (const_vec, NULL))
13735 /* Load using MOVI/MVNI. */
13736 return const_vec;
13737 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13738 /* Loaded using DUP. */
13739 return const_dup;
13740 else if (const_vec != NULL_RTX)
13741 /* Load from constant pool. We cannot take advantage of single-cycle
13742 LD1 because we need a PC-relative addressing mode. */
13743 return const_vec;
13744 else
13745 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13746 We cannot construct an initializer. */
13747 return NULL_RTX;
13750 /* Expand a vector initialisation sequence, such that TARGET is
13751 initialised to contain VALS. */
13753 void
13754 aarch64_expand_vector_init (rtx target, rtx vals)
13756 machine_mode mode = GET_MODE (target);
13757 scalar_mode inner_mode = GET_MODE_INNER (mode);
13758 /* The number of vector elements. */
13759 int n_elts = XVECLEN (vals, 0);
13760 /* The number of vector elements which are not constant. */
13761 int n_var = 0;
13762 rtx any_const = NULL_RTX;
13763 /* The first element of vals. */
13764 rtx v0 = XVECEXP (vals, 0, 0);
13765 bool all_same = true;
13767 /* Count the number of variable elements to initialise. */
13768 for (int i = 0; i < n_elts; ++i)
13770 rtx x = XVECEXP (vals, 0, i);
13771 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13772 ++n_var;
13773 else
13774 any_const = x;
13776 all_same &= rtx_equal_p (x, v0);
13779 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13780 how best to handle this. */
13781 if (n_var == 0)
13783 rtx constant = aarch64_simd_make_constant (vals);
13784 if (constant != NULL_RTX)
13786 emit_move_insn (target, constant);
13787 return;
13791 /* Splat a single non-constant element if we can. */
13792 if (all_same)
13794 rtx x = copy_to_mode_reg (inner_mode, v0);
13795 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13796 return;
13799 enum insn_code icode = optab_handler (vec_set_optab, mode);
13800 gcc_assert (icode != CODE_FOR_nothing);
13802 /* If there are only variable elements, try to optimize
13803 the insertion using dup for the most common element
13804 followed by insertions. */
13806 /* The algorithm will fill matches[*][0] with the earliest matching element,
13807 and matches[X][1] with the count of duplicate elements (if X is the
13808 earliest element which has duplicates). */
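/* For example (illustrative): for VALS = {x, y, x, x} the loops below
   produce matches[0] = {0, 3}, matches[1] = {1, 1}, matches[2] = {0, 0}
   and matches[3] = {0, 0}, so maxelement == 0 and maxv == 3; x is
   duplicated across the vector and only lane 1 needs a separate
   insertion.  */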
13810 if (n_var == n_elts && n_elts <= 16)
13812 int matches[16][2] = {0};
13813 for (int i = 0; i < n_elts; i++)
13815 for (int j = 0; j <= i; j++)
13817 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13819 matches[i][0] = j;
13820 matches[j][1]++;
13821 break;
13825 int maxelement = 0;
13826 int maxv = 0;
13827 for (int i = 0; i < n_elts; i++)
13828 if (matches[i][1] > maxv)
13830 maxelement = i;
13831 maxv = matches[i][1];
13834 /* Create a duplicate of the most common element, unless all elements
13835 are equally useless to us, in which case just immediately set the
13836 vector register using the first element. */
13838 if (maxv == 1)
13840 /* For vectors of two 64-bit elements, we can do even better. */
13841 if (n_elts == 2
13842 && (inner_mode == E_DImode
13843 || inner_mode == E_DFmode))
13846 rtx x0 = XVECEXP (vals, 0, 0);
13847 rtx x1 = XVECEXP (vals, 0, 1);
13848 /* Combine can pick up this case, but handling it directly
13849 here leaves clearer RTL.
13851 This is load_pair_lanes<mode>, and also gives us a clean-up
13852 for store_pair_lanes<mode>. */
13853 if (memory_operand (x0, inner_mode)
13854 && memory_operand (x1, inner_mode)
13855 && !STRICT_ALIGNMENT
13856 && rtx_equal_p (XEXP (x1, 0),
13857 plus_constant (Pmode,
13858 XEXP (x0, 0),
13859 GET_MODE_SIZE (inner_mode))))
13861 rtx t;
13862 if (inner_mode == DFmode)
13863 t = gen_load_pair_lanesdf (target, x0, x1);
13864 else
13865 t = gen_load_pair_lanesdi (target, x0, x1);
13866 emit_insn (t);
13867 return;
13870 /* The subreg-move sequence below will move into lane zero of the
13871 vector register. For big-endian we want that position to hold
13872 the last element of VALS. */
13873 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13874 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13875 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13877 else
13879 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13880 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13883 /* Insert the rest. */
13884 for (int i = 0; i < n_elts; i++)
13886 rtx x = XVECEXP (vals, 0, i);
13887 if (matches[i][0] == maxelement)
13888 continue;
13889 x = copy_to_mode_reg (inner_mode, x);
13890 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13892 return;
13895 /* Initialise a vector which is part-variable. We want to first try
13896 to build those lanes which are constant in the most efficient way we
13897 can. */
13898 if (n_var != n_elts)
13900 rtx copy = copy_rtx (vals);
13902 /* Load constant part of vector. We really don't care what goes into the
13903 parts we will overwrite, but we're more likely to be able to load the
13904 constant efficiently if it has fewer, larger, repeating parts
13905 (see aarch64_simd_valid_immediate). */
13906 for (int i = 0; i < n_elts; i++)
13908 rtx x = XVECEXP (vals, 0, i);
13909 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13910 continue;
13911 rtx subst = any_const;
13912 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13914 /* Look in the copied vector, as more elements are const. */
13915 rtx test = XVECEXP (copy, 0, i ^ bit);
13916 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13918 subst = test;
13919 break;
13922 XVECEXP (copy, 0, i) = subst;
13924 aarch64_expand_vector_init (target, copy);
13927 /* Insert the variable lanes directly. */
13928 for (int i = 0; i < n_elts; i++)
13930 rtx x = XVECEXP (vals, 0, i);
13931 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13932 continue;
13933 x = copy_to_mode_reg (inner_mode, x);
13934 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13938 static unsigned HOST_WIDE_INT
13939 aarch64_shift_truncation_mask (machine_mode mode)
13941 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13942 return 0;
13943 return GET_MODE_UNIT_BITSIZE (mode) - 1;
13946 /* Select a format to encode pointers in exception handling data. */
13948 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13950 int type;
13951 switch (aarch64_cmodel)
13953 case AARCH64_CMODEL_TINY:
13954 case AARCH64_CMODEL_TINY_PIC:
13955 case AARCH64_CMODEL_SMALL:
13956 case AARCH64_CMODEL_SMALL_PIC:
13957 case AARCH64_CMODEL_SMALL_SPIC:
13958 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13959 for everything. */
13960 type = DW_EH_PE_sdata4;
13961 break;
13962 default:
13963 /* No assumptions here. 8-byte relocs required. */
13964 type = DW_EH_PE_sdata8;
13965 break;
13967 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13970 /* The last .arch and .tune assembly strings that we printed. */
13971 static std::string aarch64_last_printed_arch_string;
13972 static std::string aarch64_last_printed_tune_string;
13974 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
13975 by the function fndecl. */
13977 void
13978 aarch64_declare_function_name (FILE *stream, const char* name,
13979 tree fndecl)
13981 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13983 struct cl_target_option *targ_options;
13984 if (target_parts)
13985 targ_options = TREE_TARGET_OPTION (target_parts);
13986 else
13987 targ_options = TREE_TARGET_OPTION (target_option_current_node);
13988 gcc_assert (targ_options);
13990 const struct processor *this_arch
13991 = aarch64_get_arch (targ_options->x_explicit_arch);
13993 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
13994 std::string extension
13995 = aarch64_get_extension_string_for_isa_flags (isa_flags,
13996 this_arch->flags);
13997 /* Only update the assembler .arch string if it is distinct from the last
13998 such string we printed. */
13999 std::string to_print = this_arch->name + extension;
14000 if (to_print != aarch64_last_printed_arch_string)
14002 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14003 aarch64_last_printed_arch_string = to_print;
14006 /* Print the cpu name we're tuning for in the comments; this might be
14007 useful to readers of the generated asm. Do it only when it changes
14008 from function to function and verbose assembly is requested. */
14009 const struct processor *this_tune
14010 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14012 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14014 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14015 this_tune->name);
14016 aarch64_last_printed_tune_string = this_tune->name;
14019 /* Don't forget the type directive for ELF. */
14020 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14021 ASM_OUTPUT_LABEL (stream, name);
14024 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14026 static void
14027 aarch64_start_file (void)
14029 struct cl_target_option *default_options
14030 = TREE_TARGET_OPTION (target_option_default_node);
14032 const struct processor *default_arch
14033 = aarch64_get_arch (default_options->x_explicit_arch);
14034 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14035 std::string extension
14036 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14037 default_arch->flags);
14039 aarch64_last_printed_arch_string = default_arch->name + extension;
14040 aarch64_last_printed_tune_string = "";
14041 asm_fprintf (asm_out_file, "\t.arch %s\n",
14042 aarch64_last_printed_arch_string.c_str ());
14044 default_file_start ();
14047 /* Emit load exclusive. */
14049 static void
14050 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14051 rtx mem, rtx model_rtx)
14053 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
14056 /* Emit store exclusive. */
14058 static void
14059 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14060 rtx rval, rtx mem, rtx model_rtx)
14062 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
14065 /* Mark the previous jump instruction as unlikely. */
14067 static void
14068 aarch64_emit_unlikely_jump (rtx insn)
14070 rtx_insn *jump = emit_jump_insn (insn);
14071 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14074 /* Expand a compare and swap pattern. */
14076 void
14077 aarch64_expand_compare_and_swap (rtx operands[])
14079 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14080 machine_mode mode, cmp_mode;
14082 bval = operands[0];
14083 rval = operands[1];
14084 mem = operands[2];
14085 oldval = operands[3];
14086 newval = operands[4];
14087 is_weak = operands[5];
14088 mod_s = operands[6];
14089 mod_f = operands[7];
14090 mode = GET_MODE (mem);
14091 cmp_mode = mode;
14093 /* Normally the succ memory model must be stronger than fail, but in the
14094 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14095 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14097 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14098 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14099 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14101 switch (mode)
14103 case E_QImode:
14104 case E_HImode:
14105 /* For short modes, we're going to perform the comparison in SImode,
14106 so do the zero-extension now. */
14107 cmp_mode = SImode;
14108 rval = gen_reg_rtx (SImode);
14109 oldval = convert_modes (SImode, mode, oldval, true);
14110 /* Fall through. */
14112 case E_SImode:
14113 case E_DImode:
14114 /* Force the value into a register if needed. */
14115 if (!aarch64_plus_operand (oldval, mode))
14116 oldval = force_reg (cmp_mode, oldval);
14117 break;
14119 default:
14120 gcc_unreachable ();
14123 if (TARGET_LSE)
14124 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
14125 newval, is_weak, mod_s,
14126 mod_f));
14127 else
14128 emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
14129 is_weak, mod_s, mod_f));
14132 if (mode == QImode || mode == HImode)
14133 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14135 x = gen_rtx_REG (CCmode, CC_REGNUM);
14136 x = gen_rtx_EQ (SImode, x, const0_rtx);
14137 emit_insn (gen_rtx_SET (bval, x));
14140 /* Test whether the target supports using an atomic load-operate instruction.
14141 CODE is the operation and AFTER is TRUE if the data in memory after the
14142 operation should be returned and FALSE if the data before the operation
14143 should be returned. Returns FALSE if the operation isn't supported by the
14144 architecture. */
14146 bool
14147 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14149 if (!TARGET_LSE)
14150 return false;
14152 switch (code)
14154 case SET:
14155 case AND:
14156 case IOR:
14157 case XOR:
14158 case MINUS:
14159 case PLUS:
14160 return true;
14161 default:
14162 return false;
14166 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14167 sequence implementing an atomic operation. */
14169 static void
14170 aarch64_emit_post_barrier (enum memmodel model)
14172 const enum memmodel base_model = memmodel_base (model);
14174 if (is_mm_sync (model)
14175 && (base_model == MEMMODEL_ACQUIRE
14176 || base_model == MEMMODEL_ACQ_REL
14177 || base_model == MEMMODEL_SEQ_CST))
14179 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14183 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14184 for the data in memory. EXPECTED is the value expected to be in memory.
14185 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14186 is the memory ordering to use. */
14188 void
14189 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14190 rtx expected, rtx desired,
14191 rtx model)
14193 machine_mode mode;
14195 mode = GET_MODE (mem);
14197 /* Move the expected value into the CAS destination register. */
14198 emit_insn (gen_rtx_SET (rval, expected));
14200 /* Emit the CAS. */
14201 emit_insn (gen_aarch64_atomic_cas (mode, rval, mem, desired, model));
14203 /* Compare the expected value with the value loaded by the CAS, to establish
14204 whether the swap was made. */
14205 aarch64_gen_compare_reg (EQ, rval, expected);
14208 /* Split a compare and swap pattern. */
14210 void
14211 aarch64_split_compare_and_swap (rtx operands[])
14213 rtx rval, mem, oldval, newval, scratch;
14214 machine_mode mode;
14215 bool is_weak;
14216 rtx_code_label *label1, *label2;
14217 rtx x, cond;
14218 enum memmodel model;
14219 rtx model_rtx;
14221 rval = operands[0];
14222 mem = operands[1];
14223 oldval = operands[2];
14224 newval = operands[3];
14225 is_weak = (operands[4] != const0_rtx);
14226 model_rtx = operands[5];
14227 scratch = operands[7];
14228 mode = GET_MODE (mem);
14229 model = memmodel_from_int (INTVAL (model_rtx));
14231 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14232 loop:
14233 .label1:
14234 LD[A]XR rval, [mem]
14235 CBNZ rval, .label2
14236 ST[L]XR scratch, newval, [mem]
14237 CBNZ scratch, .label1
14238 .label2:
14239 CMP rval, 0. */
14240 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14242 label1 = NULL;
14243 if (!is_weak)
14245 label1 = gen_label_rtx ();
14246 emit_label (label1);
14248 label2 = gen_label_rtx ();
14250 /* The initial load can be relaxed for a __sync operation since a final
14251 barrier will be emitted to stop code hoisting. */
14252 if (is_mm_sync (model))
14253 aarch64_emit_load_exclusive (mode, rval, mem,
14254 GEN_INT (MEMMODEL_RELAXED));
14255 else
14256 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14258 if (strong_zero_p)
14260 if (aarch64_track_speculation)
14262 /* Emit an explicit compare instruction, so that we can correctly
14263 track the condition codes. */
14264 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
14265 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14267 else
14268 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14270 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14271 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14272 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14274 else
14276 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14277 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14278 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14279 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14280 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14283 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14285 if (!is_weak)
14287 if (aarch64_track_speculation)
14289 /* Emit an explicit compare instruction, so that we can correctly
14290 track the condition codes. */
14291 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
14292 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14294 else
14295 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14297 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14298 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14299 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14301 else
14303 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14304 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14305 emit_insn (gen_rtx_SET (cond, x));
14308 emit_label (label2);
14309 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
14310 to set the condition flags. If this is not used it will be removed by
14311 later passes. */
14312 if (strong_zero_p)
14314 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14315 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14316 emit_insn (gen_rtx_SET (cond, x));
14318 /* Emit any final barrier needed for a __sync operation. */
14319 if (is_mm_sync (model))
14320 aarch64_emit_post_barrier (model);
14323 /* Emit a BIC instruction. */
14325 static void
14326 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14328 rtx shift_rtx = GEN_INT (shift);
14329 rtx (*gen) (rtx, rtx, rtx, rtx);
14331 switch (mode)
14333 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14334 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14335 default:
14336 gcc_unreachable ();
14339 emit_insn (gen (dst, s2, shift_rtx, s1));
14342 /* Emit an atomic swap. */
14344 static void
14345 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14346 rtx mem, rtx model)
14348 emit_insn (gen_aarch64_atomic_swp (mode, dst, mem, value, model));
14351 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14352 location to store the data read from memory. OUT_RESULT is the location to
14353 store the result of the operation. MEM is the memory location to read and
14354 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14355 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14356 be NULL. */
14358 void
14359 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14360 rtx mem, rtx value, rtx model_rtx)
14362 machine_mode mode = GET_MODE (mem);
14363 machine_mode wmode = (mode == DImode ? DImode : SImode);
14364 const bool short_mode = (mode < SImode);
14365 int ldop_code;
14366 rtx src;
14367 rtx x;
14369 if (out_data)
14370 out_data = gen_lowpart (mode, out_data);
14372 if (out_result)
14373 out_result = gen_lowpart (mode, out_result);
14375 /* Make sure the value is in a register, putting it into a destination
14376 register if it needs to be manipulated. */
14377 if (!register_operand (value, mode)
14378 || code == AND || code == MINUS)
14380 src = out_result ? out_result : out_data;
14381 emit_move_insn (src, gen_lowpart (mode, value));
14383 else
14384 src = value;
14385 gcc_assert (register_operand (src, mode));
14387 /* Preprocess the data for the operation as necessary. If the operation is
14388 a SET then emit a swap instruction and finish. */
14389 switch (code)
14391 case SET:
14392 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14393 return;
14395 case MINUS:
14396 /* Negate the value and treat it as a PLUS. */
14398 rtx neg_src;
14400 /* Resize the value if necessary. */
14401 if (short_mode)
14402 src = gen_lowpart (wmode, src);
14404 neg_src = gen_rtx_NEG (wmode, src);
14405 emit_insn (gen_rtx_SET (src, neg_src));
14407 if (short_mode)
14408 src = gen_lowpart (mode, src);
14410 /* Fall-through. */
14411 case PLUS:
14412 ldop_code = UNSPECV_ATOMIC_LDOP_PLUS;
14413 break;
14415 case IOR:
14416 ldop_code = UNSPECV_ATOMIC_LDOP_OR;
14417 break;
14419 case XOR:
14420 ldop_code = UNSPECV_ATOMIC_LDOP_XOR;
14421 break;
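/* Note on the AND case below: the value is complemented first and the
   bit-clear (BIC-style) form of the atomic is used, since clearing the
   bits of ~VALUE in memory is equivalent to ANDing memory with VALUE.  */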
14423 case AND:
14425 rtx not_src;
14427 /* Resize the value if necessary. */
14428 if (short_mode)
14429 src = gen_lowpart (wmode, src);
14431 not_src = gen_rtx_NOT (wmode, src);
14432 emit_insn (gen_rtx_SET (src, not_src));
14434 if (short_mode)
14435 src = gen_lowpart (mode, src);
14437 ldop_code = UNSPECV_ATOMIC_LDOP_BIC;
14438 break;
14440 default:
14441 /* The operation can't be done with atomic instructions. */
14442 gcc_unreachable ();
14445 emit_insn (gen_aarch64_atomic_load (ldop_code, mode,
14446 out_data, mem, src, model_rtx));
14448 /* If necessary, calculate the data in memory after the update by redoing the
14449 operation from values in registers. */
14450 if (!out_result)
14451 return;
14453 if (short_mode)
14455 src = gen_lowpart (wmode, src);
14456 out_data = gen_lowpart (wmode, out_data);
14457 out_result = gen_lowpart (wmode, out_result);
14460 x = NULL_RTX;
14462 switch (code)
14464 case MINUS:
14465 case PLUS:
14466 x = gen_rtx_PLUS (wmode, out_data, src);
14467 break;
14468 case IOR:
14469 x = gen_rtx_IOR (wmode, out_data, src);
14470 break;
14471 case XOR:
14472 x = gen_rtx_XOR (wmode, out_data, src);
14473 break;
14474 case AND:
14475 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14476 return;
14477 default:
14478 gcc_unreachable ();
14481 emit_set_insn (out_result, x);
14483 return;
14486 /* Split an atomic operation. */
14488 void
14489 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14490 rtx value, rtx model_rtx, rtx cond)
14492 machine_mode mode = GET_MODE (mem);
14493 machine_mode wmode = (mode == DImode ? DImode : SImode);
14494 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14495 const bool is_sync = is_mm_sync (model);
14496 rtx_code_label *label;
14497 rtx x;
14499 /* Split the atomic operation into a sequence. */
14500 label = gen_label_rtx ();
14501 emit_label (label);
14503 if (new_out)
14504 new_out = gen_lowpart (wmode, new_out);
14505 if (old_out)
14506 old_out = gen_lowpart (wmode, old_out);
14507 else
14508 old_out = new_out;
14509 value = simplify_gen_subreg (wmode, value, mode, 0);
14511 /* The initial load can be relaxed for a __sync operation since a final
14512 barrier will be emitted to stop code hoisting. */
14513 if (is_sync)
14514 aarch64_emit_load_exclusive (mode, old_out, mem,
14515 GEN_INT (MEMMODEL_RELAXED));
14516 else
14517 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14519 switch (code)
14521 case SET:
14522 new_out = value;
14523 break;
14525 case NOT:
14526 x = gen_rtx_AND (wmode, old_out, value);
14527 emit_insn (gen_rtx_SET (new_out, x));
14528 x = gen_rtx_NOT (wmode, new_out);
14529 emit_insn (gen_rtx_SET (new_out, x));
14530 break;
14532 case MINUS:
14533 if (CONST_INT_P (value))
14535 value = GEN_INT (-INTVAL (value));
14536 code = PLUS;
14538 /* Fall through. */
14540 default:
14541 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14542 emit_insn (gen_rtx_SET (new_out, x));
14543 break;
14546 aarch64_emit_store_exclusive (mode, cond, mem,
14547 gen_lowpart (mode, new_out), model_rtx);
14549 if (aarch64_track_speculation)
14551 /* Emit an explicit compare instruction, so that we can correctly
14552 track the condition codes. */
14553 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
14554 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14556 else
14557 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14559 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14560 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14561 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14563 /* Emit any final barrier needed for a __sync operation. */
14564 if (is_sync)
14565 aarch64_emit_post_barrier (model);
14568 static void
14569 aarch64_init_libfuncs (void)
14571 /* Half-precision float operations. The compiler handles all operations
14572 with NULL libfuncs by converting to SFmode. */
14574 /* Conversions. */
14575 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14576 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14578 /* Arithmetic. */
14579 set_optab_libfunc (add_optab, HFmode, NULL);
14580 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14581 set_optab_libfunc (smul_optab, HFmode, NULL);
14582 set_optab_libfunc (neg_optab, HFmode, NULL);
14583 set_optab_libfunc (sub_optab, HFmode, NULL);
14585 /* Comparisons. */
14586 set_optab_libfunc (eq_optab, HFmode, NULL);
14587 set_optab_libfunc (ne_optab, HFmode, NULL);
14588 set_optab_libfunc (lt_optab, HFmode, NULL);
14589 set_optab_libfunc (le_optab, HFmode, NULL);
14590 set_optab_libfunc (ge_optab, HFmode, NULL);
14591 set_optab_libfunc (gt_optab, HFmode, NULL);
14592 set_optab_libfunc (unord_optab, HFmode, NULL);
14595 /* Target hook for c_mode_for_suffix. */
14596 static machine_mode
14597 aarch64_c_mode_for_suffix (char suffix)
14599 if (suffix == 'q')
14600 return TFmode;
14602 return VOIDmode;
14605 /* We can only represent floating point constants which will fit in
14606 "quarter-precision" values. These values are characterised by
14607 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14610 (-1)^s * (n/16) * 2^r
14612 Where:
14613 's' is the sign bit.
14614 'n' is an integer in the range 16 <= n <= 31.
14615 'r' is an integer in the range -3 <= r <= 4. */
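/* Worked examples (for illustration): 0.25 == (16/16) * 2^-2 with s == 0,
   and 31.0 == (31/16) * 2^4 is the largest representable magnitude; the
   smallest non-zero magnitude is (16/16) * 2^-3 == 0.125.  */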
14617 /* Return true iff X can be represented by a quarter-precision
14618 floating point immediate operand. Note, we cannot represent 0.0. */
14619 bool
14620 aarch64_float_const_representable_p (rtx x)
14622 /* This represents our current view of how many bits
14623 make up the mantissa. */
14624 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14625 int exponent;
14626 unsigned HOST_WIDE_INT mantissa, mask;
14627 REAL_VALUE_TYPE r, m;
14628 bool fail;
14630 if (!CONST_DOUBLE_P (x))
14631 return false;
14633 if (GET_MODE (x) == VOIDmode
14634 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
14635 return false;
14637 r = *CONST_DOUBLE_REAL_VALUE (x);
14639 /* We cannot represent infinities, NaNs or +/-zero. We won't
14640 know if we have +zero until we analyse the mantissa, but we
14641 can reject the other invalid values. */
14642 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14643 || REAL_VALUE_MINUS_ZERO (r))
14644 return false;
14646 /* Extract exponent. */
14647 r = real_value_abs (&r);
14648 exponent = REAL_EXP (&r);
14650 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14651 highest (sign) bit, with a fixed binary point at bit point_pos.
14652 the low and high halves of W hold the low and high parts of the mantissa.
14653 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14654 bits for the mantissa, this can fail (low bits will be lost). */
14655 real_ldexp (&m, &r, point_pos - exponent);
14656 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14658 /* If the low part of the mantissa has bits set we cannot represent
14659 the value. */
14660 if (w.ulow () != 0)
14661 return false;
14662 /* We have rejected the lower HOST_WIDE_INT, so update our
14663 understanding of how many bits lie in the mantissa and
14664 look only at the high HOST_WIDE_INT. */
14665 mantissa = w.elt (1);
14666 point_pos -= HOST_BITS_PER_WIDE_INT;
14668 /* We can only represent values with a mantissa of the form 1.xxxx. */
14669 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14670 if ((mantissa & mask) != 0)
14671 return false;
14673 /* Having filtered unrepresentable values, we may now remove all
14674 but the highest 5 bits. */
14675 mantissa >>= point_pos - 5;
14677 /* We cannot represent the value 0.0, so reject it. This is handled
14678 elsewhere. */
14679 if (mantissa == 0)
14680 return false;
14682 /* Then, as bit 4 is always set, we can mask it off, leaving
14683 the mantissa in the range [0, 15]. */
14684 mantissa &= ~(1 << 4);
14685 gcc_assert (mantissa <= 15);
14687 /* GCC internally does not use IEEE754-like encoding (where normalized
14688 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14689 Our mantissa values are shifted 4 places to the left relative to
14690 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14691 by 5 places to correct for GCC's representation. */
14692 exponent = 5 - exponent;
14694 return (exponent >= 0 && exponent <= 7);
14697 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14698 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14699 output MOVI/MVNI, ORR or BIC immediate. */
14700 char*
14701 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14702 enum simd_immediate_check which)
14704 bool is_valid;
14705 static char templ[40];
14706 const char *mnemonic;
14707 const char *shift_op;
14708 unsigned int lane_count = 0;
14709 char element_char;
14711 struct simd_immediate_info info;
14713 /* This will return true to show const_vector is legal for use as either
14714 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14715 It will also update INFO to show how the immediate should be generated.
14716 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14717 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14718 gcc_assert (is_valid);
14720 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14721 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14723 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14725 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14726 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14727 move immediate path. */
14728 if (aarch64_float_const_zero_rtx_p (info.value))
14729 info.value = GEN_INT (0);
14730 else
14732 const unsigned int buf_size = 20;
14733 char float_buf[buf_size] = {'\0'};
14734 real_to_decimal_for_mode (float_buf,
14735 CONST_DOUBLE_REAL_VALUE (info.value),
14736 buf_size, buf_size, 1, info.elt_mode);
14738 if (lane_count == 1)
14739 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14740 else
14741 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14742 lane_count, element_char, float_buf);
14743 return templ;
14747 gcc_assert (CONST_INT_P (info.value));
14749 if (which == AARCH64_CHECK_MOV)
14751 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14752 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14753 if (lane_count == 1)
14754 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14755 mnemonic, UINTVAL (info.value));
14756 else if (info.shift)
14757 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14758 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14759 element_char, UINTVAL (info.value), shift_op, info.shift);
14760 else
14761 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14762 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14763 element_char, UINTVAL (info.value));
14765 else
14767 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14768 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14769 if (info.shift)
14770 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14771 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14772 element_char, UINTVAL (info.value), "lsl", info.shift);
14773 else
14774 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14775 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14776 element_char, UINTVAL (info.value));
14778 return templ;
14781 char*
14782 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14785 /* If a floating point number was passed and we desire to use it in an
14786 integer mode, do the conversion to integer. */
14787 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14789 unsigned HOST_WIDE_INT ival;
14790 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14791 gcc_unreachable ();
14792 immediate = gen_int_mode (ival, mode);
14795 machine_mode vmode;
14796 /* Use a 64-bit mode for everything except DI/DF mode, where we use
14797 a 128-bit vector mode. */
14798 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14800 vmode = aarch64_simd_container_mode (mode, width);
14801 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14802 return aarch64_output_simd_mov_immediate (v_op, width);
14805 /* Return the output string to use for moving immediate CONST_VECTOR
14806 into an SVE register. */
14808 char *
14809 aarch64_output_sve_mov_immediate (rtx const_vector)
14811 static char templ[40];
14812 struct simd_immediate_info info;
14813 char element_char;
14815 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14816 gcc_assert (is_valid);
14818 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14820 if (info.step)
14822 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14823 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14824 element_char, INTVAL (info.value), INTVAL (info.step));
14825 return templ;
14828 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14830 if (aarch64_float_const_zero_rtx_p (info.value))
14831 info.value = GEN_INT (0);
14832 else
14834 const int buf_size = 20;
14835 char float_buf[buf_size] = {};
14836 real_to_decimal_for_mode (float_buf,
14837 CONST_DOUBLE_REAL_VALUE (info.value),
14838 buf_size, buf_size, 1, info.elt_mode);
14840 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14841 element_char, float_buf);
14842 return templ;
14846 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14847 element_char, INTVAL (info.value));
14848 return templ;
14851 /* Return the asm format for a PTRUE instruction whose destination has
14852 mode MODE. SUFFIX is the element size suffix. */
14854 char *
14855 aarch64_output_ptrue (machine_mode mode, char suffix)
14857 unsigned int nunits;
14858 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14859 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
14860 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
14861 else
14862 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
14863 return buf;
14866 /* Split operands into moves from op[1] + op[2] into op[0]. */
14868 void
14869 aarch64_split_combinev16qi (rtx operands[3])
14871 unsigned int dest = REGNO (operands[0]);
14872 unsigned int src1 = REGNO (operands[1]);
14873 unsigned int src2 = REGNO (operands[2]);
14874 machine_mode halfmode = GET_MODE (operands[1]);
14875 unsigned int halfregs = REG_NREGS (operands[1]);
14876 rtx destlo, desthi;
14878 gcc_assert (halfmode == V16QImode);
14880 if (src1 == dest && src2 == dest + halfregs)
14882 /* No-op move. Can't split to nothing; emit something. */
14883 emit_note (NOTE_INSN_DELETED);
14884 return;
14887 /* Preserve register attributes for variable tracking. */
14888 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
14889 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
14890 GET_MODE_SIZE (halfmode));
14892 /* Special case of reversed high/low parts. */
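/* The three XORs below are the classic in-place register swap: after them
   operands[1] and operands[2] have exchanged contents without needing a
   scratch register.  */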
14893 if (reg_overlap_mentioned_p (operands[2], destlo)
14894 && reg_overlap_mentioned_p (operands[1], desthi))
14896 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14897 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
14898 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14900 else if (!reg_overlap_mentioned_p (operands[2], destlo))
14902 /* Try to avoid unnecessary moves if part of the result
14903 is in the right place already. */
14904 if (src1 != dest)
14905 emit_move_insn (destlo, operands[1]);
14906 if (src2 != dest + halfregs)
14907 emit_move_insn (desthi, operands[2]);
14909 else
14911 if (src2 != dest + halfregs)
14912 emit_move_insn (desthi, operands[2]);
14913 if (src1 != dest)
14914 emit_move_insn (destlo, operands[1]);
14918 /* vec_perm support. */
14920 struct expand_vec_perm_d
14922 rtx target, op0, op1;
14923 vec_perm_indices perm;
14924 machine_mode vmode;
14925 unsigned int vec_flags;
14926 bool one_vector_p;
14927 bool testing_p;
14930 /* Generate a variable permutation. */
14932 static void
14933 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
14935 machine_mode vmode = GET_MODE (target);
14936 bool one_vector_p = rtx_equal_p (op0, op1);
14938 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
14939 gcc_checking_assert (GET_MODE (op0) == vmode);
14940 gcc_checking_assert (GET_MODE (op1) == vmode);
14941 gcc_checking_assert (GET_MODE (sel) == vmode);
14942 gcc_checking_assert (TARGET_SIMD);
14944 if (one_vector_p)
14946 if (vmode == V8QImode)
14948 /* Expand the argument to a V16QI mode by duplicating it. */
14949 rtx pair = gen_reg_rtx (V16QImode);
14950 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
14951 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14953 else
14955 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
14958 else
14960 rtx pair;
14962 if (vmode == V8QImode)
14964 pair = gen_reg_rtx (V16QImode);
14965 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
14966 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14968 else
14970 pair = gen_reg_rtx (OImode);
14971 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
14972 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
14977 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
14978 NELT is the number of elements in the vector. */
14980 void
14981 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
14982 unsigned int nelt)
14984 machine_mode vmode = GET_MODE (target);
14985 bool one_vector_p = rtx_equal_p (op0, op1);
14986 rtx mask;
14988 /* The TBL instruction does not use a modulo index, so we must take care
14989 of that ourselves. */
14990 mask = aarch64_simd_gen_const_vector_dup (vmode,
14991 one_vector_p ? nelt - 1 : 2 * nelt - 1);
14992 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
14994 /* For big-endian, we also need to reverse the index within the vector
14995 (but not which vector). */
14996 if (BYTES_BIG_ENDIAN)
14998 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
14999 if (!one_vector_p)
15000 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15001 sel = expand_simple_binop (vmode, XOR, sel, mask,
15002 NULL, 0, OPTAB_LIB_WIDEN);
15004 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15007 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15009 static void
15010 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15012 emit_insn (gen_rtx_SET (target,
15013 gen_rtx_UNSPEC (GET_MODE (target),
15014 gen_rtvec (2, op0, op1), code)));
15017 /* Expand an SVE vec_perm with the given operands. */
15019 void
15020 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15022 machine_mode data_mode = GET_MODE (target);
15023 machine_mode sel_mode = GET_MODE (sel);
15024 /* Enforced by the pattern condition. */
15025 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15027 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15028 size of the two value vectors, i.e. the upper bits of the indices
15029 are effectively ignored. SVE TBL instead produces 0 for any
15030 out-of-range indices, so we need to modulo all the vec_perm indices
15031 to ensure they are all in range. */
15032 rtx sel_reg = force_reg (sel_mode, sel);
15034 /* Check if the sel only references the first values vector. */
15035 if (GET_CODE (sel) == CONST_VECTOR
15036 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15038 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15039 return;
15042 /* Check if the two values vectors are the same. */
15043 if (rtx_equal_p (op0, op1))
15045 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15046 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15047 NULL, 0, OPTAB_DIRECT);
15048 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15049 return;
15052 /* Run TBL on each value vector and combine the results. */
15054 rtx res0 = gen_reg_rtx (data_mode);
15055 rtx res1 = gen_reg_rtx (data_mode);
15056 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15057 if (GET_CODE (sel) != CONST_VECTOR
15058 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15060 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15061 2 * nunits - 1);
15062 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15063 NULL, 0, OPTAB_DIRECT);
15065 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15066 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15067 NULL, 0, OPTAB_DIRECT);
15068 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15069 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15070 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15071 else
15072 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15075 /* Recognize patterns suitable for the TRN instructions. */
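/* For illustration, assuming the usual TRN element numbering: on a V4SI
   pair {a0 a1 a2 a3} / {b0 b1 b2 b3}, TRN1 corresponds to the selector
   {0, 4, 2, 6} (giving {a0 b0 a2 b2}) and TRN2 to {1, 5, 3, 7}
   (giving {a1 b1 a3 b3}), which is the series checked for below.  */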
15076 static bool
15077 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15079 HOST_WIDE_INT odd;
15080 poly_uint64 nelt = d->perm.length ();
15081 rtx out, in0, in1, x;
15082 machine_mode vmode = d->vmode;
15084 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15085 return false;
15087 /* Note that these are little-endian tests.
15088 We correct for big-endian later. */
15089 if (!d->perm[0].is_constant (&odd)
15090 || (odd != 0 && odd != 1)
15091 || !d->perm.series_p (0, 2, odd, 2)
15092 || !d->perm.series_p (1, 2, nelt + odd, 2))
15093 return false;
15095 /* Success! */
15096 if (d->testing_p)
15097 return true;
15099 in0 = d->op0;
15100 in1 = d->op1;
15101 /* We don't need a big-endian lane correction for SVE; see the comment
15102 at the head of aarch64-sve.md for details. */
15103 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15105 x = in0, in0 = in1, in1 = x;
15106 odd = !odd;
15108 out = d->target;
15110 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15111 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15112 return true;
15115 /* Recognize patterns suitable for the UZP instructions. */
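/* For illustration: on a V4SI pair, UZP1 corresponds to the selector
   {0, 2, 4, 6} (the even-numbered elements of the concatenation) and
   UZP2 to {1, 3, 5, 7}, i.e. a series starting at ODD with step 2.  */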
15116 static bool
15117 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15119 HOST_WIDE_INT odd;
15120 rtx out, in0, in1, x;
15121 machine_mode vmode = d->vmode;
15123 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15124 return false;
15126 /* Note that these are little-endian tests.
15127 We correct for big-endian later. */
15128 if (!d->perm[0].is_constant (&odd)
15129 || (odd != 0 && odd != 1)
15130 || !d->perm.series_p (0, 1, odd, 2))
15131 return false;
15133 /* Success! */
15134 if (d->testing_p)
15135 return true;
15137 in0 = d->op0;
15138 in1 = d->op1;
15139 /* We don't need a big-endian lane correction for SVE; see the comment
15140 at the head of aarch64-sve.md for details. */
15141 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15143 x = in0, in0 = in1, in1 = x;
15144 odd = !odd;
15146 out = d->target;
15148 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15149 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15150 return true;
15153 /* Recognize patterns suitable for the ZIP instructions. */
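/* For illustration: on a V4SI pair, ZIP1 interleaves the low halves
   (selector {0, 4, 1, 5}) and ZIP2 the high halves (selector {2, 6, 3, 7});
   FIRST below is therefore either 0 or nelt / 2.  */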
15154 static bool
15155 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15157 unsigned int high;
15158 poly_uint64 nelt = d->perm.length ();
15159 rtx out, in0, in1, x;
15160 machine_mode vmode = d->vmode;
15162 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15163 return false;
15165 /* Note that these are little-endian tests.
15166 We correct for big-endian later. */
15167 poly_uint64 first = d->perm[0];
15168 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15169 || !d->perm.series_p (0, 2, first, 1)
15170 || !d->perm.series_p (1, 2, first + nelt, 1))
15171 return false;
15172 high = maybe_ne (first, 0U);
15174 /* Success! */
15175 if (d->testing_p)
15176 return true;
15178 in0 = d->op0;
15179 in1 = d->op1;
15180 /* We don't need a big-endian lane correction for SVE; see the comment
15181 at the head of aarch64-sve.md for details. */
15182 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15184 x = in0, in0 = in1, in1 = x;
15185 high = !high;
15187 out = d->target;
15189 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15190 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15191 return true;
15194 /* Recognize patterns for the EXT insn. */
15196 static bool
15197 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15199 HOST_WIDE_INT location;
15200 rtx offset;
15202 /* The first element always refers to the first vector.
15203 Check if the extracted indices are increasing by one. */
15204 if (d->vec_flags == VEC_SVE_PRED
15205 || !d->perm[0].is_constant (&location)
15206 || !d->perm.series_p (0, 1, location, 1))
15207 return false;
15209 /* Success! */
15210 if (d->testing_p)
15211 return true;
15213 /* The case where (location == 0) is a no-op for both big- and little-endian,
15214 and is removed by the mid-end at optimization levels -O1 and higher.
15216 We don't need a big-endian lane correction for SVE; see the comment
15217 at the head of aarch64-sve.md for details. */
15218 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15220 /* After setup, we want the high elements of the first vector (stored
15221 at the LSB end of the register), and the low elements of the second
15222 vector (stored at the MSB end of the register). So swap. */
15223 std::swap (d->op0, d->op1);
15224 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15225 to_constant () is safe since this is restricted to Advanced SIMD
15226 vectors. */
15227 location = d->perm.length ().to_constant () - location;
15230 offset = GEN_INT (location);
15231 emit_set_insn (d->target,
15232 gen_rtx_UNSPEC (d->vmode,
15233 gen_rtvec (3, d->op0, d->op1, offset),
15234 UNSPEC_EXT));
15235 return true;
15238 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15239 within each 64-bit, 32-bit or 16-bit granule. */
15241 static bool
15242 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15244 HOST_WIDE_INT diff;
15245 unsigned int i, size, unspec;
15246 machine_mode pred_mode;
15248 if (d->vec_flags == VEC_SVE_PRED
15249 || !d->one_vector_p
15250 || !d->perm[0].is_constant (&diff))
15251 return false;
15253 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15254 if (size == 8)
15256 unspec = UNSPEC_REV64;
15257 pred_mode = VNx2BImode;
15259 else if (size == 4)
15261 unspec = UNSPEC_REV32;
15262 pred_mode = VNx4BImode;
15264 else if (size == 2)
15266 unspec = UNSPEC_REV16;
15267 pred_mode = VNx8BImode;
15269 else
15270 return false;
15272 unsigned int step = diff + 1;
15273 for (i = 0; i < step; ++i)
15274 if (!d->perm.series_p (i, step, diff - i, step))
15275 return false;
15277 /* Success! */
15278 if (d->testing_p)
15279 return true;
15281 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15282 if (d->vec_flags == VEC_SVE_DATA)
15284 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15285 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15286 UNSPEC_MERGE_PTRUE);
15288 emit_set_insn (d->target, src);
15289 return true;
15292 /* Recognize patterns for the REV insn, which reverses elements within
15293 a full vector. */
15295 static bool
15296 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15298 poly_uint64 nelt = d->perm.length ();
15300 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15301 return false;
15303 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15304 return false;
15306 /* Success! */
15307 if (d->testing_p)
15308 return true;
15310 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15311 emit_set_insn (d->target, src);
15312 return true;
15315 static bool
15316 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15318 rtx out = d->target;
15319 rtx in0;
15320 HOST_WIDE_INT elt;
15321 machine_mode vmode = d->vmode;
15322 rtx lane;
15324 if (d->vec_flags == VEC_SVE_PRED
15325 || d->perm.encoding ().encoded_nelts () != 1
15326 || !d->perm[0].is_constant (&elt))
15327 return false;
15329 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15330 return false;
15332 /* Success! */
15333 if (d->testing_p)
15334 return true;
15336 /* The generic preparation in aarch64_expand_vec_perm_const_1
15337 swaps the operand order and the permute indices if it finds
15338 d->perm[0] to be in the second operand. Thus, we can always
15339 use d->op0 and need not do any extra arithmetic to get the
15340 correct lane number. */
15341 in0 = d->op0;
15342 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15344 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15345 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15346 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15347 return true;
15350 static bool
15351 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15353 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15354 machine_mode vmode = d->vmode;
15356 /* Make sure that the indices are constant. */
15357 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15358 for (unsigned int i = 0; i < encoded_nelts; ++i)
15359 if (!d->perm[i].is_constant ())
15360 return false;
15362 if (d->testing_p)
15363 return true;
15365 /* Generic code will try constant permutation twice. Once with the
15366 original mode and again with the elements lowered to QImode.
15367 So wait and don't do the selector expansion ourselves. */
15368 if (vmode != V8QImode && vmode != V16QImode)
15369 return false;
15371 /* to_constant is safe since this routine is specific to Advanced SIMD
15372 vectors. */
15373 unsigned int nelt = d->perm.length ().to_constant ();
15374 for (unsigned int i = 0; i < nelt; ++i)
15375 /* If big-endian and two vectors we end up with a weird mixed-endian
15376 mode on NEON. Reverse the index within each word but not the word
15377 itself. to_constant is safe because we checked is_constant above. */
15378 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15379 ? d->perm[i].to_constant () ^ (nelt - 1)
15380 : d->perm[i].to_constant ());
15382 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15383 sel = force_reg (vmode, sel);
15385 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15386 return true;
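/* Illustrative sketch only (hypothetical helper): the big-endian index
   correction used in aarch64_evpc_tbl above.  Since nelt is a power of
   two, XORing with nelt - 1 mirrors an index within its input vector
   while leaving the choice of input unchanged; e.g. for V16QI
   (nelt == 16) index 0 becomes 15 and index 16 becomes 31.  */
static unsigned int
aarch64_sketch_tbl_index (unsigned int idx, unsigned int nelt,
                          int big_endian_p)
{
  /* Mirror within each 16-byte register half of the TBL selector.  */
  return big_endian_p ? idx ^ (nelt - 1) : idx;
}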
15389 /* Try to implement D using an SVE TBL instruction. */
15391 static bool
15392 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15394 unsigned HOST_WIDE_INT nelt;
15396 /* Permuting two variable-length vectors could overflow the
15397 index range. */
15398 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15399 return false;
15401 if (d->testing_p)
15402 return true;
15404 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15405 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15406 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15407 return true;
15410 static bool
15411 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15413 /* The pattern matching functions above are written to look for a small
15414 number to begin the sequence (0, 1, N/2). If we begin with an index
15415 from the second operand, we can swap the operands. */
15416 poly_int64 nelt = d->perm.length ();
15417 if (known_ge (d->perm[0], nelt))
15419 d->perm.rotate_inputs (1);
15420 std::swap (d->op0, d->op1);
15423 if ((d->vec_flags == VEC_ADVSIMD
15424 || d->vec_flags == VEC_SVE_DATA
15425 || d->vec_flags == VEC_SVE_PRED)
15426 && known_gt (nelt, 1))
15428 if (aarch64_evpc_rev_local (d))
15429 return true;
15430 else if (aarch64_evpc_rev_global (d))
15431 return true;
15432 else if (aarch64_evpc_ext (d))
15433 return true;
15434 else if (aarch64_evpc_dup (d))
15435 return true;
15436 else if (aarch64_evpc_zip (d))
15437 return true;
15438 else if (aarch64_evpc_uzp (d))
15439 return true;
15440 else if (aarch64_evpc_trn (d))
15441 return true;
15442 if (d->vec_flags == VEC_SVE_DATA)
15443 return aarch64_evpc_sve_tbl (d);
15444 else if (d->vec_flags == VEC_ADVSIMD)
15445 return aarch64_evpc_tbl (d);
15447 return false;
15450 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15452 static bool
15453 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15454 rtx op1, const vec_perm_indices &sel)
15456 struct expand_vec_perm_d d;
15458 /* Check whether the mask can be applied to a single vector. */
15459 if (op0 && rtx_equal_p (op0, op1))
15460 d.one_vector_p = true;
15461 else if (sel.all_from_input_p (0))
15463 d.one_vector_p = true;
15464 op1 = op0;
15466 else if (sel.all_from_input_p (1))
15468 d.one_vector_p = true;
15469 op0 = op1;
15471 else
15472 d.one_vector_p = false;
15474 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15475 sel.nelts_per_input ());
15476 d.vmode = vmode;
15477 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15478 d.target = target;
15479 d.op0 = op0;
15480 d.op1 = op1;
15481 d.testing_p = !target;
15483 if (!d.testing_p)
15484 return aarch64_expand_vec_perm_const_1 (&d);
15486 rtx_insn *last = get_last_insn ();
15487 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15488 gcc_assert (last == get_last_insn ());
15490 return ret;
15493 /* Generate a byte permute mask for a register of mode MODE,
15494 which has NUNITS units. */
15497 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15499 /* We have to reverse each vector because we don't have
15500 a permuted load that can reverse-load according to ABI rules. */
15501 rtx mask;
15502 rtvec v = rtvec_alloc (16);
15503 unsigned int i, j;
15504 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15506 gcc_assert (BYTES_BIG_ENDIAN);
15507 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15509 for (i = 0; i < nunits; i++)
15510 for (j = 0; j < usize; j++)
15511 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15512 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15513 return force_reg (V16QImode, mask);
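/* Illustrative sketch only (hypothetical helper): the byte pattern that
   aarch64_reverse_mask above builds.  For V8HImode (nunits == 8, unit
   size 2) the mask bytes come out as 1, 0, 3, 2, 5, 4, ..., i.e. the
   bytes within each element are mirrored.  */
static void
aarch64_sketch_reverse_mask_bytes (unsigned int nunits, unsigned int usize,
                                   unsigned char *out)
{
  for (unsigned int i = 0; i < nunits; i++)
    for (unsigned int j = 0; j < usize; j++)
      /* Byte j of element i selects the mirrored byte of that element.  */
      out[i * usize + j] = (i + 1) * usize - 1 - j;
}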
15516 /* Return true if X is a valid second operand for the SVE instruction
15517 that implements integer comparison OP_CODE. */
15519 static bool
15520 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15522 if (register_operand (x, VOIDmode))
15523 return true;
15525 switch (op_code)
15527 case LTU:
15528 case LEU:
15529 case GEU:
15530 case GTU:
15531 return aarch64_sve_cmp_immediate_p (x, false);
15532 case LT:
15533 case LE:
15534 case GE:
15535 case GT:
15536 case NE:
15537 case EQ:
15538 return aarch64_sve_cmp_immediate_p (x, true);
15539 default:
15540 gcc_unreachable ();
15544 /* Use predicated SVE instructions to implement the equivalent of:
15546 (set TARGET OP)
15548 given that PTRUE is an all-true predicate of the appropriate mode. */
15550 static void
15551 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15553 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15554 gen_rtvec (2, ptrue, op),
15555 UNSPEC_MERGE_PTRUE);
15556 rtx_insn *insn = emit_set_insn (target, unspec);
15557 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15560 /* Likewise, but also clobber the condition codes. */
15562 static void
15563 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15565 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15566 gen_rtvec (2, ptrue, op),
15567 UNSPEC_MERGE_PTRUE);
15568 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15569 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15572 /* Return the UNSPEC_COND_* code for comparison CODE. */
15574 static unsigned int
15575 aarch64_unspec_cond_code (rtx_code code)
15577 switch (code)
15579 case NE:
15580 return UNSPEC_COND_NE;
15581 case EQ:
15582 return UNSPEC_COND_EQ;
15583 case LT:
15584 return UNSPEC_COND_LT;
15585 case GT:
15586 return UNSPEC_COND_GT;
15587 case LE:
15588 return UNSPEC_COND_LE;
15589 case GE:
15590 return UNSPEC_COND_GE;
15591 default:
15592 gcc_unreachable ();
15596 /* Emit:
15598 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15600 where <X> is the operation associated with comparison CODE. This form
15601 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15602 semantics, such as when PRED might not be all-true and when comparing
15603 inactive lanes could have side effects. */
15605 static void
15606 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15607 rtx pred, rtx op0, rtx op1)
15609 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15610 gen_rtvec (3, pred, op0, op1),
15611 aarch64_unspec_cond_code (code));
15612 emit_set_insn (target, unspec);
15615 /* Expand an SVE integer comparison using the SVE equivalent of:
15617 (set TARGET (CODE OP0 OP1)). */
15619 void
15620 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15622 machine_mode pred_mode = GET_MODE (target);
15623 machine_mode data_mode = GET_MODE (op0);
15625 if (!aarch64_sve_cmp_operand_p (code, op1))
15626 op1 = force_reg (data_mode, op1);
15628 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15629 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15630 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15633 /* Emit the SVE equivalent of:
15635 (set TMP1 (CODE1 OP0 OP1))
15636 (set TMP2 (CODE2 OP0 OP1))
15637 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15639 PTRUE is an all-true predicate with the same mode as TARGET. */
15641 static void
15642 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15643 rtx ptrue, rtx op0, rtx op1)
15645 machine_mode pred_mode = GET_MODE (ptrue);
15646 rtx tmp1 = gen_reg_rtx (pred_mode);
15647 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15648 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15649 rtx tmp2 = gen_reg_rtx (pred_mode);
15650 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15651 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15652 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15655 /* Emit the SVE equivalent of:
15657 (set TMP (CODE OP0 OP1))
15658 (set TARGET (not TMP))
15660 PTRUE is an all-true predicate with the same mode as TARGET. */
15662 static void
15663 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15664 rtx op0, rtx op1)
15666 machine_mode pred_mode = GET_MODE (ptrue);
15667 rtx tmp = gen_reg_rtx (pred_mode);
15668 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15669 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15670 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15673 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15675 (set TARGET (CODE OP0 OP1))
15677 If CAN_INVERT_P is true, the caller can also handle inverted results;
15678 return true if the result is in fact inverted. */
15680 bool
15681 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15682 rtx op0, rtx op1, bool can_invert_p)
15684 machine_mode pred_mode = GET_MODE (target);
15685 machine_mode data_mode = GET_MODE (op0);
15687 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15688 switch (code)
15690 case UNORDERED:
15691 /* UNORDERED has no immediate form. */
15692 op1 = force_reg (data_mode, op1);
15693 /* fall through */
15694 case LT:
15695 case LE:
15696 case GT:
15697 case GE:
15698 case EQ:
15699 case NE:
15701 /* There is native support for the comparison. */
15702 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15703 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15704 return false;
15707 case LTGT:
15708 /* This is a trapping operation (LT or GT). */
15709 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15710 return false;
15712 case UNEQ:
15713 if (!flag_trapping_math)
15715 /* This would trap for signaling NaNs. */
15716 op1 = force_reg (data_mode, op1);
15717 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15718 return false;
15720 /* fall through */
15721 case UNLT:
15722 case UNLE:
15723 case UNGT:
15724 case UNGE:
15725 if (flag_trapping_math)
15727 /* Work out which elements are ordered. */
15728 rtx ordered = gen_reg_rtx (pred_mode);
15729 op1 = force_reg (data_mode, op1);
15730 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15732 /* Test the opposite condition for the ordered elements,
15733 then invert the result. */
15734 if (code == UNEQ)
15735 code = NE;
15736 else
15737 code = reverse_condition_maybe_unordered (code);
15738 if (can_invert_p)
15740 aarch64_emit_sve_predicated_cond (target, code,
15741 ordered, op0, op1);
15742 return true;
15744 rtx tmp = gen_reg_rtx (pred_mode);
15745 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15746 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15747 return false;
15749 break;
15751 case ORDERED:
15752 /* ORDERED has no immediate form. */
15753 op1 = force_reg (data_mode, op1);
15754 break;
15756 default:
15757 gcc_unreachable ();
15760 /* There is native support for the inverse comparison. */
15761 code = reverse_condition_maybe_unordered (code);
15762 if (can_invert_p)
15764 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15765 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15766 return true;
15768 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15769 return false;
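/* Illustrative sketch only (scalar C, hypothetical helper): the
   trapping-math expansion of UNLT used above, on a single lane.  The
   reverse condition (GE) is evaluated only under the "ordered"
   predicate and the result is then inverted, so NaN lanes never reach
   a signalling comparison yet still count as "unordered or less".  */
static int
aarch64_sketch_unlt_lane (double a, double b)
{
  int ordered = !(a != a) && !(b != b);    /* Neither input is a NaN.  */
  int ge_if_ordered = ordered && a >= b;   /* Predicated reverse compare.  */
  return !ge_if_ordered;                   /* UNLT: unordered or a < b.  */
}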
15772 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15773 of the data being selected and CMP_MODE is the mode of the values being
15774 compared. */
15776 void
15777 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15778 rtx *ops)
15780 machine_mode pred_mode
15781 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15782 GET_MODE_SIZE (cmp_mode)).require ();
15783 rtx pred = gen_reg_rtx (pred_mode);
15784 if (FLOAT_MODE_P (cmp_mode))
15786 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15787 ops[4], ops[5], true))
15788 std::swap (ops[1], ops[2]);
15790 else
15791 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15793 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15794 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15797 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15798 true. However due to issues with register allocation it is preferable
15799 to avoid tying integer scalar and FP scalar modes. Executing integer
15800 operations in general registers is better than treating them as scalar
15801 vector operations. This reduces latency and avoids redundant int<->FP
15802 moves. So tie modes if they are either the same class, or vector modes
15803 with other vector modes, vector structs or any scalar mode. */
15805 static bool
15806 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15808 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15809 return true;
15811 /* We specifically want to allow elements of "structure" modes to
15812 be tieable to the structure. This more general condition allows
15813 other rarer situations too. The reason we don't extend this to
15814 predicate modes is that there are no predicate structure modes
15815 nor any specific instructions for extracting part of a predicate
15816 register. */
15817 if (aarch64_vector_data_mode_p (mode1)
15818 && aarch64_vector_data_mode_p (mode2))
15819 return true;
15821 /* Also allow any scalar modes with vectors. */
15822 if (aarch64_vector_mode_supported_p (mode1)
15823 || aarch64_vector_mode_supported_p (mode2))
15824 return true;
15826 return false;
15829 /* Return a new RTX holding the result of moving POINTER forward by
15830 AMOUNT bytes. */
15832 static rtx
15833 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15835 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15837 return adjust_automodify_address (pointer, GET_MODE (pointer),
15838 next, amount);
15841 /* Return a new RTX holding the result of moving POINTER forward by the
15842 size of the mode it points to. */
15844 static rtx
15845 aarch64_progress_pointer (rtx pointer)
15847 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15850 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15851 MODE bytes. */
15853 static void
15854 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15855 machine_mode mode)
15857 rtx reg = gen_reg_rtx (mode);
15859 /* "Cast" the pointers to the correct mode. */
15860 *src = adjust_address (*src, mode, 0);
15861 *dst = adjust_address (*dst, mode, 0);
15862 /* Emit the memcpy. */
15863 emit_move_insn (reg, *src);
15864 emit_move_insn (*dst, reg);
15865 /* Move the pointers forward. */
15866 *src = aarch64_progress_pointer (*src);
15867 *dst = aarch64_progress_pointer (*dst);
15870 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15871 we succeed, otherwise return false. */
15873 bool
15874 aarch64_expand_movmem (rtx *operands)
15876 int n, mode_bits;
15877 rtx dst = operands[0];
15878 rtx src = operands[1];
15879 rtx base;
15880 machine_mode cur_mode = BLKmode, next_mode;
15881 bool speed_p = !optimize_function_for_size_p (cfun);
15883 /* When optimizing for size, give a better estimate of the length of a
15884 memcpy call, but use the default otherwise. Moves larger than 8 bytes
15885 will always require an even number of instructions to do now, and each
15886 operation requires both a load and a store, so divide the max number by 2. */
15887 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
15889 /* We can't do anything smart if the amount to copy is not constant. */
15890 if (!CONST_INT_P (operands[2]))
15891 return false;
15893 n = INTVAL (operands[2]);
15895 /* Try to keep the number of instructions low. For all cases we will do at
15896 most two moves for the residual amount, since we'll always overlap the
15897 remainder. */
15898 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
15899 return false;
15901 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
15902 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
15904 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
15905 src = adjust_automodify_address (src, VOIDmode, base, 0);
15907 /* Convert n to bits to make the rest of the code simpler. */
15908 n = n * BITS_PER_UNIT;
15910 while (n > 0)
15912 /* Find the largest mode in which to do the copy without over-reading
15913 or over-writing. */
15914 opt_scalar_int_mode mode_iter;
15915 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
15916 if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
15917 cur_mode = mode_iter.require ();
15919 gcc_assert (cur_mode != BLKmode);
15921 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
15922 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
15924 n -= mode_bits;
15926 /* Do certain trailing copies as overlapping if that is going to be
15927 cheaper, i.e. needs fewer instructions. For instance, for a 15
15928 byte copy it is more efficient to do two overlapping 8 byte copies than
15929 separate 8 + 4 + 2 + 1 byte copies. */
15930 next_mode = smallest_mode_for_size (n, MODE_INT);
15931 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
15932 if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
15934 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
15935 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
15936 n = n_bits;
15940 return true;
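/* Illustrative sketch only (plain C, hypothetical helper): the chunking
   scheme used by aarch64_expand_movmem above, expressed as byte
   offsets and sizes.  For n == 15 it yields (0, 8) followed by an
   overlapping (7, 8) rather than four separate 8 + 4 + 2 + 1 byte
   copies.  OFFS and SIZES are assumed large enough.  */
static unsigned int
aarch64_sketch_movmem_chunks (unsigned int n, unsigned int *offs,
                              unsigned int *sizes)
{
  unsigned int count = 0, pos = 0;
  while (n > 0)
    {
      /* Largest power-of-two chunk, capped at 16 bytes, that fits.  */
      unsigned int size = 16;
      while (size > n)
        size /= 2;
      offs[count] = pos;
      sizes[count] = size;
      count++;
      pos += size;
      n -= size;
      /* Overlap the trailing copy when the residue would otherwise need
         several small moves: round it up to the next power of two
         (at most 8 bytes) and step back so the copy ends at the end.  */
      if (n > 0)
        {
          unsigned int up = 1;
          while (up < n)
            up *= 2;
          if (up > n && up <= 8)
            {
              pos -= up - n;
              n = up;
            }
        }
    }
  return count;
}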
15943 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
15944 SImode stores. Handle the case when the constant has identical
15945 bottom and top halves. This is beneficial when the two stores can be
15946 merged into an STP and we avoid synthesising potentially expensive
15947 immediates twice. Return true if such a split is possible. */
15949 bool
15950 aarch64_split_dimode_const_store (rtx dst, rtx src)
15952 rtx lo = gen_lowpart (SImode, src);
15953 rtx hi = gen_highpart_mode (SImode, DImode, src);
15955 bool size_p = optimize_function_for_size_p (cfun);
15957 if (!rtx_equal_p (lo, hi))
15958 return false;
15960 unsigned int orig_cost
15961 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
15962 unsigned int lo_cost
15963 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
15965 /* We want to transform:
15966 MOV x1, 49370
15967 MOVK x1, 0x140, lsl 16
15968 MOVK x1, 0xc0da, lsl 32
15969 MOVK x1, 0x140, lsl 48
15970 STR x1, [x0]
15971 into:
15972 MOV w1, 49370
15973 MOVK w1, 0x140, lsl 16
15974 STP w1, w1, [x0]
15975 So we want to perform this only when we save two instructions
15976 or more. When optimizing for size, however, accept any code size
15977 savings we can. */
15978 if (size_p && orig_cost <= lo_cost)
15979 return false;
15981 if (!size_p
15982 && (orig_cost <= lo_cost + 1))
15983 return false;
15985 rtx mem_lo = adjust_address (dst, SImode, 0);
15986 if (!aarch64_mem_pair_operand (mem_lo, SImode))
15987 return false;
15989 rtx tmp_reg = gen_reg_rtx (SImode);
15990 aarch64_expand_mov_immediate (tmp_reg, lo);
15991 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
15992 /* Don't emit an explicit store pair as this may not always be profitable.
15993 Let the sched-fusion logic decide whether to merge them. */
15994 emit_move_insn (mem_lo, tmp_reg);
15995 emit_move_insn (mem_hi, tmp_reg);
15997 return true;
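/* Illustrative sketch only (hypothetical helper, assuming the usual
   32-bit unsigned int): the halves-equal test that gates the split
   above, on a plain 64-bit integer.  The constant from the comment,
   0x0140c0da0140c0daULL, passes; a value with differing halves such as
   0x0123456789abcdefULL does not.  */
static int
aarch64_sketch_dimode_halves_equal_p (unsigned long long val)
{
  unsigned int lo = (unsigned int) val;
  unsigned int hi = (unsigned int) (val >> 32);
  return lo == hi;
}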
16000 /* Generate RTL for a conditional branch with rtx comparison CODE in
16001 mode CC_MODE. The destination of the unlikely conditional branch
16002 is LABEL_REF. */
16004 void
16005 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
16006 rtx label_ref)
16008 rtx x;
16009 x = gen_rtx_fmt_ee (code, VOIDmode,
16010 gen_rtx_REG (cc_mode, CC_REGNUM),
16011 const0_rtx);
16013 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16014 gen_rtx_LABEL_REF (VOIDmode, label_ref),
16015 pc_rtx);
16016 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16019 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16021 OP1 represents the TImode destination operand 1
16022 OP2 represents the TImode destination operand 2
16023 LOW_DEST represents the low half (DImode) of TImode operand 0
16024 LOW_IN1 represents the low half (DImode) of TImode operand 1
16025 LOW_IN2 represents the low half (DImode) of TImode operand 2
16026 HIGH_DEST represents the high half (DImode) of TImode operand 0
16027 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16028 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16030 void
16031 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16032 rtx *low_in1, rtx *low_in2,
16033 rtx *high_dest, rtx *high_in1,
16034 rtx *high_in2)
16036 *low_dest = gen_reg_rtx (DImode);
16037 *low_in1 = gen_lowpart (DImode, op1);
16038 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16039 subreg_lowpart_offset (DImode, TImode));
16040 *high_dest = gen_reg_rtx (DImode);
16041 *high_in1 = gen_highpart (DImode, op1);
16042 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16043 subreg_highpart_offset (DImode, TImode));
16046 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16048 This function differs from 'aarch64_addti_scratch_regs' in that
16049 OP1 can be an immediate constant (zero). We must call
16050 subreg_highpart_offset with DImode and TImode arguments, otherwise
16051 VOIDmode will be used for the const_int which generates an internal
16052 error from subreg_size_highpart_offset which does not expect a size of zero.
16054 OP1 represents the TImode destination operand 1
16055 OP2 represents the TImode destination operand 2
16056 LOW_DEST represents the low half (DImode) of TImode operand 0
16057 LOW_IN1 represents the low half (DImode) of TImode operand 1
16058 LOW_IN2 represents the low half (DImode) of TImode operand 2
16059 HIGH_DEST represents the high half (DImode) of TImode operand 0
16060 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16061 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16064 void
16065 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16066 rtx *low_in1, rtx *low_in2,
16067 rtx *high_dest, rtx *high_in1,
16068 rtx *high_in2)
16070 *low_dest = gen_reg_rtx (DImode);
16071 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16072 subreg_lowpart_offset (DImode, TImode));
16074 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16075 subreg_lowpart_offset (DImode, TImode));
16076 *high_dest = gen_reg_rtx (DImode);
16078 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16079 subreg_highpart_offset (DImode, TImode));
16080 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16081 subreg_highpart_offset (DImode, TImode));
16084 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16086 OP0 represents the TImode destination operand 0
16087 LOW_DEST represents the low half (DImode) of TImode operand 0
16088 LOW_IN1 represents the low half (DImode) of TImode operand 1
16089 LOW_IN2 represents the low half (DImode) of TImode operand 2
16090 HIGH_DEST represents the high half (DImode) of TImode operand 0
16091 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16092 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16094 void
16095 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
16096 rtx low_in2, rtx high_dest, rtx high_in1,
16097 rtx high_in2)
16099 if (low_in2 == const0_rtx)
16101 low_dest = low_in1;
16102 emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
16103 force_reg (DImode, high_in2)));
16105 else
16107 if (CONST_INT_P (low_in2))
16109 low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
16110 high_in2 = force_reg (DImode, high_in2);
16111 emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
16113 else
16114 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
16115 emit_insn (gen_subdi3_carryinCV (high_dest,
16116 force_reg (DImode, high_in1),
16117 high_in2));
16120 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
16121 emit_move_insn (gen_highpart (DImode, op0), high_dest);
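/* Illustrative sketch only (plain C, hypothetical helper, assuming the
   compiler provides unsigned __int128): the double-word subtraction
   that aarch64_expand_subvti above emits, written on ordinary
   integers.  The low halves are subtracted first and the borrow is
   propagated into the high halves, mirroring SUBS followed by SBC.  */
static unsigned __int128
aarch64_sketch_subti (unsigned __int128 x, unsigned __int128 y)
{
  unsigned long long x_lo = (unsigned long long) x;
  unsigned long long x_hi = (unsigned long long) (x >> 64);
  unsigned long long y_lo = (unsigned long long) y;
  unsigned long long y_hi = (unsigned long long) (y >> 64);

  unsigned long long lo = x_lo - y_lo;
  unsigned long long borrow = x_lo < y_lo;   /* Borrow out of the low half.  */
  unsigned long long hi = x_hi - y_hi - borrow;

  return ((unsigned __int128) hi << 64) | lo;
}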
16125 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16127 static unsigned HOST_WIDE_INT
16128 aarch64_asan_shadow_offset (void)
16130 return (HOST_WIDE_INT_1 << 36);
16133 static rtx
16134 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16135 int code, tree treeop0, tree treeop1)
16137 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16138 rtx op0, op1;
16139 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16140 insn_code icode;
16141 struct expand_operand ops[4];
16143 start_sequence ();
16144 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16146 op_mode = GET_MODE (op0);
16147 if (op_mode == VOIDmode)
16148 op_mode = GET_MODE (op1);
16150 switch (op_mode)
16152 case E_QImode:
16153 case E_HImode:
16154 case E_SImode:
16155 cmp_mode = SImode;
16156 icode = CODE_FOR_cmpsi;
16157 break;
16159 case E_DImode:
16160 cmp_mode = DImode;
16161 icode = CODE_FOR_cmpdi;
16162 break;
16164 case E_SFmode:
16165 cmp_mode = SFmode;
16166 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16167 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16168 break;
16170 case E_DFmode:
16171 cmp_mode = DFmode;
16172 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16173 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16174 break;
16176 default:
16177 end_sequence ();
16178 return NULL_RTX;
16181 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16182 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16183 if (!op0 || !op1)
16185 end_sequence ();
16186 return NULL_RTX;
16188 *prep_seq = get_insns ();
16189 end_sequence ();
16191 create_fixed_operand (&ops[0], op0);
16192 create_fixed_operand (&ops[1], op1);
16194 start_sequence ();
16195 if (!maybe_expand_insn (icode, 2, ops))
16197 end_sequence ();
16198 return NULL_RTX;
16200 *gen_seq = get_insns ();
16201 end_sequence ();
16203 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16204 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16207 static rtx
16208 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16209 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16211 rtx op0, op1, target;
16212 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16213 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16214 insn_code icode;
16215 struct expand_operand ops[6];
16216 int aarch64_cond;
16218 push_to_sequence (*prep_seq);
16219 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16221 op_mode = GET_MODE (op0);
16222 if (op_mode == VOIDmode)
16223 op_mode = GET_MODE (op1);
16225 switch (op_mode)
16227 case E_QImode:
16228 case E_HImode:
16229 case E_SImode:
16230 cmp_mode = SImode;
16231 icode = CODE_FOR_ccmpsi;
16232 break;
16234 case E_DImode:
16235 cmp_mode = DImode;
16236 icode = CODE_FOR_ccmpdi;
16237 break;
16239 case E_SFmode:
16240 cmp_mode = SFmode;
16241 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16242 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16243 break;
16245 case E_DFmode:
16246 cmp_mode = DFmode;
16247 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16248 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16249 break;
16251 default:
16252 end_sequence ();
16253 return NULL_RTX;
16256 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16257 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16258 if (!op0 || !op1)
16260 end_sequence ();
16261 return NULL_RTX;
16263 *prep_seq = get_insns ();
16264 end_sequence ();
16266 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16267 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16269 if (bit_code != AND)
16271 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16272 GET_MODE (XEXP (prev, 0))),
16273 VOIDmode, XEXP (prev, 0), const0_rtx);
16274 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16277 create_fixed_operand (&ops[0], XEXP (prev, 0));
16278 create_fixed_operand (&ops[1], target);
16279 create_fixed_operand (&ops[2], op0);
16280 create_fixed_operand (&ops[3], op1);
16281 create_fixed_operand (&ops[4], prev);
16282 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16284 push_to_sequence (*gen_seq);
16285 if (!maybe_expand_insn (icode, 6, ops))
16287 end_sequence ();
16288 return NULL_RTX;
16291 *gen_seq = get_insns ();
16292 end_sequence ();
16294 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16297 #undef TARGET_GEN_CCMP_FIRST
16298 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16300 #undef TARGET_GEN_CCMP_NEXT
16301 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16303 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16304 instruction fusion of some sort. */
16306 static bool
16307 aarch64_macro_fusion_p (void)
16309 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16313 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16314 should be kept together during scheduling. */
16316 static bool
16317 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16319 rtx set_dest;
16320 rtx prev_set = single_set (prev);
16321 rtx curr_set = single_set (curr);
16322 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16323 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16325 if (!aarch64_macro_fusion_p ())
16326 return false;
16328 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16330 /* We are trying to match:
16331 prev (mov) == (set (reg r0) (const_int imm16))
16332 curr (movk) == (set (zero_extract (reg r0)
16333 (const_int 16)
16334 (const_int 16))
16335 (const_int imm16_1)) */
16337 set_dest = SET_DEST (curr_set);
16339 if (GET_CODE (set_dest) == ZERO_EXTRACT
16340 && CONST_INT_P (SET_SRC (curr_set))
16341 && CONST_INT_P (SET_SRC (prev_set))
16342 && CONST_INT_P (XEXP (set_dest, 2))
16343 && INTVAL (XEXP (set_dest, 2)) == 16
16344 && REG_P (XEXP (set_dest, 0))
16345 && REG_P (SET_DEST (prev_set))
16346 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16348 return true;
16352 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16355 /* We're trying to match:
16356 prev (adrp) == (set (reg r1)
16357 (high (symbol_ref ("SYM"))))
16358 curr (add) == (set (reg r0)
16359 (lo_sum (reg r1)
16360 (symbol_ref ("SYM"))))
16361 Note that r0 need not necessarily be the same as r1, especially
16362 during pre-regalloc scheduling. */
16364 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16365 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16367 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16368 && REG_P (XEXP (SET_SRC (curr_set), 0))
16369 && REGNO (XEXP (SET_SRC (curr_set), 0))
16370 == REGNO (SET_DEST (prev_set))
16371 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16372 XEXP (SET_SRC (curr_set), 1)))
16373 return true;
16377 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16380 /* We're trying to match:
16381 prev (movk) == (set (zero_extract (reg r0)
16382 (const_int 16)
16383 (const_int 32))
16384 (const_int imm16_1))
16385 curr (movk) == (set (zero_extract (reg r0)
16386 (const_int 16)
16387 (const_int 48))
16388 (const_int imm16_2)) */
16390 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16391 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16392 && REG_P (XEXP (SET_DEST (prev_set), 0))
16393 && REG_P (XEXP (SET_DEST (curr_set), 0))
16394 && REGNO (XEXP (SET_DEST (prev_set), 0))
16395 == REGNO (XEXP (SET_DEST (curr_set), 0))
16396 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16397 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16398 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16399 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16400 && CONST_INT_P (SET_SRC (prev_set))
16401 && CONST_INT_P (SET_SRC (curr_set)))
16402 return true;
16405 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16407 /* We're trying to match:
16408 prev (adrp) == (set (reg r0)
16409 (high (symbol_ref ("SYM"))))
16410 curr (ldr) == (set (reg r1)
16411 (mem (lo_sum (reg r0)
16412 (symbol_ref ("SYM")))))
16414 curr (ldr) == (set (reg r1)
16415 (zero_extend (mem
16416 (lo_sum (reg r0)
16417 (symbol_ref ("SYM")))))) */
16418 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16419 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16421 rtx curr_src = SET_SRC (curr_set);
16423 if (GET_CODE (curr_src) == ZERO_EXTEND)
16424 curr_src = XEXP (curr_src, 0);
16426 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16427 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16428 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16429 == REGNO (SET_DEST (prev_set))
16430 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16431 XEXP (SET_SRC (prev_set), 0)))
16432 return true;
16436 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16437 && aarch_crypto_can_dual_issue (prev, curr))
16438 return true;
16440 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16441 && any_condjump_p (curr))
16443 enum attr_type prev_type = get_attr_type (prev);
16445 unsigned int condreg1, condreg2;
16446 rtx cc_reg_1;
16447 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16448 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16450 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16451 && prev
16452 && modified_in_p (cc_reg_1, prev))
16454 /* FIXME: this misses some instructions which are considered simple
16455 arithmetic for ThunderX. Simple shifts are missed here. */
16456 if (prev_type == TYPE_ALUS_SREG
16457 || prev_type == TYPE_ALUS_IMM
16458 || prev_type == TYPE_LOGICS_REG
16459 || prev_type == TYPE_LOGICS_IMM)
16460 return true;
16464 if (prev_set
16465 && curr_set
16466 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16467 && any_condjump_p (curr))
16469 /* We're trying to match:
16470 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16471 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16472 (const_int 0))
16473 (label_ref ("SYM"))
16474 (pc)) */
16475 if (SET_DEST (curr_set) == (pc_rtx)
16476 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16477 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16478 && REG_P (SET_DEST (prev_set))
16479 && REGNO (SET_DEST (prev_set))
16480 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16482 /* Fuse ALU operations followed by conditional branch instruction. */
16483 switch (get_attr_type (prev))
16485 case TYPE_ALU_IMM:
16486 case TYPE_ALU_SREG:
16487 case TYPE_ADC_REG:
16488 case TYPE_ADC_IMM:
16489 case TYPE_ADCS_REG:
16490 case TYPE_ADCS_IMM:
16491 case TYPE_LOGIC_REG:
16492 case TYPE_LOGIC_IMM:
16493 case TYPE_CSEL:
16494 case TYPE_ADR:
16495 case TYPE_MOV_IMM:
16496 case TYPE_SHIFT_REG:
16497 case TYPE_SHIFT_IMM:
16498 case TYPE_BFM:
16499 case TYPE_RBIT:
16500 case TYPE_REV:
16501 case TYPE_EXTEND:
16502 return true;
16504 default:;
16509 return false;
16512 /* Return true iff the instruction fusion described by OP is enabled. */
16514 bool
16515 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16517 return (aarch64_tune_params.fusible_ops & op) != 0;
16520 /* If MEM is in the form of [base+offset], extract the two parts
16521 of address and set to BASE and OFFSET, otherwise return false
16522 after clearing BASE and OFFSET. */
16524 bool
16525 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16527 rtx addr;
16529 gcc_assert (MEM_P (mem));
16531 addr = XEXP (mem, 0);
16533 if (REG_P (addr))
16535 *base = addr;
16536 *offset = const0_rtx;
16537 return true;
16540 if (GET_CODE (addr) == PLUS
16541 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16543 *base = XEXP (addr, 0);
16544 *offset = XEXP (addr, 1);
16545 return true;
16548 *base = NULL_RTX;
16549 *offset = NULL_RTX;
16551 return false;
16554 /* Types for scheduling fusion. */
16555 enum sched_fusion_type
16557 SCHED_FUSION_NONE = 0,
16558 SCHED_FUSION_LD_SIGN_EXTEND,
16559 SCHED_FUSION_LD_ZERO_EXTEND,
16560 SCHED_FUSION_LD,
16561 SCHED_FUSION_ST,
16562 SCHED_FUSION_NUM
16565 /* If INSN is a load or store of address in the form of [base+offset],
16566 extract the two parts and set to BASE and OFFSET. Return scheduling
16567 fusion type this INSN is. */
16569 static enum sched_fusion_type
16570 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16572 rtx x, dest, src;
16573 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16575 gcc_assert (INSN_P (insn));
16576 x = PATTERN (insn);
16577 if (GET_CODE (x) != SET)
16578 return SCHED_FUSION_NONE;
16580 src = SET_SRC (x);
16581 dest = SET_DEST (x);
16583 machine_mode dest_mode = GET_MODE (dest);
16585 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16586 return SCHED_FUSION_NONE;
16588 if (GET_CODE (src) == SIGN_EXTEND)
16590 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16591 src = XEXP (src, 0);
16592 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16593 return SCHED_FUSION_NONE;
16595 else if (GET_CODE (src) == ZERO_EXTEND)
16597 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16598 src = XEXP (src, 0);
16599 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16600 return SCHED_FUSION_NONE;
16603 if (GET_CODE (src) == MEM && REG_P (dest))
16604 extract_base_offset_in_addr (src, base, offset);
16605 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16607 fusion = SCHED_FUSION_ST;
16608 extract_base_offset_in_addr (dest, base, offset);
16610 else
16611 return SCHED_FUSION_NONE;
16613 if (*base == NULL_RTX || *offset == NULL_RTX)
16614 fusion = SCHED_FUSION_NONE;
16616 return fusion;
16619 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16621 Currently we only support fusing ldr or str instructions, so FUSION_PRI
16622 and PRI are only calculated for these instructions. For other instructions,
16623 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16624 types of instruction fusion can be added by returning different priorities.
16626 It's important that irrelevant instructions get the largest FUSION_PRI. */
16628 static void
16629 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16630 int *fusion_pri, int *pri)
16632 int tmp, off_val;
16633 rtx base, offset;
16634 enum sched_fusion_type fusion;
16636 gcc_assert (INSN_P (insn));
16638 tmp = max_pri - 1;
16639 fusion = fusion_load_store (insn, &base, &offset);
16640 if (fusion == SCHED_FUSION_NONE)
16642 *pri = tmp;
16643 *fusion_pri = tmp;
16644 return;
16647 /* Set FUSION_PRI according to fusion type and base register. */
16648 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16650 /* Calculate PRI. */
16651 tmp /= 2;
16653 /* INSN with smaller offset goes first. */
16654 off_val = (int)(INTVAL (offset));
16655 if (off_val >= 0)
16656 tmp -= (off_val & 0xfffff);
16657 else
16658 tmp += ((- off_val) & 0xfffff);
16660 *pri = tmp;
16661 return;
16664 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16665 Adjust priority of sha1h instructions so they are scheduled before
16666 other SHA1 instructions. */
16668 static int
16669 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16671 rtx x = PATTERN (insn);
16673 if (GET_CODE (x) == SET)
16675 x = SET_SRC (x);
16677 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16678 return priority + 10;
16681 return priority;
16684 /* Given OPERANDS of consecutive load/store, check if we can merge
16685 them into ldp/stp. LOAD is true if they are load instructions.
16686 MODE is the mode of memory operands. */
16688 bool
16689 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16690 machine_mode mode)
16692 HOST_WIDE_INT offval_1, offval_2, msize;
16693 enum reg_class rclass_1, rclass_2;
16694 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16696 if (load)
16698 mem_1 = operands[1];
16699 mem_2 = operands[3];
16700 reg_1 = operands[0];
16701 reg_2 = operands[2];
16702 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16703 if (REGNO (reg_1) == REGNO (reg_2))
16704 return false;
16706 else
16708 mem_1 = operands[0];
16709 mem_2 = operands[2];
16710 reg_1 = operands[1];
16711 reg_2 = operands[3];
16714 /* The mems cannot be volatile. */
16715 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16716 return false;
16718 /* If we have SImode and slow unaligned ldp,
16719 check that the alignment is at least 8 bytes. */
16720 if (mode == SImode
16721 && (aarch64_tune_params.extra_tuning_flags
16722 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16723 && !optimize_size
16724 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16725 return false;
16727 /* Check if the addresses are in the form of [base+offset]. */
16728 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16729 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16730 return false;
16731 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16732 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16733 return false;
16735 /* Check if the bases are same. */
16736 if (!rtx_equal_p (base_1, base_2))
16737 return false;
16739 /* The operands must be of the same size. */
16740 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16741 GET_MODE_SIZE (GET_MODE (mem_2))));
16743 offval_1 = INTVAL (offset_1);
16744 offval_2 = INTVAL (offset_2);
16745 /* We should only be trying this for fixed-sized modes. There is no
16746 SVE LDP/STP instruction. */
16747 msize = GET_MODE_SIZE (mode).to_constant ();
16748 /* Check if the offsets are consecutive. */
16749 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16750 return false;
16752 /* Check if the addresses are clobbered by load. */
16753 if (load)
16755 if (reg_mentioned_p (reg_1, mem_1))
16756 return false;
16758 /* In increasing order, the last load can clobber the address. */
16759 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16760 return false;
16763 /* One of the memory accesses must be a mempair operand.
16764 If it is not the first one, they need to be swapped by the
16765 peephole. */
16766 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16767 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16768 return false;
16770 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16771 rclass_1 = FP_REGS;
16772 else
16773 rclass_1 = GENERAL_REGS;
16775 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16776 rclass_2 = FP_REGS;
16777 else
16778 rclass_2 = GENERAL_REGS;
16780 /* Check if the registers are of same class. */
16781 if (rclass_1 != rclass_2)
16782 return false;
16784 return true;
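/* Illustrative sketch only (hypothetical helper): the offset test used
   above.  Two accesses of size MSIZE can pair when one immediately
   follows the other, e.g. offsets (8, 16) with msize == 8 pair in
   either order, while (8, 24) leave a gap and do not.  */
static int
aarch64_sketch_offsets_consecutive_p (long off1, long off2, long msize)
{
  return off1 == off2 + msize || off2 == off1 + msize;
}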
16787 /* Given OPERANDS of consecutive load/store that can be merged,
16788 swap them if they are not in ascending order. */
16789 void
16790 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16792 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16793 HOST_WIDE_INT offval_1, offval_2;
16795 if (load)
16797 mem_1 = operands[1];
16798 mem_2 = operands[3];
16800 else
16802 mem_1 = operands[0];
16803 mem_2 = operands[2];
16806 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16807 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16809 offval_1 = INTVAL (offset_1);
16810 offval_2 = INTVAL (offset_2);
16812 if (offval_1 > offval_2)
16814 /* Irrespective of whether this is a load or a store,
16815 we do the same swap. */
16816 std::swap (operands[0], operands[2]);
16817 std::swap (operands[1], operands[3]);
16821 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16822 comparison between the two. */
16824 aarch64_host_wide_int_compare (const void *x, const void *y)
16826 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16827 * ((const HOST_WIDE_INT *) y));
16830 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16831 other pointing to a REG rtx containing an offset, compare the offsets
16832 of the two pairs.
16834 Return:
16836 1 iff offset (X) > offset (Y)
16837 0 iff offset (X) == offset (Y)
16838 -1 iff offset (X) < offset (Y) */
16840 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16842 const rtx * operands_1 = (const rtx *) x;
16843 const rtx * operands_2 = (const rtx *) y;
16844 rtx mem_1, mem_2, base, offset_1, offset_2;
16846 if (MEM_P (operands_1[0]))
16847 mem_1 = operands_1[0];
16848 else
16849 mem_1 = operands_1[1];
16851 if (MEM_P (operands_2[0]))
16852 mem_2 = operands_2[0];
16853 else
16854 mem_2 = operands_2[1];
16856 /* Extract the offsets. */
16857 extract_base_offset_in_addr (mem_1, &base, &offset_1);
16858 extract_base_offset_in_addr (mem_2, &base, &offset_2);
16860 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
16862 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
16865 /* Given OPERANDS of consecutive load/store, check if we can merge
16866 them into ldp/stp by adjusting the offset. LOAD is true if they
16867 are load instructions. MODE is the mode of memory operands.
16869 Given below consecutive stores:
16871 str w1, [xb, 0x100]
16872 str w1, [xb, 0x104]
16873 str w1, [xb, 0x108]
16874 str w1, [xb, 0x10c]
16876 Though the offsets are out of the range supported by stp, we can
16877 still pair them after adjusting the offset, like:
16879 add scratch, xb, 0x100
16880 stp w1, w1, [scratch]
16881 stp w1, w1, [scratch, 0x8]
16883 The peephole patterns detecting this opportunity should guarantee
16884 the scratch register is available. */
16886 bool
16887 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16888 scalar_mode mode)
16890 const int num_insns = 4;
16891 enum reg_class rclass;
16892 HOST_WIDE_INT offvals[num_insns], msize;
16893 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
16895 if (load)
16897 for (int i = 0; i < num_insns; i++)
16899 reg[i] = operands[2 * i];
16900 mem[i] = operands[2 * i + 1];
16902 gcc_assert (REG_P (reg[i]));
16905 /* Do not attempt to merge the loads if the loads clobber each other. */
16906 for (int i = 0; i < 8; i += 2)
16907 for (int j = i + 2; j < 8; j += 2)
16908 if (reg_overlap_mentioned_p (operands[i], operands[j]))
16909 return false;
16911 else
16912 for (int i = 0; i < num_insns; i++)
16914 mem[i] = operands[2 * i];
16915 reg[i] = operands[2 * i + 1];
16918 /* Skip if memory operand is by itself valid for ldp/stp. */
16919 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
16920 return false;
16922 for (int i = 0; i < num_insns; i++)
16924 /* The mems cannot be volatile. */
16925 if (MEM_VOLATILE_P (mem[i]))
16926 return false;
16928 /* Check if the addresses are in the form of [base+offset]. */
16929 extract_base_offset_in_addr (mem[i], base + i, offset + i);
16930 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
16931 return false;
16934 /* Check if the registers are of same class. */
16935 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
16936 ? FP_REGS : GENERAL_REGS;
16938 for (int i = 1; i < num_insns; i++)
16939 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
16941 if (rclass != FP_REGS)
16942 return false;
16944 else
16946 if (rclass != GENERAL_REGS)
16947 return false;
16950 /* Only the last register in the order in which they occur
16951 may be clobbered by the load. */
16952 if (rclass == GENERAL_REGS && load)
16953 for (int i = 0; i < num_insns - 1; i++)
16954 if (reg_mentioned_p (reg[i], mem[i]))
16955 return false;
16957 /* Check if the bases are same. */
16958 for (int i = 0; i < num_insns - 1; i++)
16959 if (!rtx_equal_p (base[i], base[i + 1]))
16960 return false;
16962 for (int i = 0; i < num_insns; i++)
16963 offvals[i] = INTVAL (offset[i]);
16965 msize = GET_MODE_SIZE (mode);
16967 /* Check if the offsets can be put in the right order to do a ldp/stp. */
16968 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
16969 aarch64_host_wide_int_compare);
16971 if (!(offvals[1] == offvals[0] + msize
16972 && offvals[3] == offvals[2] + msize))
16973 return false;
16975 /* Check that offsets are within range of each other. The ldp/stp
16976 instructions have 7-bit immediate offsets, so use 0x80. */
16977 if (offvals[2] - offvals[0] >= msize * 0x80)
16978 return false;
16980 /* The offsets must be aligned with respect to each other. */
16981 if (offvals[0] % msize != offvals[2] % msize)
16982 return false;
16984 /* If we have SImode and slow unaligned ldp,
16985 check that the alignment is at least 8 bytes. */
16986 if (mode == SImode
16987 && (aarch64_tune_params.extra_tuning_flags
16988 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16989 && !optimize_size
16990 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
16991 return false;
16993 return true;
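/* Illustrative sketch only (hypothetical helper): the shape check that
   aarch64_operands_adjust_ok_for_ldpstp above applies to the four
   sorted offsets.  For the str example in its comment (0x100, 0x104,
   0x108, 0x10c with msize == 4) both pairs are consecutive, the pairs
   lie well inside the scaled 7-bit range, and the alignments agree.  */
static int
aarch64_sketch_four_offsets_ok_p (const long *off, long msize)
{
  /* Each LDP/STP needs two consecutive slots of size msize...  */
  if (off[1] != off[0] + msize || off[3] != off[2] + msize)
    return 0;
  /* ...the two pairs must be close enough to share a single base...  */
  if (off[2] - off[0] >= msize * 0x80)
    return 0;
  /* ...and they must agree on their alignment relative to msize.  */
  return off[0] % msize == off[2] % msize;
}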
16996 /* Given OPERANDS of consecutive load/store, this function pairs them
16997 into LDP/STP after adjusting the offset. It depends on the fact
16998 that the operands can be sorted so the offsets are correct for STP.
16999 MODE is the mode of memory operands. CODE is the rtl operator
17000 which should be applied to all memory operands, it's SIGN_EXTEND,
17001 ZERO_EXTEND or UNKNOWN. */
17003 bool
17004 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17005 scalar_mode mode, RTX_CODE code)
17007 rtx base, offset_1, offset_3, t1, t2;
17008 rtx mem_1, mem_2, mem_3, mem_4;
17009 rtx temp_operands[8];
17010 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17011 stp_off_upper_limit, stp_off_lower_limit, msize;
17013 /* We make changes on a copy as we may still bail out. */
17014 for (int i = 0; i < 8; i ++)
17015 temp_operands[i] = operands[i];
17017 /* Sort the operands. */
17018 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17020 if (load)
17022 mem_1 = temp_operands[1];
17023 mem_2 = temp_operands[3];
17024 mem_3 = temp_operands[5];
17025 mem_4 = temp_operands[7];
17027 else
17029 mem_1 = temp_operands[0];
17030 mem_2 = temp_operands[2];
17031 mem_3 = temp_operands[4];
17032 mem_4 = temp_operands[6];
17033 gcc_assert (code == UNKNOWN);
17036 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17037 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17038 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17039 && offset_3 != NULL_RTX);
17041 /* Adjust offset so it can fit in LDP/STP instruction. */
17042 msize = GET_MODE_SIZE (mode);
17043 stp_off_upper_limit = msize * (0x40 - 1);
17044 stp_off_lower_limit = - msize * 0x40;
17046 off_val_1 = INTVAL (offset_1);
17047 off_val_3 = INTVAL (offset_3);
17049 /* The base offset is optimally half way between the two STP/LDP offsets. */
17050 if (msize <= 4)
17051 base_off = (off_val_1 + off_val_3) / 2;
17052 else
17053 /* However, due to issues with negative LDP/STP offset generation for
17054 larger modes (DF, DI and vector modes), we must not use negative
17055 addresses smaller than what 9 signed unadjusted bits can store. This
17056 provides the most range in this case. */
17057 base_off = off_val_1;
17059 /* Adjust the base so that it is aligned with the addresses but still
17060 optimal. */
17061 if (base_off % msize != off_val_1 % msize)
17062 /* Fix the offset, bearing in mind we want to make it bigger not
17063 smaller. */
17064 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17065 else if (msize <= 4)
17066 /* The negative range of LDP/STP is one larger than the positive range. */
17067 base_off += msize;
17069 /* Check if base offset is too big or too small. We can attempt to resolve
17070 this issue by setting it to the maximum value and seeing if the offsets
17071 still fit. */
17072 if (base_off >= 0x1000)
17074 base_off = 0x1000 - 1;
17075 /* We must still make sure that the base offset is aligned with respect
17076 to the address, but it may not be made any bigger. */
17077 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17080 /* Likewise for the case where the base is too small. */
17081 if (base_off <= -0x1000)
17083 base_off = -0x1000 + 1;
17084 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17087 /* Offset of the first STP/LDP. */
17088 new_off_1 = off_val_1 - base_off;
17090 /* Offset of the second STP/LDP. */
17091 new_off_3 = off_val_3 - base_off;
17093 /* The offsets must be within the range of the LDP/STP instructions. */
17094 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17095 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17096 return false;
17098 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17099 new_off_1), true);
17100 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17101 new_off_1 + msize), true);
17102 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17103 new_off_3), true);
17104 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17105 new_off_3 + msize), true);
17107 if (!aarch64_mem_pair_operand (mem_1, mode)
17108 || !aarch64_mem_pair_operand (mem_3, mode))
17109 return false;
17111 if (code == ZERO_EXTEND)
17113 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17114 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17115 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17116 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17118 else if (code == SIGN_EXTEND)
17120 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17121 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17122 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17123 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17126 if (load)
17128 operands[0] = temp_operands[0];
17129 operands[1] = mem_1;
17130 operands[2] = temp_operands[2];
17131 operands[3] = mem_2;
17132 operands[4] = temp_operands[4];
17133 operands[5] = mem_3;
17134 operands[6] = temp_operands[6];
17135 operands[7] = mem_4;
17137 else
17139 operands[0] = mem_1;
17140 operands[1] = temp_operands[1];
17141 operands[2] = mem_2;
17142 operands[3] = temp_operands[3];
17143 operands[4] = mem_3;
17144 operands[5] = temp_operands[5];
17145 operands[6] = mem_4;
17146 operands[7] = temp_operands[7];
17149 /* Emit adjusting instruction. */
17150 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17151 /* Emit ldp/stp instructions. */
17152 t1 = gen_rtx_SET (operands[0], operands[1]);
17153 t2 = gen_rtx_SET (operands[2], operands[3]);
17154 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17155 t1 = gen_rtx_SET (operands[4], operands[5]);
17156 t2 = gen_rtx_SET (operands[6], operands[7]);
17157 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17158 return true;
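/* Illustrative sketch only (hypothetical helper; the +/-0x1000 clamping
   done by the function above is omitted for brevity): the base
   selection used by aarch64_gen_adjusted_ldpstp.  For the str example
   with msize == 4 and pair offsets 0x100 and 0x108 it returns 0x108,
   so the two STPs get immediates -8 and 0, both well inside the
   [-0x100, 0xfc] range for 4-byte accesses.  */
static long
aarch64_sketch_ldpstp_base (long off_val_1, long off_val_3, long msize)
{
  long base_off;

  /* Small modes aim for the midpoint of the two pair offsets; larger
     modes avoid negative immediates and start from the first access.  */
  if (msize <= 4)
    base_off = (off_val_1 + off_val_3) / 2;
  else
    base_off = off_val_1;

  /* Keep the base aligned with the accesses, rounding upwards, or use
     the slightly larger negative immediate range when already aligned
     (this mirrors the adjustment in the function above).  */
  if (base_off % msize != off_val_1 % msize)
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else if (msize <= 4)
    base_off += msize;

  return base_off;
}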
17161 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17162 it isn't worth branching around empty masked ops (including masked
17163 stores). */
17165 static bool
17166 aarch64_empty_mask_is_expensive (unsigned)
17168 return false;
17171 /* Return 1 if pseudo register should be created and used to hold
17172 GOT address for PIC code. */
17174 bool
17175 aarch64_use_pseudo_pic_reg (void)
17177 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17180 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17182 static int
17183 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17185 switch (XINT (x, 1))
17187 case UNSPEC_GOTSMALLPIC:
17188 case UNSPEC_GOTSMALLPIC28K:
17189 case UNSPEC_GOTTINYPIC:
17190 return 0;
17191 default:
17192 break;
17195 return default_unspec_may_trap_p (x, flags);
17199 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17200 return the log2 of that value. Otherwise return -1. */
17202 int
17203 aarch64_fpconst_pow_of_2 (rtx x)
17205 const REAL_VALUE_TYPE *r;
17207 if (!CONST_DOUBLE_P (x))
17208 return -1;
17210 r = CONST_DOUBLE_REAL_VALUE (x);
17212 if (REAL_VALUE_NEGATIVE (*r)
17213 || REAL_VALUE_ISNAN (*r)
17214 || REAL_VALUE_ISINF (*r)
17215 || !real_isinteger (r, DFmode))
17216 return -1;
17218 return exact_log2 (real_to_integer (r));
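/* Behaviour of the function above on a few hypothetical inputs: a
   CONST_DOUBLE of 8.0 gives 3 (8 == 2^3) and 1.0 gives 0, while 6.0 (not a
   power of two), 0.5 (not an integer), -4.0 (negative) and NaN all
   give -1.  */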
17221 /* If X is a vector of equal CONST_DOUBLE values and that value is
17222 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17224 int
17225 aarch64_vec_fpconst_pow_of_2 (rtx x)
17227 int nelts;
17228 if (GET_CODE (x) != CONST_VECTOR
17229 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17230 return -1;
17232 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17233 return -1;
17235 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17236 if (firstval <= 0)
17237 return -1;
17239 for (int i = 1; i < nelts; i++)
17240 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17241 return -1;
17243 return firstval;
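/* For example (again with hypothetical constants): {4.0, 4.0} yields 2,
   {4.0, 2.0} yields -1 because the elements disagree, and {1.0, 1.0} also
   yields -1 since the check above insists on firstval > 0.  */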
17246 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17247 to float.
17249 __fp16 always promotes through this hook.
17250 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17251 through the generic excess precision logic rather than here. */
17253 static tree
17254 aarch64_promoted_type (const_tree t)
17256 if (SCALAR_FLOAT_TYPE_P (t)
17257 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17258 return float_type_node;
17260 return NULL_TREE;
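/* As an illustration of the hook above, with hypothetical user code:

	__fp16 a, b;
	... a + b ...

   both operands are promoted to float here, so the addition is performed in
   SFmode; _Float16 operands are left alone by this hook and are widened (or
   not) by the excess-precision logic in aarch64_excess_precision below.  */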
17263 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17265 static bool
17266 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17267 optimization_type opt_type)
17269 switch (op)
17271 case rsqrt_optab:
17272 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17274 default:
17275 return true;
17279 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17281 static unsigned int
17282 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17283 int *offset)
17285 /* Polynomial invariant 1 == (VG / 2) - 1. */
17286 gcc_assert (i == 1);
17287 *factor = 2;
17288 *offset = 1;
17289 return AARCH64_DWARF_VG;
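/* A worked example for the hook above, assuming a hypothetical 256-bit SVE
   implementation: VG (the number of 64-bit granules per vector) is 4 at run
   time, so indeterminate 1 evaluates to 4 / 2 - 1 == 1, and a poly_int such
   as (16, 16) -- the byte size of a full SVE data mode -- resolves to
   16 + 16 * 1 == 32 bytes.  */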
17292 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17293 if MODE is HFmode, and punt to the generic implementation otherwise. */
17295 static bool
17296 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17298 return (mode == HFmode
17299 ? true
17300 : default_libgcc_floating_mode_supported_p (mode));
17303 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17304 if MODE is HFmode, and punt to the generic implementation otherwise. */
17306 static bool
17307 aarch64_scalar_mode_supported_p (scalar_mode mode)
17309 return (mode == HFmode
17310 ? true
17311 : default_scalar_mode_supported_p (mode));
17314 /* Set the value of FLT_EVAL_METHOD.
17315 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17317 0: evaluate all operations and constants, whose semantic type has at
17318 most the range and precision of type float, to the range and
17319 precision of float; evaluate all other operations and constants to
17320 the range and precision of the semantic type;
17322 N, where _FloatN is a supported interchange floating type
17323 evaluate all operations and constants, whose semantic type has at
17324 most the range and precision of _FloatN type, to the range and
17325 precision of the _FloatN type; evaluate all other operations and
17326 constants to the range and precision of the semantic type;
17328 If we have the ARMv8.2-A extensions then we support _Float16 in native
17329 precision, so we should set this to 16. Otherwise, we support the type,
17330 but want to evaluate expressions in float precision, so set this to
17331 0. */
17333 static enum flt_eval_method
17334 aarch64_excess_precision (enum excess_precision_type type)
17336 switch (type)
17338 case EXCESS_PRECISION_TYPE_FAST:
17339 case EXCESS_PRECISION_TYPE_STANDARD:
17340 /* We can calculate either in 16-bit range and precision or
17341 32-bit range and precision. Make that decision based on whether
17342 we have native support for the ARMv8.2-A 16-bit floating-point
17343 instructions or not. */
17344 return (TARGET_FP_F16INST
17345 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17346 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17347 case EXCESS_PRECISION_TYPE_IMPLICIT:
17348 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17349 default:
17350 gcc_unreachable ();
17352 return FLT_EVAL_METHOD_UNPREDICTABLE;
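/* For instance, with hypothetical user code:

	_Float16 a, b, c;
	... a * b + c ...

   when TARGET_FP_F16INST is available FLT_EVAL_METHOD is 16 and the
   intermediate product stays in HFmode; otherwise both operations are
   evaluated in SFmode and only the final result is narrowed back to
   _Float16.  */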
17355 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17356 scheduled for speculative execution. Reject the long-running division
17357 and square-root instructions. */
17359 static bool
17360 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17362 switch (get_attr_type (insn))
17364 case TYPE_SDIV:
17365 case TYPE_UDIV:
17366 case TYPE_FDIVS:
17367 case TYPE_FDIVD:
17368 case TYPE_FSQRTS:
17369 case TYPE_FSQRTD:
17370 case TYPE_NEON_FP_SQRT_S:
17371 case TYPE_NEON_FP_SQRT_D:
17372 case TYPE_NEON_FP_SQRT_S_Q:
17373 case TYPE_NEON_FP_SQRT_D_Q:
17374 case TYPE_NEON_FP_DIV_S:
17375 case TYPE_NEON_FP_DIV_D:
17376 case TYPE_NEON_FP_DIV_S_Q:
17377 case TYPE_NEON_FP_DIV_D_Q:
17378 return false;
17379 default:
17380 return true;
17384 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17386 static int
17387 aarch64_compute_pressure_classes (reg_class *classes)
17389 int i = 0;
17390 classes[i++] = GENERAL_REGS;
17391 classes[i++] = FP_REGS;
17392 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17393 registers need to go in PR_LO_REGS at some point during their
17394 lifetime. Splitting it into two halves has the effect of making
17395 all predicates count against PR_LO_REGS, so that we try whenever
17396 possible to restrict the number of live predicates to 8. This
17397 greatly reduces the amount of spilling in certain loops. */
17398 classes[i++] = PR_LO_REGS;
17399 classes[i++] = PR_HI_REGS;
17400 return i;
17403 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17405 static bool
17406 aarch64_can_change_mode_class (machine_mode from,
17407 machine_mode to, reg_class_t)
17409 if (BYTES_BIG_ENDIAN)
17411 bool from_sve_p = aarch64_sve_data_mode_p (from);
17412 bool to_sve_p = aarch64_sve_data_mode_p (to);
17414 /* Don't allow changes between SVE data modes and non-SVE modes.
17415 See the comment at the head of aarch64-sve.md for details. */
17416 if (from_sve_p != to_sve_p)
17417 return false;
17419 /* Don't allow changes in element size: lane 0 of the new vector
17420 would not then be lane 0 of the old vector. See the comment
17421 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17422 description.
17424 In the worst case, this forces a register to be spilled in
17425 one mode and reloaded in the other, which handles the
17426 endianness correctly. */
17427 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17428 return false;
17430 return true;
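/* Illustrative cases for the hook above on big-endian (the mode names are
   just examples): VNx4SI <-> VNx4SF is allowed because both keep 4-byte
   elements, VNx4SI <-> VNx8HI is rejected because the element size changes
   from 4 to 2, and VNx4SI <-> TImode is rejected because it mixes an SVE
   data mode with a non-SVE mode.  On little-endian all of these are
   allowed.  */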
17433 /* Implement TARGET_EARLY_REMAT_MODES. */
17435 static void
17436 aarch64_select_early_remat_modes (sbitmap modes)
17438 /* SVE values are not normally live across a call, so it should be
17439 worth doing early rematerialization even in VL-specific mode. */
17440 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17442 machine_mode mode = (machine_mode) i;
17443 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17444 if (vec_flags & VEC_ANY_SVE)
17445 bitmap_set_bit (modes, i);
17449 /* Override the default target speculation_safe_value. */
17450 static rtx
17451 aarch64_speculation_safe_value (machine_mode mode,
17452 rtx result, rtx val, rtx failval)
17454 /* Maybe we should warn if falling back to hard barriers. They are
17455 likely to be noticeably more expensive than the alternative below. */
17456 if (!aarch64_track_speculation)
17457 return default_speculation_safe_value (mode, result, val, failval);
17459 if (!REG_P (val))
17460 val = copy_to_mode_reg (mode, val);
17462 if (!aarch64_reg_or_zero (failval, mode))
17463 failval = copy_to_mode_reg (mode, failval);
17465 switch (mode)
17467 case E_QImode:
17468 emit_insn (gen_despeculate_copyqi (result, val, failval));
17469 break;
17470 case E_HImode:
17471 emit_insn (gen_despeculate_copyhi (result, val, failval));
17472 break;
17473 case E_SImode:
17474 emit_insn (gen_despeculate_copysi (result, val, failval));
17475 break;
17476 case E_DImode:
17477 emit_insn (gen_despeculate_copydi (result, val, failval));
17478 break;
17479 case E_TImode:
17480 emit_insn (gen_despeculate_copyti (result, val, failval));
17481 break;
17482 default:
17483 gcc_unreachable ();
17485 return result;
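/* Sketch of the intended use, with hypothetical user code (the precise
   instruction sequence comes from the despeculate_copy<mode> patterns):

	int val = __builtin_speculation_safe_value (x, 0);

   compiled with -mtrack-speculation this emits a conditional select against
   the speculation-tracking register followed by a CSDB barrier, so VAL is
   forced to the failval on a mis-speculated path; without the option we
   simply fall back to default_speculation_safe_value and its generic
   speculation barrier.  */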
17488 /* Target-specific selftests. */
17490 #if CHECKING_P
17492 namespace selftest {
17494 /* Selftest for the RTL loader.
17495 Verify that the RTL loader copes with a dump from
17496 print_rtx_function. This is essentially just a test that class
17497 function_reader can handle a real dump, but it also verifies
17498 that lookup_reg_by_dump_name correctly handles hard regs.
17499 The presence of hard reg names in the dump means that the test is
17500 target-specific, hence it is in this file. */
17502 static void
17503 aarch64_test_loading_full_dump ()
17505 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17507 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17509 rtx_insn *insn_1 = get_insn_by_uid (1);
17510 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17512 rtx_insn *insn_15 = get_insn_by_uid (15);
17513 ASSERT_EQ (INSN, GET_CODE (insn_15));
17514 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17516 /* Verify crtl->return_rtx. */
17517 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17518 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17519 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17522 /* Run all target-specific selftests. */
17524 static void
17525 aarch64_run_selftests (void)
17527 aarch64_test_loading_full_dump ();
17530 } // namespace selftest
17532 #endif /* #if CHECKING_P */
17534 #undef TARGET_ADDRESS_COST
17535 #define TARGET_ADDRESS_COST aarch64_address_cost
17537 /* This hook determines whether unnamed bitfields affect the alignment
17538 of the containing structure. The hook returns true if the structure
17539 should inherit the alignment requirements of an unnamed bitfield's
17540 type. */
17541 #undef TARGET_ALIGN_ANON_BITFIELD
17542 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17544 #undef TARGET_ASM_ALIGNED_DI_OP
17545 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17547 #undef TARGET_ASM_ALIGNED_HI_OP
17548 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17550 #undef TARGET_ASM_ALIGNED_SI_OP
17551 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17553 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17554 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17555 hook_bool_const_tree_hwi_hwi_const_tree_true
17557 #undef TARGET_ASM_FILE_START
17558 #define TARGET_ASM_FILE_START aarch64_start_file
17560 #undef TARGET_ASM_OUTPUT_MI_THUNK
17561 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17563 #undef TARGET_ASM_SELECT_RTX_SECTION
17564 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17566 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17567 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17569 #undef TARGET_BUILD_BUILTIN_VA_LIST
17570 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17572 #undef TARGET_CALLEE_COPIES
17573 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17575 #undef TARGET_CAN_ELIMINATE
17576 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17578 #undef TARGET_CAN_INLINE_P
17579 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17581 #undef TARGET_CANNOT_FORCE_CONST_MEM
17582 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17584 #undef TARGET_CASE_VALUES_THRESHOLD
17585 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17587 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17588 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17590 /* Only the least significant bit is used for initialization guard
17591 variables. */
17592 #undef TARGET_CXX_GUARD_MASK_BIT
17593 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17595 #undef TARGET_C_MODE_FOR_SUFFIX
17596 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17598 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17599 #undef TARGET_DEFAULT_TARGET_FLAGS
17600 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17601 #endif
17603 #undef TARGET_CLASS_MAX_NREGS
17604 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17606 #undef TARGET_BUILTIN_DECL
17607 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17609 #undef TARGET_BUILTIN_RECIPROCAL
17610 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17612 #undef TARGET_C_EXCESS_PRECISION
17613 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17615 #undef TARGET_EXPAND_BUILTIN
17616 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17618 #undef TARGET_EXPAND_BUILTIN_VA_START
17619 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17621 #undef TARGET_FOLD_BUILTIN
17622 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17624 #undef TARGET_FUNCTION_ARG
17625 #define TARGET_FUNCTION_ARG aarch64_function_arg
17627 #undef TARGET_FUNCTION_ARG_ADVANCE
17628 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17630 #undef TARGET_FUNCTION_ARG_BOUNDARY
17631 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17633 #undef TARGET_FUNCTION_ARG_PADDING
17634 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17636 #undef TARGET_GET_RAW_RESULT_MODE
17637 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17638 #undef TARGET_GET_RAW_ARG_MODE
17639 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17641 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17642 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17644 #undef TARGET_FUNCTION_VALUE
17645 #define TARGET_FUNCTION_VALUE aarch64_function_value
17647 #undef TARGET_FUNCTION_VALUE_REGNO_P
17648 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17650 #undef TARGET_GIMPLE_FOLD_BUILTIN
17651 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17653 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17654 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17656 #undef TARGET_INIT_BUILTINS
17657 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17659 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17660 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17661 aarch64_ira_change_pseudo_allocno_class
17663 #undef TARGET_LEGITIMATE_ADDRESS_P
17664 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17666 #undef TARGET_LEGITIMATE_CONSTANT_P
17667 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17669 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17670 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17671 aarch64_legitimize_address_displacement
17673 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17674 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17676 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17677 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17678 aarch64_libgcc_floating_mode_supported_p
17680 #undef TARGET_MANGLE_TYPE
17681 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17683 #undef TARGET_MEMORY_MOVE_COST
17684 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17686 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17687 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17689 #undef TARGET_MUST_PASS_IN_STACK
17690 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17692 /* This target hook should return true if accesses to volatile bitfields
17693 should use the narrowest mode possible. It should return false if these
17694 accesses should use the bitfield container type. */
17695 #undef TARGET_NARROW_VOLATILE_BITFIELD
17696 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17698 #undef TARGET_OPTION_OVERRIDE
17699 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17701 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17702 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17703 aarch64_override_options_after_change
17705 #undef TARGET_OPTION_SAVE
17706 #define TARGET_OPTION_SAVE aarch64_option_save
17708 #undef TARGET_OPTION_RESTORE
17709 #define TARGET_OPTION_RESTORE aarch64_option_restore
17711 #undef TARGET_OPTION_PRINT
17712 #define TARGET_OPTION_PRINT aarch64_option_print
17714 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17715 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17717 #undef TARGET_SET_CURRENT_FUNCTION
17718 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17720 #undef TARGET_PASS_BY_REFERENCE
17721 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17723 #undef TARGET_PREFERRED_RELOAD_CLASS
17724 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17726 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17727 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17729 #undef TARGET_PROMOTED_TYPE
17730 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17732 #undef TARGET_SECONDARY_RELOAD
17733 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17735 #undef TARGET_SHIFT_TRUNCATION_MASK
17736 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17738 #undef TARGET_SETUP_INCOMING_VARARGS
17739 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17741 #undef TARGET_STRUCT_VALUE_RTX
17742 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17744 #undef TARGET_REGISTER_MOVE_COST
17745 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17747 #undef TARGET_RETURN_IN_MEMORY
17748 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17750 #undef TARGET_RETURN_IN_MSB
17751 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17753 #undef TARGET_RTX_COSTS
17754 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17756 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17757 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17759 #undef TARGET_SCHED_ISSUE_RATE
17760 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17762 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17763 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17764 aarch64_sched_first_cycle_multipass_dfa_lookahead
17766 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17767 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17768 aarch64_first_cycle_multipass_dfa_lookahead_guard
17770 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17771 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17772 aarch64_get_separate_components
17774 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17775 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17776 aarch64_components_for_bb
17778 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17779 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17780 aarch64_disqualify_components
17782 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17783 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17784 aarch64_emit_prologue_components
17786 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17787 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17788 aarch64_emit_epilogue_components
17790 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17791 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17792 aarch64_set_handled_components
17794 #undef TARGET_TRAMPOLINE_INIT
17795 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17797 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17798 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17800 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17801 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17803 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17804 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17805 aarch64_builtin_support_vector_misalignment
17807 #undef TARGET_ARRAY_MODE
17808 #define TARGET_ARRAY_MODE aarch64_array_mode
17810 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17811 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17813 #undef TARGET_VECTORIZE_ADD_STMT_COST
17814 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17816 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17817 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17818 aarch64_builtin_vectorization_cost
17820 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17821 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17823 #undef TARGET_VECTORIZE_BUILTINS
17824 #define TARGET_VECTORIZE_BUILTINS
17826 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17827 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17828 aarch64_builtin_vectorized_function
17830 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17831 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17832 aarch64_autovectorize_vector_sizes
17834 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17835 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17836 aarch64_atomic_assign_expand_fenv
17838 /* Section anchor support. */
17840 #undef TARGET_MIN_ANCHOR_OFFSET
17841 #define TARGET_MIN_ANCHOR_OFFSET -256
17843 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17844 byte offset; we can do much more for larger data types, but have no way
17845 to determine the size of the access. We assume accesses are aligned. */
17846 #undef TARGET_MAX_ANCHOR_OFFSET
17847 #define TARGET_MAX_ANCHOR_OFFSET 4095
17849 #undef TARGET_VECTOR_ALIGNMENT
17850 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17852 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17853 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17854 aarch64_vectorize_preferred_vector_alignment
17855 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17856 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17857 aarch64_simd_vector_alignment_reachable
17859 /* vec_perm support. */
17861 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17862 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17863 aarch64_vectorize_vec_perm_const
17865 #undef TARGET_VECTORIZE_GET_MASK_MODE
17866 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17867 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17868 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17869 aarch64_empty_mask_is_expensive
17870 #undef TARGET_PREFERRED_ELSE_VALUE
17871 #define TARGET_PREFERRED_ELSE_VALUE \
17872 aarch64_preferred_else_value
17874 #undef TARGET_INIT_LIBFUNCS
17875 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17877 #undef TARGET_FIXED_CONDITION_CODE_REGS
17878 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17880 #undef TARGET_FLAGS_REGNUM
17881 #define TARGET_FLAGS_REGNUM CC_REGNUM
17883 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17884 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17886 #undef TARGET_ASAN_SHADOW_OFFSET
17887 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17889 #undef TARGET_LEGITIMIZE_ADDRESS
17890 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17892 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17893 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17895 #undef TARGET_CAN_USE_DOLOOP_P
17896 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17898 #undef TARGET_SCHED_ADJUST_PRIORITY
17899 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17901 #undef TARGET_SCHED_MACRO_FUSION_P
17902 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17904 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17905 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17907 #undef TARGET_SCHED_FUSION_PRIORITY
17908 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17910 #undef TARGET_UNSPEC_MAY_TRAP_P
17911 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17913 #undef TARGET_USE_PSEUDO_PIC_REG
17914 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17916 #undef TARGET_PRINT_OPERAND
17917 #define TARGET_PRINT_OPERAND aarch64_print_operand
17919 #undef TARGET_PRINT_OPERAND_ADDRESS
17920 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17922 #undef TARGET_OPTAB_SUPPORTED_P
17923 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17925 #undef TARGET_OMIT_STRUCT_RETURN_REG
17926 #define TARGET_OMIT_STRUCT_RETURN_REG true
17928 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17929 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17930 aarch64_dwarf_poly_indeterminate_value
17932 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
17933 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17934 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17936 #undef TARGET_HARD_REGNO_NREGS
17937 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17938 #undef TARGET_HARD_REGNO_MODE_OK
17939 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17941 #undef TARGET_MODES_TIEABLE_P
17942 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17944 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17945 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17946 aarch64_hard_regno_call_part_clobbered
17948 #undef TARGET_CONSTANT_ALIGNMENT
17949 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17951 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17952 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17954 #undef TARGET_CAN_CHANGE_MODE_CLASS
17955 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17957 #undef TARGET_SELECT_EARLY_REMAT_MODES
17958 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17960 #undef TARGET_SPECULATION_SAFE_VALUE
17961 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
17963 #if CHECKING_P
17964 #undef TARGET_RUN_TARGET_SELFTESTS
17965 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17966 #endif /* #if CHECKING_P */
17968 struct gcc_target targetm = TARGET_INITIALIZER;
17970 #include "gt-aarch64.h"