[AArch64] Merge stores of D-register values with different modes
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob b75a588eb9aa49b1796161d34494d452a6742e4d
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
227 const char* name;
228 unsigned int flag;
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
251 /* Tuning parameters. */
253 static const struct cpu_addrcost_table generic_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
317 static const struct cpu_regmove_cost generic_regmove_cost =
319 1, /* GP2GP */
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
322 5, /* GP2FP */
323 5, /* FP2GP */
324 2 /* FP2FP */
327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
332 5, /* GP2FP */
333 5, /* FP2GP */
334 2 /* FP2FP */
337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (actually 4 and 9). */
352 9, /* GP2FP */
353 9, /* FP2GP */
354 1 /* FP2FP */
357 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 2, /* GP2GP */
360 2, /* GP2FP */
361 6, /* FP2GP */
362 4 /* FP2FP */
365 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 8, /* GP2FP */
371 8, /* FP2GP */
372 2 /* FP2FP */
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 2, /* GP2GP */
378 /* Avoid the use of int<->fp moves for spilling. */
379 6, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of int<->fp moves for spilling. */
388 8, /* GP2FP */
389 8, /* FP2GP */
390 4 /* FP2FP */
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost =
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
433 /* Cortex-A57 costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
453 static const struct cpu_vector_cost exynosm1_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* X-Gene 1 costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost =
515 1, /* Predictable. */
516 3 /* Unpredictable. */
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes =
522 AARCH64_APPROX_NONE, /* division */
523 AARCH64_APPROX_NONE, /* sqrt */
524 AARCH64_APPROX_NONE /* recip_sqrt */
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes =
530 AARCH64_APPROX_NONE, /* division */
531 AARCH64_APPROX_ALL, /* sqrt */
532 AARCH64_APPROX_ALL /* recip_sqrt */
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes =
538 AARCH64_APPROX_NONE, /* division */
539 AARCH64_APPROX_NONE, /* sqrt */
540 AARCH64_APPROX_ALL /* recip_sqrt */
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune =
546 0, /* num_slots */
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
564 4, /* num_slots */
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 512, /* l2_cache_size */
568 -1 /* default_opt_level */
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
573 8, /* num_slots */
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
580 static const cpu_prefetch_tune thunderx_prefetch_tune =
582 8, /* num_slots */
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
591 8, /* num_slots */
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
598 static const struct tune_params generic_tunings =
600 &cortexa57_extra_costs,
601 &generic_addrcost_table,
602 &generic_regmove_cost,
603 &generic_vector_cost,
604 &generic_branch_cost,
605 &generic_approx_modes,
606 4, /* memmov_cost */
607 2, /* issue_rate */
608 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
609 8, /* function_align. */
610 4, /* jump_align. */
611 8, /* loop_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
620 &generic_prefetch_tune
623 static const struct tune_params cortexa35_tunings =
625 &cortexa53_extra_costs,
626 &generic_addrcost_table,
627 &cortexa53_regmove_cost,
628 &generic_vector_cost,
629 &generic_branch_cost,
630 &generic_approx_modes,
631 4, /* memmov_cost */
632 1, /* issue_rate */
633 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
635 16, /* function_align. */
636 4, /* jump_align. */
637 8, /* loop_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
646 &generic_prefetch_tune
649 static const struct tune_params cortexa53_tunings =
651 &cortexa53_extra_costs,
652 &generic_addrcost_table,
653 &cortexa53_regmove_cost,
654 &generic_vector_cost,
655 &generic_branch_cost,
656 &generic_approx_modes,
657 4, /* memmov_cost */
658 2, /* issue_rate */
659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
661 16, /* function_align. */
662 4, /* jump_align. */
663 8, /* loop_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
672 &generic_prefetch_tune
675 static const struct tune_params cortexa57_tunings =
677 &cortexa57_extra_costs,
678 &generic_addrcost_table,
679 &cortexa57_regmove_cost,
680 &cortexa57_vector_cost,
681 &generic_branch_cost,
682 &generic_approx_modes,
683 4, /* memmov_cost */
684 3, /* issue_rate */
685 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
687 16, /* function_align. */
688 4, /* jump_align. */
689 8, /* loop_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
698 &generic_prefetch_tune
701 static const struct tune_params cortexa72_tunings =
703 &cortexa57_extra_costs,
704 &generic_addrcost_table,
705 &cortexa57_regmove_cost,
706 &cortexa57_vector_cost,
707 &generic_branch_cost,
708 &generic_approx_modes,
709 4, /* memmov_cost */
710 3, /* issue_rate */
711 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
713 16, /* function_align. */
714 4, /* jump_align. */
715 8, /* loop_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
724 &generic_prefetch_tune
727 static const struct tune_params cortexa73_tunings =
729 &cortexa57_extra_costs,
730 &generic_addrcost_table,
731 &cortexa57_regmove_cost,
732 &cortexa57_vector_cost,
733 &generic_branch_cost,
734 &generic_approx_modes,
735 4, /* memmov_cost. */
736 2, /* issue_rate. */
737 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
739 16, /* function_align. */
740 4, /* jump_align. */
741 8, /* loop_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
750 &generic_prefetch_tune
755 static const struct tune_params exynosm1_tunings =
757 &exynosm1_extra_costs,
758 &exynosm1_addrcost_table,
759 &exynosm1_regmove_cost,
760 &exynosm1_vector_cost,
761 &generic_branch_cost,
762 &exynosm1_approx_modes,
763 4, /* memmov_cost */
764 3, /* issue_rate */
765 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
766 4, /* function_align. */
767 4, /* jump_align. */
768 4, /* loop_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
777 &exynosm1_prefetch_tune
780 static const struct tune_params thunderxt88_tunings =
782 &thunderx_extra_costs,
783 &generic_addrcost_table,
784 &thunderx_regmove_cost,
785 &thunderx_vector_cost,
786 &generic_branch_cost,
787 &generic_approx_modes,
788 6, /* memmov_cost */
789 2, /* issue_rate */
790 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
791 8, /* function_align. */
792 8, /* jump_align. */
793 8, /* loop_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
802 &thunderxt88_prefetch_tune
805 static const struct tune_params thunderx_tunings =
807 &thunderx_extra_costs,
808 &generic_addrcost_table,
809 &thunderx_regmove_cost,
810 &thunderx_vector_cost,
811 &generic_branch_cost,
812 &generic_approx_modes,
813 6, /* memmov_cost */
814 2, /* issue_rate */
815 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
816 8, /* function_align. */
817 8, /* jump_align. */
818 8, /* loop_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
828 &thunderx_prefetch_tune
831 static const struct tune_params xgene1_tunings =
833 &xgene1_extra_costs,
834 &xgene1_addrcost_table,
835 &xgene1_regmove_cost,
836 &xgene1_vector_cost,
837 &generic_branch_cost,
838 &xgene1_approx_modes,
839 6, /* memmov_cost */
840 4, /* issue_rate */
841 AARCH64_FUSE_NOTHING, /* fusible_ops */
842 16, /* function_align. */
843 8, /* jump_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
853 &generic_prefetch_tune
856 static const struct tune_params qdf24xx_tunings =
858 &qdf24xx_extra_costs,
859 &generic_addrcost_table,
860 &qdf24xx_regmove_cost,
861 &generic_vector_cost,
862 &generic_branch_cost,
863 &generic_approx_modes,
864 4, /* memmov_cost */
865 4, /* issue_rate */
866 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
867 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
868 16, /* function_align. */
869 8, /* jump_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
879 &qdf24xx_prefetch_tune
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
883 for now. */
884 static const struct tune_params saphira_tunings =
886 &generic_extra_costs,
887 &generic_addrcost_table,
888 &generic_regmove_cost,
889 &generic_vector_cost,
890 &generic_branch_cost,
891 &generic_approx_modes,
892 4, /* memmov_cost */
893 4, /* issue_rate */
894 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
895 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
896 16, /* function_align. */
897 8, /* jump_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
907 &generic_prefetch_tune
910 static const struct tune_params thunderx2t99_tunings =
912 &thunderx2t99_extra_costs,
913 &thunderx2t99_addrcost_table,
914 &thunderx2t99_regmove_cost,
915 &thunderx2t99_vector_cost,
916 &generic_branch_cost,
917 &generic_approx_modes,
918 4, /* memmov_cost. */
919 4, /* issue_rate. */
920 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
922 16, /* function_align. */
923 8, /* jump_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
939 const char* name;
940 void (*parse_override)(const char*, struct tune_params*);
943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions[] =
949 { "fuse", aarch64_parse_fuse_string },
950 { "tune", aarch64_parse_tune_string },
951 { NULL, NULL }
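
/* These parsers implement the fine-grained -moverride option; the value
   names they accept are the ones listed in the tables above, so e.g.
   (roughly) -moverride=fuse=all or -moverride=tune=none selects the "all"
   and "none" entries of aarch64_fusible_pairs and aarch64_tuning_flags
   respectively.  */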
954 /* A processor implementing AArch64. */
955 struct processor
957 const char *const name;
958 enum aarch64_processor ident;
959 enum aarch64_processor sched_core;
960 enum aarch64_arch arch;
961 unsigned architecture_version;
962 const unsigned long flags;
963 const struct tune_params *const tune;
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures[] =
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores[] =
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
984 AARCH64_FL_FOR_ARCH8, &generic_tunings},
985 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor *selected_arch;
992 static const struct processor *selected_cpu;
993 static const struct processor *selected_tune;
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params = generic_tunings;
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1003 const char *const name;
1004 const unsigned long flags_on;
1005 const unsigned long flags_off;
1008 typedef enum aarch64_cond_code
1010 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1014 aarch64_cc;
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes[] =
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028 const char * branch_format)
1030 rtx_code_label * tmp_label = gen_label_rtx ();
1031 char label_buf[256];
1032 char buffer[128];
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034 CODE_LABEL_NUMBER (tmp_label));
1035 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036 rtx dest_label = operands[pos_label];
1037 operands[pos_label] = tmp_label;
1039 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040 output_asm_insn (buffer, operands);
1042 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043 operands[pos_label] = dest_label;
1044 output_asm_insn (buffer, operands);
1045 return "";
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1051 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054 else
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespective of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075 reg_class_t best_class)
1077 machine_mode mode;
1079 if (allocno_class != ALL_REGS)
1080 return allocno_class;
1082 if (best_class != ALL_REGS)
1083 return best_class;
1085 mode = PSEUDO_REGNO_MODE (regno);
1086 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
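
/* For example, with this hook a DFmode or V4SImode pseudo whose allocno
   and best classes are both ALL_REGS is given FP_REGS as its allocno
   class, while a DImode pseudo in the same situation gets GENERAL_REGS.  */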
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1092 if (GET_MODE_UNIT_SIZE (mode) == 4)
1093 return aarch64_tune_params.min_div_recip_mul_sf;
1094 return aarch64_tune_params.min_div_recip_mul_df;
1097 /* Return the reassociation width of treeop OPC with mode MODE. */
1098 static int
1099 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1101 if (VECTOR_MODE_P (mode))
1102 return aarch64_tune_params.vec_reassoc_width;
1103 if (INTEGRAL_MODE_P (mode))
1104 return aarch64_tune_params.int_reassoc_width;
1105 /* Avoid reassociating floating point addition so we emit more FMAs. */
1106 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1107 return aarch64_tune_params.fp_reassoc_width;
1108 return 1;
1111 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1112 unsigned
1113 aarch64_dbx_register_number (unsigned regno)
1115 if (GP_REGNUM_P (regno))
1116 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1117 else if (regno == SP_REGNUM)
1118 return AARCH64_DWARF_SP;
1119 else if (FP_REGNUM_P (regno))
1120 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1121 else if (PR_REGNUM_P (regno))
1122 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1123 else if (regno == VG_REGNUM)
1124 return AARCH64_DWARF_VG;
1126 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1127 equivalent DWARF register. */
1128 return DWARF_FRAME_REGISTERS;
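
/* Under the AArch64 DWARF register numbering this maps, for instance,
   x0-x30 to 0-30, sp to 31, v0-v31 to 64-95, the SVE predicate registers
   p0-p15 to 48-63 and the vector-granule register VG to 46; the exact
   values come from the AARCH64_DWARF_* constants in aarch64.h.  */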
1131 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1132 static bool
1133 aarch64_advsimd_struct_mode_p (machine_mode mode)
1135 return (TARGET_SIMD
1136 && (mode == OImode || mode == CImode || mode == XImode));
1139 /* Return true if MODE is an SVE predicate mode. */
1140 static bool
1141 aarch64_sve_pred_mode_p (machine_mode mode)
1143 return (TARGET_SVE
1144 && (mode == VNx16BImode
1145 || mode == VNx8BImode
1146 || mode == VNx4BImode
1147 || mode == VNx2BImode));
1150 /* Three mutually-exclusive flags describing a vector or predicate type. */
1151 const unsigned int VEC_ADVSIMD = 1;
1152 const unsigned int VEC_SVE_DATA = 2;
1153 const unsigned int VEC_SVE_PRED = 4;
1154 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1155 a structure of 2, 3 or 4 vectors. */
1156 const unsigned int VEC_STRUCT = 8;
1157 /* Useful combinations of the above. */
1158 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1159 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1161 /* Return a set of flags describing the vector properties of mode MODE.
1162 Ignore modes that are not supported by the current target. */
1163 static unsigned int
1164 aarch64_classify_vector_mode (machine_mode mode)
1166 if (aarch64_advsimd_struct_mode_p (mode))
1167 return VEC_ADVSIMD | VEC_STRUCT;
1169 if (aarch64_sve_pred_mode_p (mode))
1170 return VEC_SVE_PRED;
1172 scalar_mode inner = GET_MODE_INNER (mode);
1173 if (VECTOR_MODE_P (mode)
1174 && (inner == QImode
1175 || inner == HImode
1176 || inner == HFmode
1177 || inner == SImode
1178 || inner == SFmode
1179 || inner == DImode
1180 || inner == DFmode))
1182 if (TARGET_SVE)
1184 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1185 return VEC_SVE_DATA;
1186 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1187 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1188 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1189 return VEC_SVE_DATA | VEC_STRUCT;
1192 /* This includes V1DF but not V1DI (which doesn't exist). */
1193 if (TARGET_SIMD
1194 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1195 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1196 return VEC_ADVSIMD;
1199 return 0;
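
/* Some example classifications (assuming the usual variable-length SVE
   configuration): V8QImode and V4SImode are VEC_ADVSIMD, OImode (a pair
   of Advanced SIMD vectors) is VEC_ADVSIMD | VEC_STRUCT, VNx4SImode is
   VEC_SVE_DATA, VNx16BImode is VEC_SVE_PRED, and scalar modes such as
   DImode return 0.  */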
1202 /* Return true if MODE is any of the data vector modes, including
1203 structure modes. */
1204 static bool
1205 aarch64_vector_data_mode_p (machine_mode mode)
1207 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1210 /* Return true if MODE is an SVE data vector mode; either a single vector
1211 or a structure of vectors. */
1212 static bool
1213 aarch64_sve_data_mode_p (machine_mode mode)
1215 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1218 /* Implement target hook TARGET_ARRAY_MODE. */
1219 static opt_machine_mode
1220 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1222 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1223 && IN_RANGE (nelems, 2, 4))
1224 return mode_for_vector (GET_MODE_INNER (mode),
1225 GET_MODE_NUNITS (mode) * nelems);
1227 return opt_machine_mode ();
1230 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1231 static bool
1232 aarch64_array_mode_supported_p (machine_mode mode,
1233 unsigned HOST_WIDE_INT nelems)
1235 if (TARGET_SIMD
1236 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1237 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1238 && (nelems >= 2 && nelems <= 4))
1239 return true;
1241 return false;
1244 /* Return the SVE predicate mode to use for elements that have
1245 ELEM_NBYTES bytes, if such a mode exists. */
1247 opt_machine_mode
1248 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1250 if (TARGET_SVE)
1252 if (elem_nbytes == 1)
1253 return VNx16BImode;
1254 if (elem_nbytes == 2)
1255 return VNx8BImode;
1256 if (elem_nbytes == 4)
1257 return VNx4BImode;
1258 if (elem_nbytes == 8)
1259 return VNx2BImode;
1261 return opt_machine_mode ();
1264 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1266 static opt_machine_mode
1267 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1269 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1271 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1272 machine_mode pred_mode;
1273 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1274 return pred_mode;
1277 return default_get_mask_mode (nunits, nbytes);
1280 /* Implement TARGET_HARD_REGNO_NREGS. */
1282 static unsigned int
1283 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1285 /* ??? Logically we should only need to provide a value when
1286 HARD_REGNO_MODE_OK says that the combination is valid,
1287 but at the moment we need to handle all modes. Just ignore
1288 any runtime parts for registers that can't store them. */
1289 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1290 switch (aarch64_regno_regclass (regno))
1292 case FP_REGS:
1293 case FP_LO_REGS:
1294 if (aarch64_sve_data_mode_p (mode))
1295 return exact_div (GET_MODE_SIZE (mode),
1296 BYTES_PER_SVE_VECTOR).to_constant ();
1297 return CEIL (lowest_size, UNITS_PER_VREG);
1298 case PR_REGS:
1299 case PR_LO_REGS:
1300 case PR_HI_REGS:
1301 return 1;
1302 default:
1303 return CEIL (lowest_size, UNITS_PER_WORD);
1305 gcc_unreachable ();
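
/* For example, a TImode value occupies two GENERAL_REGS but only one FP
   register (UNITS_PER_VREG is 16 bytes), and a single SVE data vector such
   as VNx4SImode always occupies exactly one FP register regardless of the
   runtime vector length.  */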
1308 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1310 static bool
1311 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1313 if (GET_MODE_CLASS (mode) == MODE_CC)
1314 return regno == CC_REGNUM;
1316 if (regno == VG_REGNUM)
1317 /* This must have the same size as _Unwind_Word. */
1318 return mode == DImode;
1320 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1321 if (vec_flags & VEC_SVE_PRED)
1322 return PR_REGNUM_P (regno);
1324 if (PR_REGNUM_P (regno))
1325 return 0;
1327 if (regno == SP_REGNUM)
1328 /* The purpose of comparing with ptr_mode is to support the
1329 global register variable associated with the stack pointer
1330 register via the syntax of asm ("wsp") in ILP32. */
1331 return mode == Pmode || mode == ptr_mode;
1333 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1334 return mode == Pmode;
1336 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1337 return true;
1339 if (FP_REGNUM_P (regno))
1341 if (vec_flags & VEC_STRUCT)
1342 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1343 else
1344 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1347 return false;
1350 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1351 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1352 clobbers the top 64 bits when restoring the bottom 64 bits. */
1354 static bool
1355 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1357 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
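
/* For example, a 16-byte V4SFmode value live in a V register across a call
   is treated as partially clobbered (only its low 64 bits survive), whereas
   an 8-byte DFmode value in the same register is preserved.  */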
1360 /* Implement REGMODE_NATURAL_SIZE. */
1361 poly_uint64
1362 aarch64_regmode_natural_size (machine_mode mode)
1364 /* The natural size for SVE data modes is one SVE data vector,
1365 and similarly for predicates. We can't independently modify
1366 anything smaller than that. */
1367 /* ??? For now, only do this for variable-width SVE registers.
1368 Doing it for constant-sized registers breaks lower-subreg.c. */
1369 /* ??? And once that's fixed, we should probably have similar
1370 code for Advanced SIMD. */
1371 if (!aarch64_sve_vg.is_constant ())
1373 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1374 if (vec_flags & VEC_SVE_PRED)
1375 return BYTES_PER_SVE_PRED;
1376 if (vec_flags & VEC_SVE_DATA)
1377 return BYTES_PER_SVE_VECTOR;
1379 return UNITS_PER_WORD;
1382 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1383 machine_mode
1384 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1385 machine_mode mode)
1387 /* The predicate mode determines which bits are significant and
1388 which are "don't care". Decreasing the number of lanes would
1389 lose data while increasing the number of lanes would make bits
1390 unnecessarily significant. */
1391 if (PR_REGNUM_P (regno))
1392 return mode;
1393 if (known_ge (GET_MODE_SIZE (mode), 4))
1394 return mode;
1395 else
1396 return SImode;
1399 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1400 that strcpy from constants will be faster. */
1402 static HOST_WIDE_INT
1403 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1405 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1406 return MAX (align, BITS_PER_WORD);
1407 return align;
1410 /* Return true if calls to DECL should be treated as
1411 long-calls (i.e. called via a register). */
1412 static bool
1413 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1415 return false;
1418 /* Return true if calls to symbol-ref SYM should be treated as
1419 long-calls (i.e. called via a register). */
1420 bool
1421 aarch64_is_long_call_p (rtx sym)
1423 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1426 /* Return true if calls to symbol-ref SYM should not go through
1427 plt stubs. */
1429 bool
1430 aarch64_is_noplt_call_p (rtx sym)
1432 const_tree decl = SYMBOL_REF_DECL (sym);
1434 if (flag_pic
1435 && decl
1436 && (!flag_plt
1437 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1438 && !targetm.binds_local_p (decl))
1439 return true;
1441 return false;
1444 /* Return true if the offsets to a zero/sign-extract operation
1445 represent an expression that matches an extend operation. The
1446 operands represent the parameters from
1448 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1449 bool
1450 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1451 rtx extract_imm)
1453 HOST_WIDE_INT mult_val, extract_val;
1455 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1456 return false;
1458 mult_val = INTVAL (mult_imm);
1459 extract_val = INTVAL (extract_imm);
1461 if (extract_val > 8
1462 && extract_val < GET_MODE_BITSIZE (mode)
1463 && exact_log2 (extract_val & ~7) > 0
1464 && (extract_val & 7) <= 4
1465 && mult_val == (1 << (extract_val & 7)))
1466 return true;
1468 return false;
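
/* A worked example: for MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34
   the test above succeeds (34 & ~7 == 32 is a power of two, 34 & 7 == 2 and
   1 << 2 == 4), corresponding to a 32-bit value extended and then shifted
   left by two bits, which is the form combine produces for the shifted
   sign/zero-extend variants of extended-register operands.  */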
1471 /* Emit an insn that's a simple single-set. Both the operands must be
1472 known to be valid. */
1473 inline static rtx_insn *
1474 emit_set_insn (rtx x, rtx y)
1476 return emit_insn (gen_rtx_SET (x, y));
1479 /* X and Y are two things to compare using CODE. Emit the compare insn and
1480 return the rtx for register 0 in the proper mode. */
1482 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1484 machine_mode mode = SELECT_CC_MODE (code, x, y);
1485 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1487 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1488 return cc_reg;
1491 /* Build the SYMBOL_REF for __tls_get_addr. */
1493 static GTY(()) rtx tls_get_addr_libfunc;
1496 aarch64_tls_get_addr (void)
1498 if (!tls_get_addr_libfunc)
1499 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1500 return tls_get_addr_libfunc;
1503 /* Return the TLS model to use for ADDR. */
1505 static enum tls_model
1506 tls_symbolic_operand_type (rtx addr)
1508 enum tls_model tls_kind = TLS_MODEL_NONE;
1509 if (GET_CODE (addr) == CONST)
1511 poly_int64 addend;
1512 rtx sym = strip_offset (addr, &addend);
1513 if (GET_CODE (sym) == SYMBOL_REF)
1514 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1516 else if (GET_CODE (addr) == SYMBOL_REF)
1517 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1519 return tls_kind;
1522 /* We'll allow lo_sum's in our legitimate addresses so that combine
1523 will take care of combining addresses where necessary, but for
1524 generation purposes we'll generate the address
1525 as:
1526 RTL Absolute
1527 tmp = hi (symbol_ref); adrp x1, foo
1528 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1531 PIC TLS
1532 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1533 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1534 bl __tls_get_addr
1537 Load TLS symbol, depending on TLS mechanism and TLS access model.
1539 Global Dynamic - Traditional TLS:
1540 adrp tmp, :tlsgd:imm
1541 add dest, tmp, #:tlsgd_lo12:imm
1542 bl __tls_get_addr
1544 Global Dynamic - TLS Descriptors:
1545 adrp dest, :tlsdesc:imm
1546 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1547 add dest, dest, #:tlsdesc_lo12:imm
1548 blr tmp
1549 mrs tp, tpidr_el0
1550 add dest, dest, tp
1552 Initial Exec:
1553 mrs tp, tpidr_el0
1554 adrp tmp, :gottprel:imm
1555 ldr dest, [tmp, #:gottprel_lo12:imm]
1556 add dest, dest, tp
1558 Local Exec:
1559 mrs tp, tpidr_el0
1560 add t0, tp, #:tprel_hi12:imm, lsl #12
1561 add t0, t0, #:tprel_lo12_nc:imm
1564 static void
1565 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1566 enum aarch64_symbol_type type)
1568 switch (type)
1570 case SYMBOL_SMALL_ABSOLUTE:
1572 /* In ILP32, the mode of dest can be either SImode or DImode. */
1573 rtx tmp_reg = dest;
1574 machine_mode mode = GET_MODE (dest);
1576 gcc_assert (mode == Pmode || mode == ptr_mode);
1578 if (can_create_pseudo_p ())
1579 tmp_reg = gen_reg_rtx (mode);
1581 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1582 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1583 return;
1586 case SYMBOL_TINY_ABSOLUTE:
1587 emit_insn (gen_rtx_SET (dest, imm));
1588 return;
1590 case SYMBOL_SMALL_GOT_28K:
1592 machine_mode mode = GET_MODE (dest);
1593 rtx gp_rtx = pic_offset_table_rtx;
1594 rtx insn;
1595 rtx mem;
1597 /* NOTE: pic_offset_table_rtx can be NULL_RTX because we can reach
1598 here before RTL expansion. Tree IVOPTS generates RTL patterns to
1599 decide rtx costs, in which case pic_offset_table_rtx is not
1600 initialized. In that case there is no need to generate the first adrp
1601 instruction, as the final cost of a global variable access is
1602 one instruction. */
1603 if (gp_rtx != NULL)
1605 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1606 use the page base as the GOT base, the first page may be wasted;
1607 in the worst case there is only 28K of space for the GOT).
1609 The generated instruction sequence for accessing a global variable is:
1612 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1614 Only one instruction is needed, but we must initialize
1615 pic_offset_table_rtx properly. We generate an initialization insn for
1616 every global access and let CSE remove all redundant copies.
1618 The final instruction sequence will look like the following
1619 when accessing multiple global variables:
1621 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1623 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1624 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1625 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1626 ... */
1628 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1629 crtl->uses_pic_offset_table = 1;
1630 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1632 if (mode != GET_MODE (gp_rtx))
1633 gp_rtx = gen_lowpart (mode, gp_rtx);
1637 if (mode == ptr_mode)
1639 if (mode == DImode)
1640 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1641 else
1642 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1644 mem = XVECEXP (SET_SRC (insn), 0, 0);
1646 else
1648 gcc_assert (mode == Pmode);
1650 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1651 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1654 /* The operand is expected to be a MEM. Whenever the related insn
1655 pattern changes, the code above that calculates MEM should be
1656 updated. */
1657 gcc_assert (GET_CODE (mem) == MEM);
1658 MEM_READONLY_P (mem) = 1;
1659 MEM_NOTRAP_P (mem) = 1;
1660 emit_insn (insn);
1661 return;
1664 case SYMBOL_SMALL_GOT_4G:
1666 /* In ILP32, the mode of dest can be either SImode or DImode,
1667 while the got entry is always of SImode size. The mode of
1668 dest depends on how dest is used: if dest is assigned to a
1669 pointer (e.g. in the memory), it has SImode; it may have
1670 DImode if dest is dereferenced to access the memory.
1671 This is why we have to handle three different ldr_got_small
1672 patterns here (two patterns for ILP32). */
1674 rtx insn;
1675 rtx mem;
1676 rtx tmp_reg = dest;
1677 machine_mode mode = GET_MODE (dest);
1679 if (can_create_pseudo_p ())
1680 tmp_reg = gen_reg_rtx (mode);
1682 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1683 if (mode == ptr_mode)
1685 if (mode == DImode)
1686 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1687 else
1688 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1690 mem = XVECEXP (SET_SRC (insn), 0, 0);
1692 else
1694 gcc_assert (mode == Pmode);
1696 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1697 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1700 gcc_assert (GET_CODE (mem) == MEM);
1701 MEM_READONLY_P (mem) = 1;
1702 MEM_NOTRAP_P (mem) = 1;
1703 emit_insn (insn);
1704 return;
1707 case SYMBOL_SMALL_TLSGD:
1709 rtx_insn *insns;
1710 machine_mode mode = GET_MODE (dest);
1711 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1713 start_sequence ();
1714 if (TARGET_ILP32)
1715 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1716 else
1717 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1718 insns = get_insns ();
1719 end_sequence ();
1721 RTL_CONST_CALL_P (insns) = 1;
1722 emit_libcall_block (insns, dest, result, imm);
1723 return;
1726 case SYMBOL_SMALL_TLSDESC:
1728 machine_mode mode = GET_MODE (dest);
1729 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1730 rtx tp;
1732 gcc_assert (mode == Pmode || mode == ptr_mode);
1734 /* In ILP32, the got entry is always of SImode size. Unlike
1735 small GOT, the dest is fixed at reg 0. */
1736 if (TARGET_ILP32)
1737 emit_insn (gen_tlsdesc_small_si (imm));
1738 else
1739 emit_insn (gen_tlsdesc_small_di (imm));
1740 tp = aarch64_load_tp (NULL);
1742 if (mode != Pmode)
1743 tp = gen_lowpart (mode, tp);
1745 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1746 if (REG_P (dest))
1747 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1748 return;
1751 case SYMBOL_SMALL_TLSIE:
1753 /* In ILP32, the mode of dest can be either SImode or DImode,
1754 while the got entry is always of SImode size. The mode of
1755 dest depends on how dest is used: if dest is assigned to a
1756 pointer (e.g. in the memory), it has SImode; it may have
1757 DImode if dest is dereferenced to access the memory.
1758 This is why we have to handle three different tlsie_small
1759 patterns here (two patterns for ILP32). */
1760 machine_mode mode = GET_MODE (dest);
1761 rtx tmp_reg = gen_reg_rtx (mode);
1762 rtx tp = aarch64_load_tp (NULL);
1764 if (mode == ptr_mode)
1766 if (mode == DImode)
1767 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1768 else
1770 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1771 tp = gen_lowpart (mode, tp);
1774 else
1776 gcc_assert (mode == Pmode);
1777 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1780 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1781 if (REG_P (dest))
1782 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1783 return;
1786 case SYMBOL_TLSLE12:
1787 case SYMBOL_TLSLE24:
1788 case SYMBOL_TLSLE32:
1789 case SYMBOL_TLSLE48:
1791 machine_mode mode = GET_MODE (dest);
1792 rtx tp = aarch64_load_tp (NULL);
1794 if (mode != Pmode)
1795 tp = gen_lowpart (mode, tp);
1797 switch (type)
1799 case SYMBOL_TLSLE12:
1800 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1801 (dest, tp, imm));
1802 break;
1803 case SYMBOL_TLSLE24:
1804 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1805 (dest, tp, imm));
1806 break;
1807 case SYMBOL_TLSLE32:
1808 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1809 (dest, imm));
1810 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1811 (dest, dest, tp));
1812 break;
1813 case SYMBOL_TLSLE48:
1814 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1815 (dest, imm));
1816 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1817 (dest, dest, tp));
1818 break;
1819 default:
1820 gcc_unreachable ();
1823 if (REG_P (dest))
1824 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1825 return;
1828 case SYMBOL_TINY_GOT:
1829 emit_insn (gen_ldr_got_tiny (dest, imm));
1830 return;
1832 case SYMBOL_TINY_TLSIE:
1834 machine_mode mode = GET_MODE (dest);
1835 rtx tp = aarch64_load_tp (NULL);
1837 if (mode == ptr_mode)
1839 if (mode == DImode)
1840 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1841 else
1843 tp = gen_lowpart (mode, tp);
1844 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1847 else
1849 gcc_assert (mode == Pmode);
1850 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1853 if (REG_P (dest))
1854 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1855 return;
1858 default:
1859 gcc_unreachable ();
1863 /* Emit a move from SRC to DEST. Assume that the move expanders can
1864 handle all moves if !can_create_pseudo_p (). The distinction is
1865 important because, unlike emit_move_insn, the move expanders know
1866 how to force Pmode objects into the constant pool even when the
1867 constant pool address is not itself legitimate. */
1868 static rtx
1869 aarch64_emit_move (rtx dest, rtx src)
1871 return (can_create_pseudo_p ()
1872 ? emit_move_insn (dest, src)
1873 : emit_move_insn_1 (dest, src));
1876 /* Apply UNOPTAB to OP and store the result in DEST. */
1878 static void
1879 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1881 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1882 if (dest != tmp)
1883 emit_move_insn (dest, tmp);
1886 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1888 static void
1889 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1891 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1892 OPTAB_DIRECT);
1893 if (dest != tmp)
1894 emit_move_insn (dest, tmp);
1897 /* Split a 128-bit move operation into two 64-bit move operations,
1898 taking care to handle partial overlap of register to register
1899 copies. Special cases are needed when moving between GP regs and
1900 FP regs. SRC can be a register, constant or memory; DST a register
1901 or memory. If either operand is memory it must not have any side
1902 effects. */
1903 void
1904 aarch64_split_128bit_move (rtx dst, rtx src)
1906 rtx dst_lo, dst_hi;
1907 rtx src_lo, src_hi;
1909 machine_mode mode = GET_MODE (dst);
1911 gcc_assert (mode == TImode || mode == TFmode);
1912 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1913 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1915 if (REG_P (dst) && REG_P (src))
1917 int src_regno = REGNO (src);
1918 int dst_regno = REGNO (dst);
1920 /* Handle FP <-> GP regs. */
1921 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1923 src_lo = gen_lowpart (word_mode, src);
1924 src_hi = gen_highpart (word_mode, src);
1926 if (mode == TImode)
1928 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1929 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1931 else
1933 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1934 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1936 return;
1938 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1940 dst_lo = gen_lowpart (word_mode, dst);
1941 dst_hi = gen_highpart (word_mode, dst);
1943 if (mode == TImode)
1945 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1946 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1948 else
1950 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1951 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1953 return;
1957 dst_lo = gen_lowpart (word_mode, dst);
1958 dst_hi = gen_highpart (word_mode, dst);
1959 src_lo = gen_lowpart (word_mode, src);
1960 src_hi = gen_highpart_mode (word_mode, mode, src);
1962 /* At most one pairing may overlap. */
1963 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1965 aarch64_emit_move (dst_hi, src_hi);
1966 aarch64_emit_move (dst_lo, src_lo);
1968 else
1970 aarch64_emit_move (dst_lo, src_lo);
1971 aarch64_emit_move (dst_hi, src_hi);
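/* Overlap example (illustrative): splitting a TImode copy from the pair
   x0/x1 into x1/x2 leaves dst_lo (x1) overlapping src_hi (x1), so the
   code above emits the high-half move first (x2 <- x1) and only then
   the low-half move (x1 <- x0), preserving the source value.  */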
1975 bool
1976 aarch64_split_128bit_move_p (rtx dst, rtx src)
1978 return (! REG_P (src)
1979 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1982 /* Split a complex SIMD combine. */
1984 void
1985 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1987 machine_mode src_mode = GET_MODE (src1);
1988 machine_mode dst_mode = GET_MODE (dst);
1990 gcc_assert (VECTOR_MODE_P (dst_mode));
1991 gcc_assert (register_operand (dst, dst_mode)
1992 && register_operand (src1, src_mode)
1993 && register_operand (src2, src_mode));
1995 rtx (*gen) (rtx, rtx, rtx);
1997 switch (src_mode)
1999 case E_V8QImode:
2000 gen = gen_aarch64_simd_combinev8qi;
2001 break;
2002 case E_V4HImode:
2003 gen = gen_aarch64_simd_combinev4hi;
2004 break;
2005 case E_V2SImode:
2006 gen = gen_aarch64_simd_combinev2si;
2007 break;
2008 case E_V4HFmode:
2009 gen = gen_aarch64_simd_combinev4hf;
2010 break;
2011 case E_V2SFmode:
2012 gen = gen_aarch64_simd_combinev2sf;
2013 break;
2014 case E_DImode:
2015 gen = gen_aarch64_simd_combinedi;
2016 break;
2017 case E_DFmode:
2018 gen = gen_aarch64_simd_combinedf;
2019 break;
2020 default:
2021 gcc_unreachable ();
2024 emit_insn (gen (dst, src1, src2));
2025 return;
2028 /* Split a complex SIMD move. */
2030 void
2031 aarch64_split_simd_move (rtx dst, rtx src)
2033 machine_mode src_mode = GET_MODE (src);
2034 machine_mode dst_mode = GET_MODE (dst);
2036 gcc_assert (VECTOR_MODE_P (dst_mode));
2038 if (REG_P (dst) && REG_P (src))
2040 rtx (*gen) (rtx, rtx);
2042 gcc_assert (VECTOR_MODE_P (src_mode));
2044 switch (src_mode)
2046 case E_V16QImode:
2047 gen = gen_aarch64_split_simd_movv16qi;
2048 break;
2049 case E_V8HImode:
2050 gen = gen_aarch64_split_simd_movv8hi;
2051 break;
2052 case E_V4SImode:
2053 gen = gen_aarch64_split_simd_movv4si;
2054 break;
2055 case E_V2DImode:
2056 gen = gen_aarch64_split_simd_movv2di;
2057 break;
2058 case E_V8HFmode:
2059 gen = gen_aarch64_split_simd_movv8hf;
2060 break;
2061 case E_V4SFmode:
2062 gen = gen_aarch64_split_simd_movv4sf;
2063 break;
2064 case E_V2DFmode:
2065 gen = gen_aarch64_split_simd_movv2df;
2066 break;
2067 default:
2068 gcc_unreachable ();
2071 emit_insn (gen (dst, src));
2072 return;
2076 bool
2077 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2078 machine_mode ymode, rtx y)
2080 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2081 gcc_assert (r != NULL);
2082 return rtx_equal_p (x, r);
2086 static rtx
2087 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2089 if (can_create_pseudo_p ())
2090 return force_reg (mode, value);
2091 else
2093 gcc_assert (x);
2094 aarch64_emit_move (x, value);
2095 return x;
2099 /* Return true if we can move VALUE into a register using a single
2100 CNT[BHWD] instruction. */
2102 static bool
2103 aarch64_sve_cnt_immediate_p (poly_int64 value)
2105 HOST_WIDE_INT factor = value.coeffs[0];
2106 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2107 return (value.coeffs[1] == factor
2108 && IN_RANGE (factor, 2, 16 * 16)
2109 && (factor & 1) == 0
2110 && factor <= 16 * (factor & -factor));
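/* Illustrative values: (2, 2) is a plain CNTD and (6, 6) is
   "cntd ..., all, mul #3", so both pass the test above; (34, 34)
   fails, since it would need a multiplier of 17, outside [1, 16].  */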
2113 /* Likewise for rtx X. */
2115 bool
2116 aarch64_sve_cnt_immediate_p (rtx x)
2118 poly_int64 value;
2119 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2122 /* Return the asm string for an instruction with a CNT-like vector size
2123 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2124 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2125 first part of the operands template (the part that comes before the
2126 vector size itself). FACTOR is the number of quadwords.
2127 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2128 If it is zero, we can use any element size. */
2130 static char *
2131 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2132 unsigned int factor,
2133 unsigned int nelts_per_vq)
2135 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2137 if (nelts_per_vq == 0)
2138 /* There is some overlap in the ranges of the four CNT instructions.
2139 Here we always use the smallest possible element size, so that the
2140 multiplier is 1 wherever possible. */
2141 nelts_per_vq = factor & -factor;
2142 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2143 gcc_assert (IN_RANGE (shift, 1, 4));
2144 char suffix = "dwhb"[shift - 1];
2146 factor >>= shift;
2147 unsigned int written;
2148 if (factor == 1)
2149 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2150 prefix, suffix, operands);
2151 else
2152 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2153 prefix, suffix, operands, factor);
2154 gcc_assert (written < sizeof (buffer));
2155 return buffer;
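/* Rough example: called with PREFIX "cnt", OPERANDS "%x0", FACTOR 12
   and NELTS_PER_VQ 0, the code above picks the W element size
   (12 = 4 * 3) and returns "cntw\t%x0, all, mul #3", while FACTOR 16
   collapses to a plain "cntb\t%x0".  */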
2158 /* Return the asm string for an instruction with a CNT-like vector size
2159 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2160 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2161 first part of the operands template (the part that comes before the
2162 vector size itself). X is the value of the vector size operand,
2163 as a polynomial integer rtx. */
2165 char *
2166 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2167 rtx x)
2169 poly_int64 value = rtx_to_poly_int64 (x);
2170 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2171 return aarch64_output_sve_cnt_immediate (prefix, operands,
2172 value.coeffs[1], 0);
2175 /* Return true if we can add VALUE to a register using a single ADDVL
2176 or ADDPL instruction. */
2178 static bool
2179 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2181 HOST_WIDE_INT factor = value.coeffs[0];
2182 if (factor == 0 || value.coeffs[1] != factor)
2183 return false;
2184 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2185 and a value of 16 is one vector width. */
2186 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2187 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
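/* Illustrative values: (16, 16) is one full vector (ADDVL #1),
   (2, 2) is one predicate width (ADDPL #1) and (18, 18) is still
   representable as ADDPL #9.  An odd factor such as (1, 1), or a
   multiple outside ADDVL's [-32, 31] vectors and ADDPL's [-32, 31]
   predicates, is rejected.  */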
2190 /* Likewise for rtx X. */
2192 bool
2193 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2195 poly_int64 value;
2196 return (poly_int_rtx_p (x, &value)
2197 && aarch64_sve_addvl_addpl_immediate_p (value));
2200 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2201 and storing the result in operand 0. */
2203 char *
2204 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2206 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2207 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2208 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2210 /* Use INC or DEC if possible. */
2211 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2213 if (aarch64_sve_cnt_immediate_p (offset_value))
2214 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2215 offset_value.coeffs[1], 0);
2216 if (aarch64_sve_cnt_immediate_p (-offset_value))
2217 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2218 -offset_value.coeffs[1], 0);
2221 int factor = offset_value.coeffs[1];
2222 if ((factor & 15) == 0)
2223 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2224 else
2225 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2226 return buffer;
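/* Sketch of the possible outputs (illustrative): with DEST equal to
   BASE, an offset of (32, 32) prints as "incb\t%x0, all, mul #2" and
   (-2, -2) as "decd\t%x0"; with distinct registers, (16, 16) becomes
   "addvl\t%x0, %x1, #1" and (2, 2) becomes "addpl\t%x0, %x1, #1".  */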
2229 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2230 instruction. If it is, store the number of elements in each vector
2231 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2232 factor in *FACTOR_OUT (if nonnull). */
2234 bool
2235 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2236 unsigned int *nelts_per_vq_out)
2238 rtx elt;
2239 poly_int64 value;
2241 if (!const_vec_duplicate_p (x, &elt)
2242 || !poly_int_rtx_p (elt, &value))
2243 return false;
2245 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2246 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2247 /* There's no vector INCB. */
2248 return false;
2250 HOST_WIDE_INT factor = value.coeffs[0];
2251 if (value.coeffs[1] != factor)
2252 return false;
2254 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2255 if ((factor % nelts_per_vq) != 0
2256 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2257 return false;
2259 if (factor_out)
2260 *factor_out = factor;
2261 if (nelts_per_vq_out)
2262 *nelts_per_vq_out = nelts_per_vq;
2263 return true;
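/* Example (illustrative): a VNx4SI duplicate of (8, 8) has four 32-bit
   elements per quadword, so FACTOR 8 = 4 * 2 is accepted and later
   prints as "incw ..., all, mul #2"; a byte vector has 16 elements per
   quadword and is rejected, since there is no vector INCB.  */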
2266 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2267 instruction. */
2269 bool
2270 aarch64_sve_inc_dec_immediate_p (rtx x)
2272 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2275 /* Return the asm template for an SVE vector INC or DEC instruction.
2276 OPERANDS gives the operands before the vector count and X is the
2277 value of the vector count operand itself. */
2279 char *
2280 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2282 int factor;
2283 unsigned int nelts_per_vq;
2284 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2285 gcc_unreachable ();
2286 if (factor < 0)
2287 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2288 nelts_per_vq);
2289 else
2290 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2291 nelts_per_vq);
2294 static int
2295 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2296 scalar_int_mode mode)
2298 int i;
2299 unsigned HOST_WIDE_INT val, val2, mask;
2300 int one_match, zero_match;
2301 int num_insns;
2303 val = INTVAL (imm);
2305 if (aarch64_move_imm (val, mode))
2307 if (generate)
2308 emit_insn (gen_rtx_SET (dest, imm));
2309 return 1;
2312 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2313 (with XXXX non-zero). In that case check to see if the move can be done in
2314 a smaller mode. */
2315 val2 = val & 0xffffffff;
2316 if (mode == DImode
2317 && aarch64_move_imm (val2, SImode)
2318 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2320 if (generate)
2321 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2323 /* Check whether we have to emit a second instruction by seeing
2324 if any of the upper 32 bits of the original DImode value are set. */
2325 if (val == val2)
2326 return 1;
2328 i = (val >> 48) ? 48 : 32;
2330 if (generate)
2331 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2332 GEN_INT ((val >> i) & 0xffff)));
2334 return 2;
2337 if ((val >> 32) == 0 || mode == SImode)
2339 if (generate)
2341 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2342 if (mode == SImode)
2343 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2344 GEN_INT ((val >> 16) & 0xffff)));
2345 else
2346 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2347 GEN_INT ((val >> 16) & 0xffff)));
2349 return 2;
2352 /* Remaining cases are all for DImode. */
2354 mask = 0xffff;
2355 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2356 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2357 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2358 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2360 if (zero_match != 2 && one_match != 2)
2362 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2363 For a 64-bit bitmask try whether changing 16 bits to all ones or
2364 zeroes creates a valid bitmask. To check any repeated bitmask,
2365 try using 16 bits from the other 32-bit half of val. */
2367 for (i = 0; i < 64; i += 16, mask <<= 16)
2369 val2 = val & ~mask;
2370 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2371 break;
2372 val2 = val | mask;
2373 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2374 break;
2375 val2 = val2 & ~mask;
2376 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2377 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2378 break;
2380 if (i != 64)
2382 if (generate)
2384 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2385 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2386 GEN_INT ((val >> i) & 0xffff)));
2388 return 2;
2392 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2393 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2394 otherwise skip zero bits. */
2396 num_insns = 1;
2397 mask = 0xffff;
2398 val2 = one_match > zero_match ? ~val : val;
2399 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2401 if (generate)
2402 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2403 ? (val | ~(mask << i))
2404 : (val & (mask << i)))));
2405 for (i += 16; i < 64; i += 16)
2407 if ((val2 & (mask << i)) == 0)
2408 continue;
2409 if (generate)
2410 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2411 GEN_INT ((val >> i) & 0xffff)));
2412 num_insns ++;
2415 return num_insns;
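/* Worked example (illustrative): for the DImode constant
   0x0000123456789abc none of the single-instruction or bitmask
   shortcuts apply, so the final loop emits

       mov  x0, #0x9abc
       movk x0, #0x5678, lsl #16
       movk x0, #0x1234, lsl #32

   skipping the all-zero top chunk, and the function returns 3.  */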
2418 /* Return whether imm is a 128-bit immediate which is simple enough to
2419 expand inline. */
2420 bool
2421 aarch64_mov128_immediate (rtx imm)
2423 if (GET_CODE (imm) == CONST_INT)
2424 return true;
2426 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2428 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2429 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2431 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2432 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2436 /* Return the number of temporary registers that aarch64_add_offset_1
2437 would need to add OFFSET to a register. */
2439 static unsigned int
2440 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2442 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2445 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2446 a non-polynomial OFFSET. MODE is the mode of the addition.
2447 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2448 be set and CFA adjustments added to the generated instructions.
2450 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2451 temporary if register allocation is already complete. This temporary
2452 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2453 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2454 the immediate again.
2456 Since this function may be used to adjust the stack pointer, we must
2457 ensure that it cannot cause transient stack deallocation (for example
2458 by first incrementing SP and then decrementing when adjusting by a
2459 large immediate). */
2461 static void
2462 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2463 rtx src, HOST_WIDE_INT offset, rtx temp1,
2464 bool frame_related_p, bool emit_move_imm)
2466 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2467 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2469 HOST_WIDE_INT moffset = abs_hwi (offset);
2470 rtx_insn *insn;
2472 if (!moffset)
2474 if (!rtx_equal_p (dest, src))
2476 insn = emit_insn (gen_rtx_SET (dest, src));
2477 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2479 return;
2482 /* Single instruction adjustment. */
2483 if (aarch64_uimm12_shift (moffset))
2485 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2486 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2487 return;
2490 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2491 and either:
2493 a) the offset cannot be loaded by a 16-bit move or
2494 b) there is no spare register into which we can move it. */
2495 if (moffset < 0x1000000
2496 && ((!temp1 && !can_create_pseudo_p ())
2497 || !aarch64_move_imm (moffset, mode)))
2499 HOST_WIDE_INT low_off = moffset & 0xfff;
2501 low_off = offset < 0 ? -low_off : low_off;
2502 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2503 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2504 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2505 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2506 return;
2509 /* Emit a move immediate if required and an addition/subtraction. */
2510 if (emit_move_imm)
2512 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2513 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2515 insn = emit_insn (offset < 0
2516 ? gen_sub3_insn (dest, src, temp1)
2517 : gen_add3_insn (dest, src, temp1));
2518 if (frame_related_p)
2520 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2521 rtx adj = plus_constant (mode, src, offset);
2522 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
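/* Sketch of the two-addition case (illustrative): adding 0x123456 after
   register allocation with no spare temporary splits into
   "add dest, src, #0x456" followed by "add dest, dest, #0x123000",
   both of which fit the shifted 12-bit immediate form and neither of
   which transiently deallocates the stack.  */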
2526 /* Return the number of temporary registers that aarch64_add_offset
2527 would need to move OFFSET into a register or add OFFSET to a register;
2528 ADD_P is true if we want the latter rather than the former. */
2530 static unsigned int
2531 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2533 /* This follows the same structure as aarch64_add_offset. */
2534 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2535 return 0;
2537 unsigned int count = 0;
2538 HOST_WIDE_INT factor = offset.coeffs[1];
2539 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2540 poly_int64 poly_offset (factor, factor);
2541 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2542 /* Need one register for the ADDVL/ADDPL result. */
2543 count += 1;
2544 else if (factor != 0)
2546 factor = abs (factor);
2547 if (factor > 16 * (factor & -factor))
2548 /* Need one register for the CNT result and one for the multiplication
2549 factor. If necessary, the second temporary can be reused for the
2550 constant part of the offset. */
2551 return 2;
2552 /* Need one register for the CNT result (which might then
2553 be shifted). */
2554 count += 1;
2556 return count + aarch64_add_offset_1_temporaries (constant);
2559 /* If X can be represented as a poly_int64, return the number
2560 of temporaries that are required to add it to a register.
2561 Return -1 otherwise. */
2563 int
2564 aarch64_add_offset_temporaries (rtx x)
2566 poly_int64 offset;
2567 if (!poly_int_rtx_p (x, &offset))
2568 return -1;
2569 return aarch64_offset_temporaries (true, offset);
2572 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2573 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2574 be set and CFA adjustments added to the generated instructions.
2576 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2577 temporary if register allocation is already complete. This temporary
2578 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2579 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2580 false to avoid emitting the immediate again.
2582 TEMP2, if nonnull, is a second temporary register that doesn't
2583 overlap either DEST or REG.
2585 Since this function may be used to adjust the stack pointer, we must
2586 ensure that it cannot cause transient stack deallocation (for example
2587 by first incrementing SP and then decrementing when adjusting by a
2588 large immediate). */
2590 static void
2591 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2592 poly_int64 offset, rtx temp1, rtx temp2,
2593 bool frame_related_p, bool emit_move_imm = true)
2595 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2596 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2597 gcc_assert (temp1 == NULL_RTX
2598 || !frame_related_p
2599 || !reg_overlap_mentioned_p (temp1, dest));
2600 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2602 /* Try using ADDVL or ADDPL to add the whole value. */
2603 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2605 rtx offset_rtx = gen_int_mode (offset, mode);
2606 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2607 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2608 return;
2611 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2612 SVE vector register, over and above the minimum size of 128 bits.
2613 This is equivalent to half the value returned by CNTD with a
2614 vector shape of ALL. */
2615 HOST_WIDE_INT factor = offset.coeffs[1];
2616 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2618 /* Try using ADDVL or ADDPL to add the VG-based part. */
2619 poly_int64 poly_offset (factor, factor);
2620 if (src != const0_rtx
2621 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2623 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2624 if (frame_related_p)
2626 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2627 RTX_FRAME_RELATED_P (insn) = true;
2628 src = dest;
2630 else
2632 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2633 src = aarch64_force_temporary (mode, temp1, addr);
2634 temp1 = temp2;
2635 temp2 = NULL_RTX;
2638 /* Otherwise use a CNT-based sequence. */
2639 else if (factor != 0)
2641 /* Use a subtraction if we have a negative factor. */
2642 rtx_code code = PLUS;
2643 if (factor < 0)
2645 factor = -factor;
2646 code = MINUS;
2649 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2650 into the multiplication. */
2651 rtx val;
2652 int shift = 0;
2653 if (factor & 1)
2654 /* Use a right shift by 1. */
2655 shift = -1;
2656 else
2657 factor /= 2;
2658 HOST_WIDE_INT low_bit = factor & -factor;
2659 if (factor <= 16 * low_bit)
2661 if (factor > 16 * 8)
2663 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2664 the value with the minimum multiplier and shift it into
2665 position. */
2666 int extra_shift = exact_log2 (low_bit);
2667 shift += extra_shift;
2668 factor >>= extra_shift;
2670 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2672 else
2674 /* Use CNTD, then multiply it by FACTOR. */
2675 val = gen_int_mode (poly_int64 (2, 2), mode);
2676 val = aarch64_force_temporary (mode, temp1, val);
2678 /* Go back to using a negative multiplication factor if we have
2679 no register from which to subtract. */
2680 if (code == MINUS && src == const0_rtx)
2682 factor = -factor;
2683 code = PLUS;
2685 rtx coeff1 = gen_int_mode (factor, mode);
2686 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2687 val = gen_rtx_MULT (mode, val, coeff1);
2690 if (shift > 0)
2692 /* Multiply by 1 << SHIFT. */
2693 val = aarch64_force_temporary (mode, temp1, val);
2694 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2696 else if (shift == -1)
2698 /* Divide by 2. */
2699 val = aarch64_force_temporary (mode, temp1, val);
2700 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2703 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2704 if (src != const0_rtx)
2706 val = aarch64_force_temporary (mode, temp1, val);
2707 val = gen_rtx_fmt_ee (code, mode, src, val);
2709 else if (code == MINUS)
2711 val = aarch64_force_temporary (mode, temp1, val);
2712 val = gen_rtx_NEG (mode, val);
2715 if (constant == 0 || frame_related_p)
2717 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2718 if (frame_related_p)
2720 RTX_FRAME_RELATED_P (insn) = true;
2721 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2722 gen_rtx_SET (dest, plus_constant (Pmode, src,
2723 poly_offset)));
2725 src = dest;
2726 if (constant == 0)
2727 return;
2729 else
2731 src = aarch64_force_temporary (mode, temp1, val);
2732 temp1 = temp2;
2733 temp2 = NULL_RTX;
2736 emit_move_imm = true;
2739 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2740 frame_related_p, emit_move_imm);
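/* Worked example (illustrative): an offset of (80, 64), i.e. four
   vector lengths plus 16 bytes, splits into the VG-based part (64, 64),
   handled by "addvl dest, src, #4", followed by a call to
   aarch64_add_offset_1 for the remaining constant 16.  */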
2743 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2744 than a poly_int64. */
2746 void
2747 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2748 rtx offset_rtx, rtx temp1, rtx temp2)
2750 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2751 temp1, temp2, false);
2754 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2755 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2756 if TEMP1 already contains abs (DELTA). */
2758 static inline void
2759 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2761 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2762 temp1, temp2, true, emit_move_imm);
2765 /* Subtract DELTA from the stack pointer, marking the instructions
2766 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2767 if nonnull. */
2769 static inline void
2770 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2772 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2773 temp1, temp2, frame_related_p);
2776 /* Set DEST to (vec_series BASE STEP). */
2778 static void
2779 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2781 machine_mode mode = GET_MODE (dest);
2782 scalar_mode inner = GET_MODE_INNER (mode);
2784 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2785 if (!aarch64_sve_index_immediate_p (base))
2786 base = force_reg (inner, base);
2787 if (!aarch64_sve_index_immediate_p (step))
2788 step = force_reg (inner, step);
2790 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2793 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2794 integer of mode INT_MODE. Return true on success. */
2796 static bool
2797 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2798 rtx src)
2800 /* If the constant is smaller than 128 bits, we can do the move
2801 using a vector of SRC_MODEs. */
2802 if (src_mode != TImode)
2804 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2805 GET_MODE_SIZE (src_mode));
2806 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2807 emit_move_insn (gen_lowpart (dup_mode, dest),
2808 gen_const_vec_duplicate (dup_mode, src));
2809 return true;
2812 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2813 src = force_const_mem (src_mode, src);
2814 if (!src)
2815 return false;
2817 /* Make sure that the address is legitimate. */
2818 if (!aarch64_sve_ld1r_operand_p (src))
2820 rtx addr = force_reg (Pmode, XEXP (src, 0));
2821 src = replace_equiv_address (src, addr);
2824 machine_mode mode = GET_MODE (dest);
2825 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2826 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2827 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2828 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2829 emit_insn (gen_rtx_SET (dest, src));
2830 return true;
2833 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2834 isn't a simple duplicate or series. */
2836 static void
2837 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2839 machine_mode mode = GET_MODE (src);
2840 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2841 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2842 gcc_assert (npatterns > 1);
2844 if (nelts_per_pattern == 1)
2846 /* The constant is a repeating sequence of at least two elements,
2847 where the repeating elements occupy no more than 128 bits.
2848 Get an integer representation of the replicated value. */
2849 scalar_int_mode int_mode;
2850 if (BYTES_BIG_ENDIAN)
2851 /* For now, always use LD1RQ to load the value on big-endian
2852 targets, since the handling of smaller integers includes a
2853 subreg that is semantically an element reverse. */
2854 int_mode = TImode;
2855 else
2857 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2858 gcc_assert (int_bits <= 128);
2859 int_mode = int_mode_for_size (int_bits, 0).require ();
2861 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2862 if (int_value
2863 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2864 return;
2867 /* Expand each pattern individually. */
2868 rtx_vector_builder builder;
2869 auto_vec<rtx, 16> vectors (npatterns);
2870 for (unsigned int i = 0; i < npatterns; ++i)
2872 builder.new_vector (mode, 1, nelts_per_pattern);
2873 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2874 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2875 vectors.quick_push (force_reg (mode, builder.build ()));
2878 /* Use permutes to interleave the separate vectors. */
2879 while (npatterns > 1)
2881 npatterns /= 2;
2882 for (unsigned int i = 0; i < npatterns; ++i)
2884 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2885 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2886 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2887 vectors[i] = tmp;
2890 gcc_assert (vectors[0] == dest);
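/* Illustration: for a constant with four patterns of more than one
   element each, the loop above first builds four single-pattern
   vectors, then the ZIP1 stage interleaves them pairwise in two
   rounds, with the final round writing directly into DEST.  */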
2893 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2894 is a pattern that can be used to set DEST to a replicated scalar
2895 element. */
2897 void
2898 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2899 rtx (*gen_vec_duplicate) (rtx, rtx))
2901 machine_mode mode = GET_MODE (dest);
2903 /* Check on what type of symbol it is. */
2904 scalar_int_mode int_mode;
2905 if ((GET_CODE (imm) == SYMBOL_REF
2906 || GET_CODE (imm) == LABEL_REF
2907 || GET_CODE (imm) == CONST
2908 || GET_CODE (imm) == CONST_POLY_INT)
2909 && is_a <scalar_int_mode> (mode, &int_mode))
2911 rtx mem;
2912 poly_int64 offset;
2913 HOST_WIDE_INT const_offset;
2914 enum aarch64_symbol_type sty;
2916 /* If we have (const (plus symbol offset)), separate out the offset
2917 before we start classifying the symbol. */
2918 rtx base = strip_offset (imm, &offset);
2920 /* We must always add an offset involving VL separately, rather than
2921 folding it into the relocation. */
2922 if (!offset.is_constant (&const_offset))
2924 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2925 emit_insn (gen_rtx_SET (dest, imm));
2926 else
2928 /* Do arithmetic on 32-bit values if the result is smaller
2929 than that. */
2930 if (partial_subreg_p (int_mode, SImode))
2932 /* It is invalid to do symbol calculations in modes
2933 narrower than SImode. */
2934 gcc_assert (base == const0_rtx);
2935 dest = gen_lowpart (SImode, dest);
2936 int_mode = SImode;
2938 if (base != const0_rtx)
2940 base = aarch64_force_temporary (int_mode, dest, base);
2941 aarch64_add_offset (int_mode, dest, base, offset,
2942 NULL_RTX, NULL_RTX, false);
2944 else
2945 aarch64_add_offset (int_mode, dest, base, offset,
2946 dest, NULL_RTX, false);
2948 return;
2951 sty = aarch64_classify_symbol (base, const_offset);
2952 switch (sty)
2954 case SYMBOL_FORCE_TO_MEM:
2955 if (const_offset != 0
2956 && targetm.cannot_force_const_mem (int_mode, imm))
2958 gcc_assert (can_create_pseudo_p ());
2959 base = aarch64_force_temporary (int_mode, dest, base);
2960 aarch64_add_offset (int_mode, dest, base, const_offset,
2961 NULL_RTX, NULL_RTX, false);
2962 return;
2965 mem = force_const_mem (ptr_mode, imm);
2966 gcc_assert (mem);
2968 /* If we aren't generating PC relative literals, then
2969 we need to expand the literal pool access carefully.
2970 This is something that needs to be done in a number
2971 of places, so could well live as a separate function. */
2972 if (!aarch64_pcrelative_literal_loads)
2974 gcc_assert (can_create_pseudo_p ());
2975 base = gen_reg_rtx (ptr_mode);
2976 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2977 if (ptr_mode != Pmode)
2978 base = convert_memory_address (Pmode, base);
2979 mem = gen_rtx_MEM (ptr_mode, base);
2982 if (int_mode != ptr_mode)
2983 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2985 emit_insn (gen_rtx_SET (dest, mem));
2987 return;
2989 case SYMBOL_SMALL_TLSGD:
2990 case SYMBOL_SMALL_TLSDESC:
2991 case SYMBOL_SMALL_TLSIE:
2992 case SYMBOL_SMALL_GOT_28K:
2993 case SYMBOL_SMALL_GOT_4G:
2994 case SYMBOL_TINY_GOT:
2995 case SYMBOL_TINY_TLSIE:
2996 if (const_offset != 0)
2998 gcc_assert (can_create_pseudo_p ());
2999 base = aarch64_force_temporary (int_mode, dest, base);
3000 aarch64_add_offset (int_mode, dest, base, const_offset,
3001 NULL_RTX, NULL_RTX, false);
3002 return;
3004 /* FALLTHRU */
3006 case SYMBOL_SMALL_ABSOLUTE:
3007 case SYMBOL_TINY_ABSOLUTE:
3008 case SYMBOL_TLSLE12:
3009 case SYMBOL_TLSLE24:
3010 case SYMBOL_TLSLE32:
3011 case SYMBOL_TLSLE48:
3012 aarch64_load_symref_appropriately (dest, imm, sty);
3013 return;
3015 default:
3016 gcc_unreachable ();
3020 if (!CONST_INT_P (imm))
3022 rtx base, step, value;
3023 if (GET_CODE (imm) == HIGH
3024 || aarch64_simd_valid_immediate (imm, NULL))
3025 emit_insn (gen_rtx_SET (dest, imm));
3026 else if (const_vec_series_p (imm, &base, &step))
3027 aarch64_expand_vec_series (dest, base, step);
3028 else if (const_vec_duplicate_p (imm, &value))
3030 /* If the constant is out of range of an SVE vector move,
3031 load it from memory if we can, otherwise move it into
3032 a register and use a DUP. */
3033 scalar_mode inner_mode = GET_MODE_INNER (mode);
3034 rtx op = force_const_mem (inner_mode, value);
3035 if (!op)
3036 op = force_reg (inner_mode, value);
3037 else if (!aarch64_sve_ld1r_operand_p (op))
3039 rtx addr = force_reg (Pmode, XEXP (op, 0));
3040 op = replace_equiv_address (op, addr);
3042 emit_insn (gen_vec_duplicate (dest, op));
3044 else if (GET_CODE (imm) == CONST_VECTOR
3045 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3046 aarch64_expand_sve_const_vector (dest, imm);
3047 else
3049 rtx mem = force_const_mem (mode, imm);
3050 gcc_assert (mem);
3051 emit_move_insn (dest, mem);
3054 return;
3057 aarch64_internal_mov_immediate (dest, imm, true,
3058 as_a <scalar_int_mode> (mode));
3061 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3062 that is known to contain PTRUE. */
3064 void
3065 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3067 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3068 gen_rtvec (2, pred, src),
3069 UNSPEC_MERGE_PTRUE)));
3072 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3073 operand is in memory. In this case we need to use the predicated LD1
3074 and ST1 instead of LDR and STR, both for correctness on big-endian
3075 targets and because LD1 and ST1 support a wider range of addressing modes.
3076 PRED_MODE is the mode of the predicate.
3078 See the comment at the head of aarch64-sve.md for details about the
3079 big-endian handling. */
3081 void
3082 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3084 machine_mode mode = GET_MODE (dest);
3085 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3086 if (!register_operand (src, mode)
3087 && !register_operand (dest, mode))
3089 rtx tmp = gen_reg_rtx (mode);
3090 if (MEM_P (src))
3091 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3092 else
3093 emit_move_insn (tmp, src);
3094 src = tmp;
3096 aarch64_emit_sve_pred_move (dest, ptrue, src);
3099 /* Called only on big-endian targets. See whether an SVE vector move
3100 from SRC to DEST is effectively a REV[BHW] instruction, because at
3101 least one operand is a subreg of an SVE vector that has wider or
3102 narrower elements. Return true and emit the instruction if so.
3104 For example:
3106 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3108 represents a VIEW_CONVERT between the following vectors, viewed
3109 in memory order:
3111 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3112 R1: { [0], [1], [2], [3], ... }
3114 The high part of lane X in R2 should therefore correspond to lane X*2
3115 of R1, but the register representations are:
3117 msb lsb
3118 R2: ...... [1].high [1].low [0].high [0].low
3119 R1: ...... [3] [2] [1] [0]
3121 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3122 We therefore need a reverse operation to swap the high and low values
3123 around.
3125 This is purely an optimization. Without it we would spill the
3126 subreg operand to the stack in one mode and reload it in the
3127 other mode, which has the same effect as the REV. */
3129 bool
3130 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3132 gcc_assert (BYTES_BIG_ENDIAN);
3133 if (GET_CODE (dest) == SUBREG)
3134 dest = SUBREG_REG (dest);
3135 if (GET_CODE (src) == SUBREG)
3136 src = SUBREG_REG (src);
3138 /* The optimization handles two single SVE REGs with different element
3139 sizes. */
3140 if (!REG_P (dest)
3141 || !REG_P (src)
3142 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3143 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3144 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3145 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3146 return false;
3148 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3149 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3150 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3151 UNSPEC_REV_SUBREG);
3152 emit_insn (gen_rtx_SET (dest, unspec));
3153 return true;
3156 /* Return a copy of X with mode MODE, without changing its other
3157 attributes. Unlike gen_lowpart, this doesn't care whether the
3158 mode change is valid. */
3160 static rtx
3161 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3163 if (GET_MODE (x) == mode)
3164 return x;
3166 x = shallow_copy_rtx (x);
3167 set_mode_and_regno (x, mode, REGNO (x));
3168 return x;
3171 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3172 operands. */
3174 void
3175 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3177 /* Decide which REV operation we need. The mode with narrower elements
3178 determines the mode of the operands and the mode with the wider
3179 elements determines the reverse width. */
3180 machine_mode mode_with_wider_elts = GET_MODE (dest);
3181 machine_mode mode_with_narrower_elts = GET_MODE (src);
3182 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3183 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3184 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3186 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3187 unsigned int unspec;
3188 if (wider_bytes == 8)
3189 unspec = UNSPEC_REV64;
3190 else if (wider_bytes == 4)
3191 unspec = UNSPEC_REV32;
3192 else if (wider_bytes == 2)
3193 unspec = UNSPEC_REV16;
3194 else
3195 gcc_unreachable ();
3196 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3198 /* Emit:
3200 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3201 UNSPEC_MERGE_PTRUE))
3203 with the appropriate modes. */
3204 ptrue = gen_lowpart (pred_mode, ptrue);
3205 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3206 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3207 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3208 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3209 UNSPEC_MERGE_PTRUE);
3210 emit_insn (gen_rtx_SET (dest, src));
3213 static bool
3214 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3215 tree exp ATTRIBUTE_UNUSED)
3217 /* Currently, always true. */
3218 return true;
3221 /* Implement TARGET_PASS_BY_REFERENCE. */
3223 static bool
3224 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3225 machine_mode mode,
3226 const_tree type,
3227 bool named ATTRIBUTE_UNUSED)
3229 HOST_WIDE_INT size;
3230 machine_mode dummymode;
3231 int nregs;
3233 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3234 if (mode == BLKmode && type)
3235 size = int_size_in_bytes (type);
3236 else
3237 /* No frontends can create types with variable-sized modes, so we
3238 shouldn't be asked to pass or return them. */
3239 size = GET_MODE_SIZE (mode).to_constant ();
3241 /* Aggregates are passed by reference based on their size. */
3242 if (type && AGGREGATE_TYPE_P (type))
3244 size = int_size_in_bytes (type);
3247 /* Variable sized arguments are always returned by reference. */
3248 if (size < 0)
3249 return true;
3251 /* Can this be a candidate to be passed in fp/simd register(s)? */
3252 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3253 &dummymode, &nregs,
3254 NULL))
3255 return false;
3257 /* Arguments which are variable sized or larger than 2 registers are
3258 passed by reference unless they are a homogeneous floating-point
3259 aggregate. */
3260 return size > 2 * UNITS_PER_WORD;
3263 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3264 static bool
3265 aarch64_return_in_msb (const_tree valtype)
3267 machine_mode dummy_mode;
3268 int dummy_int;
3270 /* Never happens in little-endian mode. */
3271 if (!BYTES_BIG_ENDIAN)
3272 return false;
3274 /* Only composite types smaller than or equal to 16 bytes can
3275 be potentially returned in registers. */
3276 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3277 || int_size_in_bytes (valtype) <= 0
3278 || int_size_in_bytes (valtype) > 16)
3279 return false;
3281 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3282 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3283 is always passed/returned in the least significant bits of fp/simd
3284 register(s). */
3285 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3286 &dummy_mode, &dummy_int, NULL))
3287 return false;
3289 return true;
3292 /* Implement TARGET_FUNCTION_VALUE.
3293 Define how to find the value returned by a function. */
3295 static rtx
3296 aarch64_function_value (const_tree type, const_tree func,
3297 bool outgoing ATTRIBUTE_UNUSED)
3299 machine_mode mode;
3300 int unsignedp;
3301 int count;
3302 machine_mode ag_mode;
3304 mode = TYPE_MODE (type);
3305 if (INTEGRAL_TYPE_P (type))
3306 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3308 if (aarch64_return_in_msb (type))
3310 HOST_WIDE_INT size = int_size_in_bytes (type);
3312 if (size % UNITS_PER_WORD != 0)
3314 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3315 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3319 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3320 &ag_mode, &count, NULL))
3322 if (!aarch64_composite_type_p (type, mode))
3324 gcc_assert (count == 1 && mode == ag_mode);
3325 return gen_rtx_REG (mode, V0_REGNUM);
3327 else
3329 int i;
3330 rtx par;
3332 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3333 for (i = 0; i < count; i++)
3335 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3336 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3337 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3338 XVECEXP (par, 0, i) = tmp;
3340 return par;
3343 else
3344 return gen_rtx_REG (mode, R0_REGNUM);
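/* For instance (illustrative): a function returning
   struct { float a, b, c, d; } returns a four-member HFA, so the code
   above builds a PARALLEL of SFmode registers V0..V3 at byte offsets
   0, 4, 8 and 12; a plain int is simply returned in R0.  */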
3347 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3348 Return true if REGNO is the number of a hard register in which the values
3349 of a called function may come back. */
3351 static bool
3352 aarch64_function_value_regno_p (const unsigned int regno)
3354 /* Maximum of 16 bytes can be returned in the general registers. Examples
3355 of 16-byte return values are: 128-bit integers and 16-byte small
3356 structures (excluding homogeneous floating-point aggregates). */
3357 if (regno == R0_REGNUM || regno == R1_REGNUM)
3358 return true;
3360 /* Up to four fp/simd registers can return a function value, e.g. a
3361 homogeneous floating-point aggregate having four members. */
3362 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3363 return TARGET_FLOAT;
3365 return false;
3368 /* Implement TARGET_RETURN_IN_MEMORY.
3370 If the type T of the result of a function is such that
3371 void func (T arg)
3372 would require that arg be passed as a value in a register (or set of
3373 registers) according to the parameter passing rules, then the result
3374 is returned in the same registers as would be used for such an
3375 argument. */
3377 static bool
3378 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3380 HOST_WIDE_INT size;
3381 machine_mode ag_mode;
3382 int count;
3384 if (!AGGREGATE_TYPE_P (type)
3385 && TREE_CODE (type) != COMPLEX_TYPE
3386 && TREE_CODE (type) != VECTOR_TYPE)
3387 /* Simple scalar types are always returned in registers. */
3388 return false;
3390 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3391 type,
3392 &ag_mode,
3393 &count,
3394 NULL))
3395 return false;
3397 /* Types larger than 2 registers are returned in memory. */
3398 size = int_size_in_bytes (type);
3399 return (size < 0 || size > 2 * UNITS_PER_WORD);
3402 static bool
3403 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3404 const_tree type, int *nregs)
3406 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3407 return aarch64_vfp_is_call_or_return_candidate (mode,
3408 type,
3409 &pcum->aapcs_vfp_rmode,
3410 nregs,
3411 NULL);
3414 /* Given MODE and TYPE of a function argument, return the alignment in
3415 bits. The idea is to suppress any stronger alignment requested by
3416 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3417 This is a helper function for local use only. */
3419 static unsigned int
3420 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3422 if (!type)
3423 return GET_MODE_ALIGNMENT (mode);
3425 if (integer_zerop (TYPE_SIZE (type)))
3426 return 0;
3428 gcc_assert (TYPE_MODE (type) == mode);
3430 if (!AGGREGATE_TYPE_P (type))
3431 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3433 if (TREE_CODE (type) == ARRAY_TYPE)
3434 return TYPE_ALIGN (TREE_TYPE (type));
3436 unsigned int alignment = 0;
3437 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3438 if (TREE_CODE (field) == FIELD_DECL)
3439 alignment = std::max (alignment, DECL_ALIGN (field));
3441 return alignment;
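/* Rough example: for struct { char c; double d; } the loop above takes
   the maximum FIELD_DECL alignment of the members, here
   max (8, 64) = 64 bits, which is the natural AAPCS64 alignment used
   for laying out the argument.  */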
3444 /* Layout a function argument according to the AAPCS64 rules. The rule
3445 numbers refer to the rule numbers in the AAPCS64. */
3447 static void
3448 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3449 const_tree type,
3450 bool named ATTRIBUTE_UNUSED)
3452 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3453 int ncrn, nvrn, nregs;
3454 bool allocate_ncrn, allocate_nvrn;
3455 HOST_WIDE_INT size;
3457 /* We need to do this once per argument. */
3458 if (pcum->aapcs_arg_processed)
3459 return;
3461 pcum->aapcs_arg_processed = true;
3463 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3464 if (type)
3465 size = int_size_in_bytes (type);
3466 else
3467 /* No frontends can create types with variable-sized modes, so we
3468 shouldn't be asked to pass or return them. */
3469 size = GET_MODE_SIZE (mode).to_constant ();
3470 size = ROUND_UP (size, UNITS_PER_WORD);
3472 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3473 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3474 mode,
3475 type,
3476 &nregs);
3478 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3479 The following code thus handles passing by SIMD/FP registers first. */
3481 nvrn = pcum->aapcs_nvrn;
3483 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3484 and homogeneous short-vector aggregates (HVA). */
3485 if (allocate_nvrn)
3487 if (!TARGET_FLOAT)
3488 aarch64_err_no_fpadvsimd (mode, "argument");
3490 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3492 pcum->aapcs_nextnvrn = nvrn + nregs;
3493 if (!aarch64_composite_type_p (type, mode))
3495 gcc_assert (nregs == 1);
3496 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3498 else
3500 rtx par;
3501 int i;
3502 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3503 for (i = 0; i < nregs; i++)
3505 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3506 V0_REGNUM + nvrn + i);
3507 rtx offset = gen_int_mode
3508 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3509 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3510 XVECEXP (par, 0, i) = tmp;
3512 pcum->aapcs_reg = par;
3514 return;
3516 else
3518 /* C.3 NSRN is set to 8. */
3519 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3520 goto on_stack;
3524 ncrn = pcum->aapcs_ncrn;
3525 nregs = size / UNITS_PER_WORD;
3527 /* C6 - C9, though the sign and zero extension semantics are
3528 handled elsewhere. This is the case where the argument fits
3529 entirely in general registers. */
3530 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3533 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3535 /* C.8 if the argument has an alignment of 16 then the NGRN is
3536 rounded up to the next even number. */
3537 if (nregs == 2
3538 && ncrn % 2
3539 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3540 comparison is there because for > 16 * BITS_PER_UNIT
3541 alignment nregs should be > 2 and therefore it should be
3542 passed by reference rather than value. */
3543 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3545 ++ncrn;
3546 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3549 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3550 A reg is still generated for it, but the caller should be smart
3551 enough not to use it. */
3552 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3553 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3554 else
3556 rtx par;
3557 int i;
3559 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3560 for (i = 0; i < nregs; i++)
3562 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3563 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3564 GEN_INT (i * UNITS_PER_WORD));
3565 XVECEXP (par, 0, i) = tmp;
3567 pcum->aapcs_reg = par;
3570 pcum->aapcs_nextncrn = ncrn + nregs;
3571 return;
3574 /* C.11 */
3575 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3577 /* The argument is passed on the stack; record the needed number of words for
3578 this argument and align the total size if necessary. */
3579 on_stack:
3580 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3582 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3583 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3584 16 / UNITS_PER_WORD);
3585 return;
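/* Example of the SIMD/FP path (illustrative): a struct { double x, y; }
   argument is an HFA with NREGS == 2, so while at least two vector
   registers remain it is assigned a PARALLEL of two consecutive DFmode
   V registers; otherwise C.3 sets the NSRN to 8 and the argument is
   laid out on the stack instead.  */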
3588 /* Implement TARGET_FUNCTION_ARG. */
3590 static rtx
3591 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3592 const_tree type, bool named)
3594 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3595 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3597 if (mode == VOIDmode)
3598 return NULL_RTX;
3600 aarch64_layout_arg (pcum_v, mode, type, named);
3601 return pcum->aapcs_reg;
3604 void
3605 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3606 const_tree fntype ATTRIBUTE_UNUSED,
3607 rtx libname ATTRIBUTE_UNUSED,
3608 const_tree fndecl ATTRIBUTE_UNUSED,
3609 unsigned n_named ATTRIBUTE_UNUSED)
3611 pcum->aapcs_ncrn = 0;
3612 pcum->aapcs_nvrn = 0;
3613 pcum->aapcs_nextncrn = 0;
3614 pcum->aapcs_nextnvrn = 0;
3615 pcum->pcs_variant = ARM_PCS_AAPCS64;
3616 pcum->aapcs_reg = NULL_RTX;
3617 pcum->aapcs_arg_processed = false;
3618 pcum->aapcs_stack_words = 0;
3619 pcum->aapcs_stack_size = 0;
3621 if (!TARGET_FLOAT
3622 && fndecl && TREE_PUBLIC (fndecl)
3623 && fntype && fntype != error_mark_node)
3625 const_tree type = TREE_TYPE (fntype);
3626 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3627 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3628 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3629 &mode, &nregs, NULL))
3630 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3632 return;
3635 static void
3636 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3637 machine_mode mode,
3638 const_tree type,
3639 bool named)
3641 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3642 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3644 aarch64_layout_arg (pcum_v, mode, type, named);
3645 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3646 != (pcum->aapcs_stack_words != 0));
3647 pcum->aapcs_arg_processed = false;
3648 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3649 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3650 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3651 pcum->aapcs_stack_words = 0;
3652 pcum->aapcs_reg = NULL_RTX;
3656 bool
3657 aarch64_function_arg_regno_p (unsigned regno)
3659 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3660 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3663 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3664 PARM_BOUNDARY bits of alignment, but will be given anything up
3665 to STACK_BOUNDARY bits if the type requires it. This makes sure
3666 that both before and after the layout of each argument, the Next
3667 Stacked Argument Address (NSAA) will have a minimum alignment of
3668 8 bytes. */
3670 static unsigned int
3671 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3673 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3674 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3677 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3679 static fixed_size_mode
3680 aarch64_get_reg_raw_mode (int regno)
3682 if (TARGET_SVE && FP_REGNUM_P (regno))
3683 /* Don't use the SVE part of the register for __builtin_apply and
3684 __builtin_return. The SVE registers aren't used by the normal PCS,
3685 so using them there would be a waste of time. The PCS extensions
3686 for SVE types are fundamentally incompatible with the
3687 __builtin_return/__builtin_apply interface. */
3688 return as_a <fixed_size_mode> (V16QImode);
3689 return default_get_reg_raw_mode (regno);
3692 /* Implement TARGET_FUNCTION_ARG_PADDING.
3694 Small aggregate types are placed in the lowest memory address.
3696 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3698 static pad_direction
3699 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3701 /* On little-endian targets, the least significant byte of every stack
3702 argument is passed at the lowest byte address of the stack slot. */
3703 if (!BYTES_BIG_ENDIAN)
3704 return PAD_UPWARD;
3706 /* Otherwise, integral, floating-point and pointer types are padded downward:
3707 the least significant byte of a stack argument is passed at the highest
3708 byte address of the stack slot. */
3709 if (type
3710 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3711 || POINTER_TYPE_P (type))
3712 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3713 return PAD_DOWNWARD;
3715 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3716 return PAD_UPWARD;
3719 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3721 It specifies padding for the last (may also be the only)
3722 element of a block move between registers and memory. Assuming
3723 the block is in memory, padding upward means that the last
3724 element is padded after its most significant byte, while with
3725 downward padding the last element is padded on its least
3726 significant byte side.
3728 Small aggregates and small complex types are always padded
3729 upwards.
3731 We don't need to worry about homogeneous floating-point or
3732 short-vector aggregates; their move is not affected by the
3733 padding direction determined here. Regardless of endianness,
3734 each element of such an aggregate is put in the least
3735 significant bits of a fp/simd register.
3737 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3738 register has useful data, and return the opposite if the most
3739 significant byte does. */
3741 bool
3742 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3743 bool first ATTRIBUTE_UNUSED)
3746 /* Small composite types are always padded upward. */
3747 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3749 HOST_WIDE_INT size;
3750 if (type)
3751 size = int_size_in_bytes (type);
3752 else
3753 /* No frontends can create types with variable-sized modes, so we
3754 shouldn't be asked to pass or return them. */
3755 size = GET_MODE_SIZE (mode).to_constant ();
3756 if (size < 2 * UNITS_PER_WORD)
3757 return true;
3760 /* Otherwise, use the default padding. */
3761 return !BYTES_BIG_ENDIAN;
3764 static scalar_int_mode
3765 aarch64_libgcc_cmp_return_mode (void)
3767 return SImode;
3770 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3772 /* We use the 12-bit shifted immediate arithmetic instructions so values
3773 must be multiple of (1 << 12), i.e. 4096. */
3774 #define ARITH_FACTOR 4096
3776 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3777 #error Cannot use simple address calculation for stack probing
3778 #endif
3780 /* The pair of scratch registers used for stack probing. */
3781 #define PROBE_STACK_FIRST_REG 9
3782 #define PROBE_STACK_SECOND_REG 10
3784 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3785 inclusive. These are offsets from the current stack pointer. */
3787 static void
3788 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3790 HOST_WIDE_INT size;
3791 if (!poly_size.is_constant (&size))
3793 sorry ("stack probes for SVE frames");
3794 return;
3797 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3799 /* See the same assertion on PROBE_INTERVAL above. */
3800 gcc_assert ((first % ARITH_FACTOR) == 0);
3802 /* See if we have a constant small number of probes to generate. If so,
3803 that's the easy case. */
3804 if (size <= PROBE_INTERVAL)
3806 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3808 emit_set_insn (reg1,
3809 plus_constant (Pmode,
3810 stack_pointer_rtx, -(first + base)));
3811 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3814 /* The run-time loop is made up of 8 insns in the generic case while the
3815 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3816 else if (size <= 4 * PROBE_INTERVAL)
3818 HOST_WIDE_INT i, rem;
3820 emit_set_insn (reg1,
3821 plus_constant (Pmode,
3822 stack_pointer_rtx,
3823 -(first + PROBE_INTERVAL)));
3824 emit_stack_probe (reg1);
3826 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3827 it exceeds SIZE. If only two probes are needed, this will not
3828 generate any code. Then probe at FIRST + SIZE. */
3829 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3831 emit_set_insn (reg1,
3832 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3833 emit_stack_probe (reg1);
3836 rem = size - (i - PROBE_INTERVAL);
3837 if (rem > 256)
3839 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3841 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3842 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3844 else
3845 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3848 /* Otherwise, do the same as above, but in a loop. Note that we must be
3849 extra careful with variables wrapping around because we might be at
3850 the very top (or the very bottom) of the address space and we have
3851 to be able to handle this case properly; in particular, we use an
3852 equality test for the loop condition. */
3853 else
3855 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3857 /* Step 1: round SIZE to the previous multiple of the interval. */
3859 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3862 /* Step 2: compute initial and final value of the loop counter. */
3864 /* TEST_ADDR = SP + FIRST. */
3865 emit_set_insn (reg1,
3866 plus_constant (Pmode, stack_pointer_rtx, -first));
3868 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3869 HOST_WIDE_INT adjustment = - (first + rounded_size);
3870 if (! aarch64_uimm12_shift (adjustment))
3872 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3873 true, Pmode);
3874 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3876 else
3877 emit_set_insn (reg2,
3878 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3880 /* Step 3: the loop
3884	 do  { TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3885	       probe at TEST_ADDR }
3887	 while (TEST_ADDR != LAST_ADDR)
3889	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3890	 until it is equal to ROUNDED_SIZE. */
3892 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3895 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3896 that SIZE is equal to ROUNDED_SIZE. */
3898 if (size != rounded_size)
3900 HOST_WIDE_INT rem = size - rounded_size;
3902 if (rem > 256)
3904 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3906 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3907 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3909 else
3910 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3914 /* Make sure nothing is scheduled before we are done. */
3915 emit_insn (gen_blockage ());
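/* As a sketch of what the code above emits (illustrative, assuming
   PROBE_INTERVAL == 4096, FIRST == 0 and a constant SIZE of 8192, which
   takes the "compile-time loop" branch):

	sub	x9, sp, 4096
	str	xzr, [x9]
	sub	x9, x9, 4096
	str	xzr, [x9]

   where x9 is PROBE_STACK_FIRST_REG.  */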
3918 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3919 absolute addresses. */
3921 const char *
3922 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3924 static int labelno = 0;
3925 char loop_lab[32];
3926 rtx xops[2];
3928 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3930 /* Loop. */
3931 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3933 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3934 xops[0] = reg1;
3935 xops[1] = GEN_INT (PROBE_INTERVAL);
3936 output_asm_insn ("sub\t%0, %0, %1", xops);
3938 /* Probe at TEST_ADDR. */
3939 output_asm_insn ("str\txzr, [%0]", xops);
3941 /* Test if TEST_ADDR == LAST_ADDR. */
3942 xops[1] = reg2;
3943 output_asm_insn ("cmp\t%0, %1", xops);
3945 /* Branch. */
3946 fputs ("\tb.ne\t", asm_out_file);
3947 assemble_name_raw (asm_out_file, loop_lab);
3948 fputc ('\n', asm_out_file);
3950 return "";
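/* The resulting assembly is roughly (a sketch, assuming REG1 is x9,
   REG2 is x10 and PROBE_INTERVAL == 4096):

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
*/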
3953 /* Mark the registers that need to be saved by the callee and calculate
3954 the size of the callee-saved registers area and frame record (both FP
3955 and LR may be omitted). */
3956 static void
3957 aarch64_layout_frame (void)
3959 HOST_WIDE_INT offset = 0;
3960 int regno, last_fp_reg = INVALID_REGNUM;
3962 if (reload_completed && cfun->machine->frame.laid_out)
3963 return;
3965 /* Force a frame chain for EH returns so the return address is at FP+8. */
3966 cfun->machine->frame.emit_frame_chain
3967 = frame_pointer_needed || crtl->calls_eh_return;
3969 /* Emit a frame chain if the frame pointer is enabled.
3970 If -momit-leaf-frame-pointer is used, do not use a frame chain
3971 in leaf functions which do not use LR. */
3972 if (flag_omit_frame_pointer == 2
3973 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3974 && !df_regs_ever_live_p (LR_REGNUM)))
3975 cfun->machine->frame.emit_frame_chain = true;
3977 #define SLOT_NOT_REQUIRED (-2)
3978 #define SLOT_REQUIRED (-1)
3980 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3981 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3983 /* First mark all the registers that really need to be saved... */
3984 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3985 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3987 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3988 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3990 /* ... that includes the eh data registers (if needed)... */
3991 if (crtl->calls_eh_return)
3992 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3993 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3994 = SLOT_REQUIRED;
3996 /* ... and any callee saved register that dataflow says is live. */
3997 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3998 if (df_regs_ever_live_p (regno)
3999 && (regno == R30_REGNUM
4000 || !call_used_regs[regno]))
4001 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4003 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4004 if (df_regs_ever_live_p (regno)
4005 && !call_used_regs[regno])
4007 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4008 last_fp_reg = regno;
4011 if (cfun->machine->frame.emit_frame_chain)
4013 /* FP and LR are placed in the linkage record. */
4014 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4015 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4016 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4017 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4018 offset = 2 * UNITS_PER_WORD;
4021 /* Now assign stack slots for them. */
4022 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4023 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4025 cfun->machine->frame.reg_offset[regno] = offset;
4026 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4027 cfun->machine->frame.wb_candidate1 = regno;
4028 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4029 cfun->machine->frame.wb_candidate2 = regno;
4030 offset += UNITS_PER_WORD;
4033 HOST_WIDE_INT max_int_offset = offset;
4034 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4035 bool has_align_gap = offset != max_int_offset;
4037 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4038 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4040 /* If there is an alignment gap between integer and fp callee-saves,
4041 allocate the last fp register to it if possible. */
4042 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4044 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4045 break;
4048 cfun->machine->frame.reg_offset[regno] = offset;
4049 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4050 cfun->machine->frame.wb_candidate1 = regno;
4051 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4052 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4053 cfun->machine->frame.wb_candidate2 = regno;
4054 offset += UNITS_PER_WORD;
4057 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4059 cfun->machine->frame.saved_regs_size = offset;
4061 HOST_WIDE_INT varargs_and_saved_regs_size
4062 = offset + cfun->machine->frame.saved_varargs_size;
4064 cfun->machine->frame.hard_fp_offset
4065 = aligned_upper_bound (varargs_and_saved_regs_size
4066 + get_frame_size (),
4067 STACK_BOUNDARY / BITS_PER_UNIT);
4069 /* Both these values are already aligned. */
4070 gcc_assert (multiple_p (crtl->outgoing_args_size,
4071 STACK_BOUNDARY / BITS_PER_UNIT));
4072 cfun->machine->frame.frame_size
4073 = (cfun->machine->frame.hard_fp_offset
4074 + crtl->outgoing_args_size);
4076 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4078 cfun->machine->frame.initial_adjust = 0;
4079 cfun->machine->frame.final_adjust = 0;
4080 cfun->machine->frame.callee_adjust = 0;
4081 cfun->machine->frame.callee_offset = 0;
4083 HOST_WIDE_INT max_push_offset = 0;
4084 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4085 max_push_offset = 512;
4086 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4087 max_push_offset = 256;
4089 HOST_WIDE_INT const_size, const_fp_offset;
4090 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4091 && const_size < max_push_offset
4092 && known_eq (crtl->outgoing_args_size, 0))
4094 /* Simple, small frame with no outgoing arguments:
4095 stp reg1, reg2, [sp, -frame_size]!
4096 stp reg3, reg4, [sp, 16] */
4097 cfun->machine->frame.callee_adjust = const_size;
4099 else if (known_lt (crtl->outgoing_args_size
4100 + cfun->machine->frame.saved_regs_size, 512)
4101 && !(cfun->calls_alloca
4102 && known_lt (cfun->machine->frame.hard_fp_offset,
4103 max_push_offset)))
4105 /* Frame with small outgoing arguments:
4106 sub sp, sp, frame_size
4107 stp reg1, reg2, [sp, outgoing_args_size]
4108 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4109 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4110 cfun->machine->frame.callee_offset
4111 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4113 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4114 && const_fp_offset < max_push_offset)
4116 /* Frame with large outgoing arguments but a small local area:
4117 stp reg1, reg2, [sp, -hard_fp_offset]!
4118 stp reg3, reg4, [sp, 16]
4119 sub sp, sp, outgoing_args_size */
4120 cfun->machine->frame.callee_adjust = const_fp_offset;
4121 cfun->machine->frame.final_adjust
4122 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4124 else
4126 /* Frame with large local area and outgoing arguments using frame pointer:
4127 sub sp, sp, hard_fp_offset
4128 stp x29, x30, [sp, 0]
4129 add x29, sp, 0
4130 stp reg3, reg4, [sp, 16]
4131 sub sp, sp, outgoing_args_size */
4132 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4133 cfun->machine->frame.final_adjust
4134 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4137 cfun->machine->frame.laid_out = true;
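/* A worked example of the layout above (illustrative only): a function
   that saves just x29 and x30, uses 24 bytes of locals and has no
   outgoing arguments gets saved_regs_size == 16, hard_fp_offset == 48
   (40 rounded up to the 16-byte stack boundary) and frame_size == 48.
   Since 48 < max_push_offset and there are no outgoing arguments, this
   falls into the first, "simple, small frame" case and the prologue can
   allocate and save with a single "stp x29, x30, [sp, -48]!".  */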
4140 /* Return true if the register REGNO is saved on entry to
4141 the current function. */
4143 static bool
4144 aarch64_register_saved_on_entry (int regno)
4146 return cfun->machine->frame.reg_offset[regno] >= 0;
4149 /* Return the next register, from REGNO up to LIMIT, that the callee
4150 needs to save. */
4152 static unsigned
4153 aarch64_next_callee_save (unsigned regno, unsigned limit)
4155 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4156 regno ++;
4157 return regno;
4160 /* Push the register number REGNO of mode MODE to the stack with write-back
4161 adjusting the stack by ADJUSTMENT. */
4163 static void
4164 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4165 HOST_WIDE_INT adjustment)
4167 rtx base_rtx = stack_pointer_rtx;
4168 rtx insn, reg, mem;
4170 reg = gen_rtx_REG (mode, regno);
4171 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4172 plus_constant (Pmode, base_rtx, -adjustment));
4173 mem = gen_frame_mem (mode, mem);
4175 insn = emit_move_insn (mem, reg);
4176 RTX_FRAME_RELATED_P (insn) = 1;
4179 /* Generate and return an instruction to store the pair of registers
4180 REG and REG2 of mode MODE to location BASE with write-back adjusting
4181 the stack location BASE by ADJUSTMENT. */
4183 static rtx
4184 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4185 HOST_WIDE_INT adjustment)
4187 switch (mode)
4189 case E_DImode:
4190 return gen_storewb_pairdi_di (base, base, reg, reg2,
4191 GEN_INT (-adjustment),
4192 GEN_INT (UNITS_PER_WORD - adjustment));
4193 case E_DFmode:
4194 return gen_storewb_pairdf_di (base, base, reg, reg2,
4195 GEN_INT (-adjustment),
4196 GEN_INT (UNITS_PER_WORD - adjustment));
4197 default:
4198 gcc_unreachable ();
4202 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4203 stack pointer by ADJUSTMENT. */
4205 static void
4206 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4208 rtx_insn *insn;
4209 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4211 if (regno2 == INVALID_REGNUM)
4212 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4214 rtx reg1 = gen_rtx_REG (mode, regno1);
4215 rtx reg2 = gen_rtx_REG (mode, regno2);
4217 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4218 reg2, adjustment));
4219 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4220 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4221 RTX_FRAME_RELATED_P (insn) = 1;
4224 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4225 adjusting it by ADJUSTMENT afterwards. */
4227 static rtx
4228 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4229 HOST_WIDE_INT adjustment)
4231 switch (mode)
4233 case E_DImode:
4234 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4235 GEN_INT (UNITS_PER_WORD));
4236 case E_DFmode:
4237 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4238 GEN_INT (UNITS_PER_WORD));
4239 default:
4240 gcc_unreachable ();
4244 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4245 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4246 into CFI_OPS. */
4248 static void
4249 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4250 rtx *cfi_ops)
4252 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4253 rtx reg1 = gen_rtx_REG (mode, regno1);
4255 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4257 if (regno2 == INVALID_REGNUM)
4259 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4260 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4261 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4263 else
4265 rtx reg2 = gen_rtx_REG (mode, regno2);
4266 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4267 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4268 reg2, adjustment));
4272 /* Generate and return a store pair instruction of mode MODE to store
4273 register REG1 to MEM1 and register REG2 to MEM2. */
4275 static rtx
4276 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4277 rtx reg2)
4279 switch (mode)
4281 case E_DImode:
4282 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4284 case E_DFmode:
4285 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4287 default:
4288 gcc_unreachable ();
4292 /* Generate and return a load pair instruction of mode MODE to load register
4293 REG1 from MEM1 and register REG2 from MEM2. */
4295 static rtx
4296 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4297 rtx mem2)
4299 switch (mode)
4301 case E_DImode:
4302 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4304 case E_DFmode:
4305 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4307 default:
4308 gcc_unreachable ();
4312 /* Return TRUE if return address signing should be enabled for the current
4313 function, otherwise return FALSE. */
4315 bool
4316 aarch64_return_address_signing_enabled (void)
4318 /* This function should only be called after the frame is laid out. */
4319 gcc_assert (cfun->machine->frame.laid_out);
4321 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4322 if its LR is pushed onto the stack. */
4323 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4324 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4325 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
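/* For instance (a sketch of the intended behaviour): with
   -msign-return-address=all every function is signed, whereas with
   -msign-return-address=non-leaf (AARCH64_FUNCTION_NON_LEAF) a function
   is only signed when its LR has been allocated a save slot, i.e.
   reg_offset[LR_REGNUM] >= 0.  */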
4328 /* Emit code to save the callee-saved registers from register number START
4329 to LIMIT to the stack at the location starting at offset START_OFFSET,
4330 skipping any write-back candidates if SKIP_WB is true. */
4332 static void
4333 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4334 unsigned start, unsigned limit, bool skip_wb)
4336 rtx_insn *insn;
4337 unsigned regno;
4338 unsigned regno2;
4340 for (regno = aarch64_next_callee_save (start, limit);
4341 regno <= limit;
4342 regno = aarch64_next_callee_save (regno + 1, limit))
4344 rtx reg, mem;
4345 poly_int64 offset;
4347 if (skip_wb
4348 && (regno == cfun->machine->frame.wb_candidate1
4349 || regno == cfun->machine->frame.wb_candidate2))
4350 continue;
4352 if (cfun->machine->reg_is_wrapped_separately[regno])
4353 continue;
4355 reg = gen_rtx_REG (mode, regno);
4356 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4357 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4358 offset));
4360 regno2 = aarch64_next_callee_save (regno + 1, limit);
4362 if (regno2 <= limit
4363 && !cfun->machine->reg_is_wrapped_separately[regno2]
4364 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4365 == cfun->machine->frame.reg_offset[regno2]))
4368 rtx reg2 = gen_rtx_REG (mode, regno2);
4369 rtx mem2;
4371 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4372 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4373 offset));
4374 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4375 reg2));
4377 /* The first part of a frame-related parallel insn is
4378 always assumed to be relevant to the frame
4379 calculations; subsequent parts are only
4380 frame-related if explicitly marked. */
4381 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4382 regno = regno2;
4384 else
4385 insn = emit_move_insn (mem, reg);
4387 RTX_FRAME_RELATED_P (insn) = 1;
4391 /* Emit code to restore the callee registers of mode MODE from register
4392 number START up to and including LIMIT. Restore from the stack offset
4393 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4394 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4396 static void
4397 aarch64_restore_callee_saves (machine_mode mode,
4398 poly_int64 start_offset, unsigned start,
4399 unsigned limit, bool skip_wb, rtx *cfi_ops)
4401 rtx base_rtx = stack_pointer_rtx;
4402 unsigned regno;
4403 unsigned regno2;
4404 poly_int64 offset;
4406 for (regno = aarch64_next_callee_save (start, limit);
4407 regno <= limit;
4408 regno = aarch64_next_callee_save (regno + 1, limit))
4410 if (cfun->machine->reg_is_wrapped_separately[regno])
4411 continue;
4413 rtx reg, mem;
4415 if (skip_wb
4416 && (regno == cfun->machine->frame.wb_candidate1
4417 || regno == cfun->machine->frame.wb_candidate2))
4418 continue;
4420 reg = gen_rtx_REG (mode, regno);
4421 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4422 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4424 regno2 = aarch64_next_callee_save (regno + 1, limit);
4426 if (regno2 <= limit
4427 && !cfun->machine->reg_is_wrapped_separately[regno2]
4428 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4429 == cfun->machine->frame.reg_offset[regno2]))
4431 rtx reg2 = gen_rtx_REG (mode, regno2);
4432 rtx mem2;
4434 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4435 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4436 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4438 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4439 regno = regno2;
4441 else
4442 emit_move_insn (reg, mem);
4443 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4447 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4448 of MODE. */
4450 static inline bool
4451 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4453 HOST_WIDE_INT multiple;
4454 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4455 && IN_RANGE (multiple, -8, 7));
4458 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4459 of MODE. */
4461 static inline bool
4462 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4464 HOST_WIDE_INT multiple;
4465 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4466 && IN_RANGE (multiple, 0, 63));
4469 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4470 of MODE. */
4472 bool
4473 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4475 HOST_WIDE_INT multiple;
4476 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4477 && IN_RANGE (multiple, -64, 63));
4480 /* Return true if OFFSET is a signed 9-bit value. */
4482 static inline bool
4483 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4484 poly_int64 offset)
4486 HOST_WIDE_INT const_offset;
4487 return (offset.is_constant (&const_offset)
4488 && IN_RANGE (const_offset, -256, 255));
4491 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4492 of MODE. */
4494 static inline bool
4495 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4497 HOST_WIDE_INT multiple;
4498 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4499 && IN_RANGE (multiple, -256, 255));
4502 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4503 of MODE. */
4505 static inline bool
4506 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4508 HOST_WIDE_INT multiple;
4509 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4510 && IN_RANGE (multiple, 0, 4095));
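/* For a DImode (8-byte) access, the predicates above correspond roughly to
   the following byte-offset ranges (illustrative summary):

     offset_4bit_signed_scaled_p           -64 ... 56, multiples of 8
     offset_6bit_unsigned_scaled_p           0 ... 504, multiples of 8
     aarch64_offset_7bit_signed_scaled_p  -512 ... 504, multiples of 8 (LDP/STP)
     offset_9bit_signed_unscaled_p        -256 ... 255, any byte offset (LDUR/STUR)
     offset_9bit_signed_scaled_p         -2048 ... 2040, multiples of 8
     offset_12bit_unsigned_scaled_p          0 ... 32760, multiples of 8 (LDR/STR)  */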
4513 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4515 static sbitmap
4516 aarch64_get_separate_components (void)
4518 aarch64_layout_frame ();
4520 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4521 bitmap_clear (components);
4523 /* The registers that need to be saved to the frame. */
4524 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4525 if (aarch64_register_saved_on_entry (regno))
4527 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4528 if (!frame_pointer_needed)
4529 offset += cfun->machine->frame.frame_size
4530 - cfun->machine->frame.hard_fp_offset;
4531 /* Check that we can access the stack slot of the register with one
4532 direct load with no adjustments needed. */
4533 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4534 bitmap_set_bit (components, regno);
4537 /* Don't mess with the hard frame pointer. */
4538 if (frame_pointer_needed)
4539 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4541 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4542 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4543 /* If aarch64_layout_frame has chosen registers to store/restore with
4544 writeback don't interfere with them to avoid having to output explicit
4545 stack adjustment instructions. */
4546 if (reg2 != INVALID_REGNUM)
4547 bitmap_clear_bit (components, reg2);
4548 if (reg1 != INVALID_REGNUM)
4549 bitmap_clear_bit (components, reg1);
4551 bitmap_clear_bit (components, LR_REGNUM);
4552 bitmap_clear_bit (components, SP_REGNUM);
4554 return components;
4557 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4559 static sbitmap
4560 aarch64_components_for_bb (basic_block bb)
4562 bitmap in = DF_LIVE_IN (bb);
4563 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4564 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4566 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4567 bitmap_clear (components);
4569 /* A register is used in a bb if it is in the IN, GEN, or KILL sets. */
4570 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4571 if ((!call_used_regs[regno])
4572 && (bitmap_bit_p (in, regno)
4573 || bitmap_bit_p (gen, regno)
4574 || bitmap_bit_p (kill, regno)))
4576 unsigned regno2, offset, offset2;
4577 bitmap_set_bit (components, regno);
4579 /* If there is a callee-save register at an adjacent offset, add it too,
4580 to increase the chance of using LDP/STP. */
4581 offset = cfun->machine->frame.reg_offset[regno];
4582 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4584 if (regno2 <= LAST_SAVED_REGNUM)
4586 offset2 = cfun->machine->frame.reg_offset[regno2];
4587 if ((offset & ~8) == (offset2 & ~8))
4588 bitmap_set_bit (components, regno2);
4592 return components;
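/* As an example of the pairing heuristic above (illustrative only): if
   x19 is live in the block and was given the slot at offset 16, then
   regno2 is x20; if x20's slot is at offset 24 the two offsets agree once
   bit 3 is masked out ((16 & ~8) == (24 & ~8)), so x20 is marked as well
   and the pair can later be saved/restored with a single stp/ldp.  */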
4595 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4596 Nothing to do for aarch64. */
4598 static void
4599 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4603 /* Return the next set bit in BMP from START onwards. Return the total number
4604 of bits in BMP if no set bit is found at or after START. */
4606 static unsigned int
4607 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4609 unsigned int nbits = SBITMAP_SIZE (bmp);
4610 if (start == nbits)
4611 return start;
4613 gcc_assert (start < nbits);
4614 for (unsigned int i = start; i < nbits; i++)
4615 if (bitmap_bit_p (bmp, i))
4616 return i;
4618 return nbits;
4621 /* Do the work for aarch64_emit_prologue_components and
4622 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4623 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4624 for these components or the epilogue sequence. That is, it determines
4625 whether we should emit stores or loads and what kind of CFA notes to attach
4626 to the insns. Otherwise the logic for the two sequences is very
4627 similar. */
4629 static void
4630 aarch64_process_components (sbitmap components, bool prologue_p)
4632 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4633 ? HARD_FRAME_POINTER_REGNUM
4634 : STACK_POINTER_REGNUM);
4636 unsigned last_regno = SBITMAP_SIZE (components);
4637 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4638 rtx_insn *insn = NULL;
4640 while (regno != last_regno)
4642 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4643 so DFmode for the vector registers is enough. */
4644 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4645 rtx reg = gen_rtx_REG (mode, regno);
4646 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4647 if (!frame_pointer_needed)
4648 offset += cfun->machine->frame.frame_size
4649 - cfun->machine->frame.hard_fp_offset;
4650 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4651 rtx mem = gen_frame_mem (mode, addr);
4653 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4654 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4655 /* No more registers to handle after REGNO.
4656 Emit a single save/restore and exit. */
4657 if (regno2 == last_regno)
4659 insn = emit_insn (set);
4660 RTX_FRAME_RELATED_P (insn) = 1;
4661 if (prologue_p)
4662 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4663 else
4664 add_reg_note (insn, REG_CFA_RESTORE, reg);
4665 break;
4668 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4669 /* The next register is not of the same class or its offset is not
4670 mergeable with the current one into a pair. */
4671 if (!satisfies_constraint_Ump (mem)
4672 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4673 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4674 GET_MODE_SIZE (mode)))
4676 insn = emit_insn (set);
4677 RTX_FRAME_RELATED_P (insn) = 1;
4678 if (prologue_p)
4679 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4680 else
4681 add_reg_note (insn, REG_CFA_RESTORE, reg);
4683 regno = regno2;
4684 continue;
4687 /* REGNO2 can be saved/restored in a pair with REGNO. */
4688 rtx reg2 = gen_rtx_REG (mode, regno2);
4689 if (!frame_pointer_needed)
4690 offset2 += cfun->machine->frame.frame_size
4691 - cfun->machine->frame.hard_fp_offset;
4692 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4693 rtx mem2 = gen_frame_mem (mode, addr2);
4694 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4695 : gen_rtx_SET (reg2, mem2);
4697 if (prologue_p)
4698 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4699 else
4700 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4702 RTX_FRAME_RELATED_P (insn) = 1;
4703 if (prologue_p)
4705 add_reg_note (insn, REG_CFA_OFFSET, set);
4706 add_reg_note (insn, REG_CFA_OFFSET, set2);
4708 else
4710 add_reg_note (insn, REG_CFA_RESTORE, reg);
4711 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4714 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4718 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4720 static void
4721 aarch64_emit_prologue_components (sbitmap components)
4723 aarch64_process_components (components, true);
4726 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4728 static void
4729 aarch64_emit_epilogue_components (sbitmap components)
4731 aarch64_process_components (components, false);
4734 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4736 static void
4737 aarch64_set_handled_components (sbitmap components)
4739 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4740 if (bitmap_bit_p (components, regno))
4741 cfun->machine->reg_is_wrapped_separately[regno] = true;
4744 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4745 is saved at BASE + OFFSET. */
4747 static void
4748 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4749 rtx base, poly_int64 offset)
4751 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4752 add_reg_note (insn, REG_CFA_EXPRESSION,
4753 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4756 /* AArch64 stack frames generated by this compiler look like:
4758 +-------------------------------+
4760 | incoming stack arguments |
4762 +-------------------------------+
4763 | | <-- incoming stack pointer (aligned)
4764 | callee-allocated save area |
4765 | for register varargs |
4767 +-------------------------------+
4768 | local variables | <-- frame_pointer_rtx
4770 +-------------------------------+
4771 | padding0 | \
4772 +-------------------------------+ |
4773 | callee-saved registers | | frame.saved_regs_size
4774 +-------------------------------+ |
4775 | LR' | |
4776 +-------------------------------+ |
4777 | FP' | / <- hard_frame_pointer_rtx (aligned)
4778 +-------------------------------+
4779 | dynamic allocation |
4780 +-------------------------------+
4781 | padding |
4782 +-------------------------------+
4783 | outgoing stack arguments | <-- arg_pointer
4785 +-------------------------------+
4786 | | <-- stack_pointer_rtx (aligned)
4788 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4789 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4790 unchanged. */
4792 /* Generate the prologue instructions for entry into a function.
4793 Establish the stack frame by decreasing the stack pointer with a
4794 properly calculated size and, if necessary, create a frame record
4795 filled with the values of LR and previous frame pointer. The
4796 current FP is also set up if it is in use. */
4798 void
4799 aarch64_expand_prologue (void)
4801 aarch64_layout_frame ();
4803 poly_int64 frame_size = cfun->machine->frame.frame_size;
4804 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4805 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4806 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4807 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4808 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4809 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4810 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4811 rtx_insn *insn;
4813 /* Sign return address for functions. */
4814 if (aarch64_return_address_signing_enabled ())
4816 insn = emit_insn (gen_pacisp ());
4817 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4818 RTX_FRAME_RELATED_P (insn) = 1;
4821 if (flag_stack_usage_info)
4822 current_function_static_stack_size = constant_lower_bound (frame_size);
4824 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4826 if (crtl->is_leaf && !cfun->calls_alloca)
4828 if (maybe_gt (frame_size, PROBE_INTERVAL)
4829 && maybe_gt (frame_size, get_stack_check_protect ()))
4830 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4831 (frame_size
4832 - get_stack_check_protect ()));
4834 else if (maybe_gt (frame_size, 0))
4835 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4838 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4839 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4841 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4843 if (callee_adjust != 0)
4844 aarch64_push_regs (reg1, reg2, callee_adjust);
4846 if (emit_frame_chain)
4848 poly_int64 reg_offset = callee_adjust;
4849 if (callee_adjust == 0)
4851 reg1 = R29_REGNUM;
4852 reg2 = R30_REGNUM;
4853 reg_offset = callee_offset;
4854 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4856 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4857 stack_pointer_rtx, callee_offset,
4858 ip1_rtx, ip0_rtx, frame_pointer_needed);
4859 if (frame_pointer_needed && !frame_size.is_constant ())
4861 /* Variable-sized frames need to describe the save slot
4862 address using DW_CFA_expression rather than DW_CFA_offset.
4863 This means that, without taking further action, the
4864 locations of the registers that we've already saved would
4865 remain based on the stack pointer even after we redefine
4866 the CFA based on the frame pointer. We therefore need new
4867 DW_CFA_expressions to re-express the save slots with addresses
4868 based on the frame pointer. */
4869 rtx_insn *insn = get_last_insn ();
4870 gcc_assert (RTX_FRAME_RELATED_P (insn));
4872 /* Add an explicit CFA definition if this was previously
4873 implicit. */
4874 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4876 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4877 callee_offset);
4878 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4879 gen_rtx_SET (hard_frame_pointer_rtx, src));
4882 /* Change the save slot expressions for the registers that
4883 we've already saved. */
4884 reg_offset -= callee_offset;
4885 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4886 reg_offset + UNITS_PER_WORD);
4887 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4888 reg_offset);
4890 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4893 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4894 callee_adjust != 0 || emit_frame_chain);
4895 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4896 callee_adjust != 0 || emit_frame_chain);
4897 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4900 /* Return TRUE if we can use a simple_return insn.
4902 This function checks whether the callee-saved stack area is empty, which
4903 means no restore actions are needed. The pro_and_epilogue pass uses
4904 this to check whether shrink-wrapping is feasible. */
4906 bool
4907 aarch64_use_return_insn_p (void)
4909 if (!reload_completed)
4910 return false;
4912 if (crtl->profile)
4913 return false;
4915 aarch64_layout_frame ();
4917 return known_eq (cfun->machine->frame.frame_size, 0);
4920 /* Generate the epilogue instructions for returning from a function.
4921 This is almost exactly the reverse of the prologue sequence, except
4922 that we need to insert barriers to avoid scheduling loads that read
4923 from a deallocated stack, and we optimize the unwind records by
4924 emitting them all together if possible. */
4925 void
4926 aarch64_expand_epilogue (bool for_sibcall)
4928 aarch64_layout_frame ();
4930 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4931 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4932 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4933 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4934 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4935 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4936 rtx cfi_ops = NULL;
4937 rtx_insn *insn;
4938 /* A stack clash protection prologue may not have left IP0_REGNUM or
4939 IP1_REGNUM in a usable state. The same is true for allocations
4940 with an SVE component, since we then need both temporary registers
4941 for each allocation. */
4942 bool can_inherit_p = (initial_adjust.is_constant ()
4943 && final_adjust.is_constant ()
4944 && !flag_stack_clash_protection);
4946 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4947 bool need_barrier_p
4948 = maybe_ne (get_frame_size ()
4949 + cfun->machine->frame.saved_varargs_size, 0);
4951 /* Emit a barrier to prevent loads from a deallocated stack. */
4952 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4953 || cfun->calls_alloca
4954 || crtl->calls_eh_return)
4956 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4957 need_barrier_p = false;
4960 /* Restore the stack pointer from the frame pointer if it may not
4961 be the same as the stack pointer. */
4962 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4963 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4964 if (frame_pointer_needed
4965 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4966 /* If writeback is used when restoring callee-saves, the CFA
4967 is restored on the instruction doing the writeback. */
4968 aarch64_add_offset (Pmode, stack_pointer_rtx,
4969 hard_frame_pointer_rtx, -callee_offset,
4970 ip1_rtx, ip0_rtx, callee_adjust == 0);
4971 else
4972 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4973 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4975 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4976 callee_adjust != 0, &cfi_ops);
4977 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4978 callee_adjust != 0, &cfi_ops);
4980 if (need_barrier_p)
4981 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4983 if (callee_adjust != 0)
4984 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4986 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4988 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4989 insn = get_last_insn ();
4990 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4991 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4992 RTX_FRAME_RELATED_P (insn) = 1;
4993 cfi_ops = NULL;
4996 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4997 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4999 if (cfi_ops)
5001 /* Emit delayed restores and reset the CFA to be SP. */
5002 insn = get_last_insn ();
5003 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5004 REG_NOTES (insn) = cfi_ops;
5005 RTX_FRAME_RELATED_P (insn) = 1;
5008 /* We prefer to emit the combined return/authenticate instruction RETAA,
5009 however there are three cases in which we must instead emit an explicit
5010 authentication instruction.
5012 1) Sibcalls don't return in a normal way, so if we're about to call one
5013 we must authenticate.
5015 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5016 generating code for !TARGET_ARMV8_3 we can't use it and must
5017 explicitly authenticate.
5019 3) On an eh_return path we make extra stack adjustments to update the
5020 canonical frame address to be the exception handler's CFA. We want
5021 to authenticate using the CFA of the function which calls eh_return. */
5023 if (aarch64_return_address_signing_enabled ()
5024 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5026 insn = emit_insn (gen_autisp ());
5027 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5028 RTX_FRAME_RELATED_P (insn) = 1;
5031 /* Stack adjustment for exception handler. */
5032 if (crtl->calls_eh_return)
5034 /* We need to unwind the stack by the offset computed by
5035 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5036 to be SP; letting the CFA move during this adjustment
5037 is just as correct as retaining the CFA from the body
5038 of the function. Therefore, do nothing special. */
5039 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5042 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5043 if (!for_sibcall)
5044 emit_jump_insn (ret_rtx);
5047 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5048 normally or return to a previous frame after unwinding.
5050 An EH return uses a single shared return sequence. The epilogue is
5051 exactly like a normal epilogue except that it has an extra input
5052 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5053 that must be applied after the frame has been destroyed. An extra label
5054 is inserted before the epilogue which initializes this register to zero,
5055 and this is the entry point for a normal return.
5057 An actual EH return updates the return address, initializes the stack
5058 adjustment and jumps directly into the epilogue (bypassing the zeroing
5059 of the adjustment). Since the return address is typically saved on the
5060 stack when a function makes a call, the saved LR must be updated outside
5061 the epilogue.
5063 This poses problems as the store is generated well before the epilogue,
5064 so the offset of LR is not known yet. Also optimizations will remove the
5065 store as it appears dead, even after the epilogue is generated (as the
5066 base or offset for loading LR is different in many cases).
5068 To avoid these problems this implementation forces the frame pointer
5069 in eh_return functions so that the location of LR is fixed and known early.
5070 It also marks the store volatile, so no optimization is permitted to
5071 remove the store. */
5072 rtx
5073 aarch64_eh_return_handler_rtx (void)
5075 rtx tmp = gen_frame_mem (Pmode,
5076 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5078 /* Mark the store volatile, so no optimization is permitted to remove it. */
5079 MEM_VOLATILE_P (tmp) = true;
5080 return tmp;
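/* In other words (a sketch): because the frame pointer is forced for
   eh_return functions, the handler address is stored through a volatile
   MEM at [x29, 8], i.e. the LR slot of the frame record shown in the
   frame layout diagram above.  */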
5083 /* Output code to add DELTA to the first argument, and then jump
5084 to FUNCTION. Used for C++ multiple inheritance. */
5085 static void
5086 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5087 HOST_WIDE_INT delta,
5088 HOST_WIDE_INT vcall_offset,
5089 tree function)
5091 /* The this pointer is always in x0. Note that this differs from
5092 Arm where the this pointer may be bumped to r1 if r0 is required
5093 to return a pointer to an aggregate. On AArch64 a result value
5094 pointer will be in x8. */
5095 int this_regno = R0_REGNUM;
5096 rtx this_rtx, temp0, temp1, addr, funexp;
5097 rtx_insn *insn;
5099 reload_completed = 1;
5100 emit_note (NOTE_INSN_PROLOGUE_END);
5102 this_rtx = gen_rtx_REG (Pmode, this_regno);
5103 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5104 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5106 if (vcall_offset == 0)
5107 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5108 else
5110 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5112 addr = this_rtx;
5113 if (delta != 0)
5115 if (delta >= -256 && delta < 256)
5116 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5117 plus_constant (Pmode, this_rtx, delta));
5118 else
5119 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5120 temp1, temp0, false);
5123 if (Pmode == ptr_mode)
5124 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5125 else
5126 aarch64_emit_move (temp0,
5127 gen_rtx_ZERO_EXTEND (Pmode,
5128 gen_rtx_MEM (ptr_mode, addr)));
5130 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5131 addr = plus_constant (Pmode, temp0, vcall_offset);
5132 else
5134 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5135 Pmode);
5136 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5139 if (Pmode == ptr_mode)
5140 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5141 else
5142 aarch64_emit_move (temp1,
5143 gen_rtx_SIGN_EXTEND (Pmode,
5144 gen_rtx_MEM (ptr_mode, addr)));
5146 emit_insn (gen_add2_insn (this_rtx, temp1));
5149 /* Generate a tail call to the target function. */
5150 if (!TREE_USED (function))
5152 assemble_external (function);
5153 TREE_USED (function) = 1;
5155 funexp = XEXP (DECL_RTL (function), 0);
5156 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5157 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5158 SIBLING_CALL_P (insn) = 1;
5160 insn = get_insns ();
5161 shorten_branches (insn);
5162 final_start_function (insn, file, 1);
5163 final (insn, file, 1);
5164 final_end_function ();
5166 /* Stop pretending to be a post-reload pass. */
5167 reload_completed = 0;
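/* A typical thunk therefore assembles to something like (illustrative,
   for DELTA == 16, VCALL_OFFSET == 0 and a hypothetical target symbol
   _ZN1B1fEv):

	add	x0, x0, 16
	b	_ZN1B1fEv
*/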
5170 static bool
5171 aarch64_tls_referenced_p (rtx x)
5173 if (!TARGET_HAVE_TLS)
5174 return false;
5175 subrtx_iterator::array_type array;
5176 FOR_EACH_SUBRTX (iter, array, x, ALL)
5178 const_rtx x = *iter;
5179 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5180 return true;
5181 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5182 TLS offsets, not real symbol references. */
5183 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5184 iter.skip_subrtxes ();
5186 return false;
5190 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5191 a left shift of 0 or 12 bits. */
5192 bool
5193 aarch64_uimm12_shift (HOST_WIDE_INT val)
5195 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5196 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
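/* Examples (illustrative): 0xabc and 0xabc000 are accepted (the value
   fits entirely in the low or in the shifted 12-bit field), while
   0xabc001 and 0x1000000 are rejected because they span or lie outside
   both fields.  */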
5201 /* Return true if val is an immediate that can be loaded into a
5202 register by a MOVZ instruction. */
5203 static bool
5204 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5206 if (GET_MODE_SIZE (mode) > 4)
5208 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5209 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5210 return 1;
5212 else
5214 /* Ignore sign extension. */
5215 val &= (HOST_WIDE_INT) 0xffffffff;
5217 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5218 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
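/* Examples (illustrative): in DImode, 0xffff, 0xffff0000, 0xffff00000000
   and 0xffff000000000000 are all accepted (a single 16-bit chunk at an
   even 16-bit boundary), while 0x10001 is not.  In SImode the sign
   extension is masked off first, so 0xffffffffffff0000 reduces to
   0xffff0000 and is accepted.  */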
5221 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5222 64-bit (DImode) integer. */
5224 static unsigned HOST_WIDE_INT
5225 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5227 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5228 while (size < 64)
5230 val &= (HOST_WIDE_INT_1U << size) - 1;
5231 val |= val << size;
5232 size *= 2;
5234 return val;
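/* For example (illustrative): with a 16-bit inner mode and VAL == 0xf0
   the loop produces 0x00f000f000f000f0, and with a 32-bit inner mode and
   VAL == 0x80000001 it produces 0x8000000180000001.  */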
5237 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5239 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5241 0x0000000100000001ull,
5242 0x0001000100010001ull,
5243 0x0101010101010101ull,
5244 0x1111111111111111ull,
5245 0x5555555555555555ull,
5249 /* Return true if val is a valid bitmask immediate. */
5251 bool
5252 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5254 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5255 int bits;
5257 /* Check for a single sequence of one bits and return quickly if so.
5258 The special cases of all ones and all zeroes return false. */
5259 val = aarch64_replicate_bitmask_imm (val_in, mode);
5260 tmp = val + (val & -val);
5262 if (tmp == (tmp & -tmp))
5263 return (val + 1) > 1;
5265 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5266 if (mode == SImode)
5267 val = (val << 32) | (val & 0xffffffff);
5269 /* Invert if the immediate doesn't start with a zero bit - this means we
5270 only need to search for sequences of one bits. */
5271 if (val & 1)
5272 val = ~val;
5274 /* Find the first set bit and set tmp to val with the first sequence of one
5275 bits removed. Return success if there is a single sequence of ones. */
5276 first_one = val & -val;
5277 tmp = val & (val + first_one);
5279 if (tmp == 0)
5280 return true;
5282 /* Find the next set bit and compute the difference in bit position. */
5283 next_one = tmp & -tmp;
5284 bits = clz_hwi (first_one) - clz_hwi (next_one);
5285 mask = val ^ tmp;
5287 /* Check the bit position difference is a power of 2, and that the first
5288 sequence of one bits fits within 'bits' bits. */
5289 if ((mask >> bits) != 0 || bits != (bits & -bits))
5290 return false;
5292 /* Check the sequence of one bits is repeated 64/bits times. */
5293 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
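/* Examples (illustrative): 0x0003fffc (a single run of ones) and
   0x00ff00ff00ff00ff (8 ones repeating every 16 bits) are valid bitmask
   immediates, while 0, ~0 and 0x12345678 are not.  Tracing the general
   path for 0x0f0f0f0f0f0f0f0f: the value is inverted (it starts with a
   one bit), the first run of ones is stripped, the distance between runs
   gives bits == 8, and the final multiply check confirms that the 8-bit
   element repeats across all 64 bits, so the value is accepted.  */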
5296 /* Create a mask of ones covering the lowest to the highest bit set in VAL_IN.
5297 Assumed precondition: VAL_IN is not zero. */
5299 unsigned HOST_WIDE_INT
5300 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5302 int lowest_bit_set = ctz_hwi (val_in);
5303 int highest_bit_set = floor_log2 (val_in);
5304 gcc_assert (val_in != 0);
5306 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5307 (HOST_WIDE_INT_1U << lowest_bit_set));
5310 /* Create a constant in which all bits outside the range from the lowest set
5311 bit to the highest set bit of VAL_IN are set to 1. */
5313 unsigned HOST_WIDE_INT
5314 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5316 return val_in | ~aarch64_and_split_imm1 (val_in);
5319 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5321 bool
5322 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5324 scalar_int_mode int_mode;
5325 if (!is_a <scalar_int_mode> (mode, &int_mode))
5326 return false;
5328 if (aarch64_bitmask_imm (val_in, int_mode))
5329 return false;
5331 if (aarch64_move_imm (val_in, int_mode))
5332 return false;
5334 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5336 return aarch64_bitmask_imm (imm2, int_mode);
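/* A worked example (illustrative): VAL_IN == 0x00000000ff0000ff is
   neither a bitmask immediate nor a MOV immediate.  Here
   aarch64_and_split_imm1 returns 0x00000000ffffffff (ones from the
   lowest to the highest set bit) and aarch64_and_split_imm2 returns
   0xffffffffff0000ff; both are valid bitmask immediates and their
   bitwise AND equals VAL_IN, so the original AND can be split into two
   AND-immediate instructions.  */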
5339 /* Return true if val is an immediate that can be loaded into a
5340 register in a single instruction. */
5341 bool
5342 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5344 scalar_int_mode int_mode;
5345 if (!is_a <scalar_int_mode> (mode, &int_mode))
5346 return false;
5348 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5349 return 1;
5350 return aarch64_bitmask_imm (val, int_mode);
5353 static bool
5354 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5356 rtx base, offset;
5358 if (GET_CODE (x) == HIGH)
5359 return true;
5361 /* There's no way to calculate VL-based values using relocations. */
5362 subrtx_iterator::array_type array;
5363 FOR_EACH_SUBRTX (iter, array, x, ALL)
5364 if (GET_CODE (*iter) == CONST_POLY_INT)
5365 return true;
5367 split_const (x, &base, &offset);
5368 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5370 if (aarch64_classify_symbol (base, INTVAL (offset))
5371 != SYMBOL_FORCE_TO_MEM)
5372 return true;
5373 else
5374 /* Avoid generating a 64-bit relocation in ILP32; leave it
5375 to aarch64_expand_mov_immediate to handle it properly. */
5376 return mode != ptr_mode;
5379 return aarch64_tls_referenced_p (x);
5382 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5383 The expansion for a table switch is quite expensive due to the number
5384 of instructions, the table lookup and the hard-to-predict indirect jump.
5385 When optimizing for speed with -O3 enabled, use the per-core tuning if
5386 set, otherwise use tables for > 16 cases as a tradeoff between size and
5387 performance. When optimizing for size, use the default setting. */
5389 static unsigned int
5390 aarch64_case_values_threshold (void)
5392 /* Use the specified limit for the number of cases before using jump
5393 tables at higher optimization levels. */
5394 if (optimize > 2
5395 && selected_cpu->tune->max_case_values != 0)
5396 return selected_cpu->tune->max_case_values;
5397 else
5398 return optimize_size ? default_case_values_threshold () : 17;
5401 /* Return true if register REGNO is a valid index register.
5402 STRICT_P is true if REG_OK_STRICT is in effect. */
5404 bool
5405 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5407 if (!HARD_REGISTER_NUM_P (regno))
5409 if (!strict_p)
5410 return true;
5412 if (!reg_renumber)
5413 return false;
5415 regno = reg_renumber[regno];
5417 return GP_REGNUM_P (regno);
5420 /* Return true if register REGNO is a valid base register.
5421 STRICT_P is true if REG_OK_STRICT is in effect. */
5423 bool
5424 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5426 if (!HARD_REGISTER_NUM_P (regno))
5428 if (!strict_p)
5429 return true;
5431 if (!reg_renumber)
5432 return false;
5434 regno = reg_renumber[regno];
5437 /* The fake registers will be eliminated to either the stack or
5438 hard frame pointer, both of which are usually valid base registers.
5439 Reload deals with the cases where the eliminated form isn't valid. */
5440 return (GP_REGNUM_P (regno)
5441 || regno == SP_REGNUM
5442 || regno == FRAME_POINTER_REGNUM
5443 || regno == ARG_POINTER_REGNUM);
5446 /* Return true if X is a valid base register.
5447 STRICT_P is true if REG_OK_STRICT is in effect. */
5449 static bool
5450 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5452 if (!strict_p
5453 && GET_CODE (x) == SUBREG
5454 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5455 x = SUBREG_REG (x);
5457 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5460 /* Return true if address offset is a valid index. If it is, fill in INFO
5461 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5463 static bool
5464 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5465 machine_mode mode, bool strict_p)
5467 enum aarch64_address_type type;
5468 rtx index;
5469 int shift;
5471 /* (reg:P) */
5472 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5473 && GET_MODE (x) == Pmode)
5475 type = ADDRESS_REG_REG;
5476 index = x;
5477 shift = 0;
5479 /* (sign_extend:DI (reg:SI)) */
5480 else if ((GET_CODE (x) == SIGN_EXTEND
5481 || GET_CODE (x) == ZERO_EXTEND)
5482 && GET_MODE (x) == DImode
5483 && GET_MODE (XEXP (x, 0)) == SImode)
5485 type = (GET_CODE (x) == SIGN_EXTEND)
5486 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5487 index = XEXP (x, 0);
5488 shift = 0;
5490 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5491 else if (GET_CODE (x) == MULT
5492 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5493 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5494 && GET_MODE (XEXP (x, 0)) == DImode
5495 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5496 && CONST_INT_P (XEXP (x, 1)))
5498 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5499 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5500 index = XEXP (XEXP (x, 0), 0);
5501 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5503 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5504 else if (GET_CODE (x) == ASHIFT
5505 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5506 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5507 && GET_MODE (XEXP (x, 0)) == DImode
5508 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5509 && CONST_INT_P (XEXP (x, 1)))
5511 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5512 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5513 index = XEXP (XEXP (x, 0), 0);
5514 shift = INTVAL (XEXP (x, 1));
5516 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5517 else if ((GET_CODE (x) == SIGN_EXTRACT
5518 || GET_CODE (x) == ZERO_EXTRACT)
5519 && GET_MODE (x) == DImode
5520 && GET_CODE (XEXP (x, 0)) == MULT
5521 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5522 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5524 type = (GET_CODE (x) == SIGN_EXTRACT)
5525 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5526 index = XEXP (XEXP (x, 0), 0);
5527 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5528 if (INTVAL (XEXP (x, 1)) != 32 + shift
5529 || INTVAL (XEXP (x, 2)) != 0)
5530 shift = -1;
5532 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5533 (const_int 0xffffffff<<shift)) */
5534 else if (GET_CODE (x) == AND
5535 && GET_MODE (x) == DImode
5536 && GET_CODE (XEXP (x, 0)) == MULT
5537 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5538 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5539 && CONST_INT_P (XEXP (x, 1)))
5541 type = ADDRESS_REG_UXTW;
5542 index = XEXP (XEXP (x, 0), 0);
5543 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5544 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5545 shift = -1;
5547 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5548 else if ((GET_CODE (x) == SIGN_EXTRACT
5549 || GET_CODE (x) == ZERO_EXTRACT)
5550 && GET_MODE (x) == DImode
5551 && GET_CODE (XEXP (x, 0)) == ASHIFT
5552 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5553 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5555 type = (GET_CODE (x) == SIGN_EXTRACT)
5556 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5557 index = XEXP (XEXP (x, 0), 0);
5558 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5559 if (INTVAL (XEXP (x, 1)) != 32 + shift
5560 || INTVAL (XEXP (x, 2)) != 0)
5561 shift = -1;
5563 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5564 (const_int 0xffffffff<<shift)) */
5565 else if (GET_CODE (x) == AND
5566 && GET_MODE (x) == DImode
5567 && GET_CODE (XEXP (x, 0)) == ASHIFT
5568 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5569 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5570 && CONST_INT_P (XEXP (x, 1)))
5572 type = ADDRESS_REG_UXTW;
5573 index = XEXP (XEXP (x, 0), 0);
5574 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5575 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5576 shift = -1;
5578 /* (mult:P (reg:P) (const_int scale)) */
5579 else if (GET_CODE (x) == MULT
5580 && GET_MODE (x) == Pmode
5581 && GET_MODE (XEXP (x, 0)) == Pmode
5582 && CONST_INT_P (XEXP (x, 1)))
5584 type = ADDRESS_REG_REG;
5585 index = XEXP (x, 0);
5586 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5588 /* (ashift:P (reg:P) (const_int shift)) */
5589 else if (GET_CODE (x) == ASHIFT
5590 && GET_MODE (x) == Pmode
5591 && GET_MODE (XEXP (x, 0)) == Pmode
5592 && CONST_INT_P (XEXP (x, 1)))
5594 type = ADDRESS_REG_REG;
5595 index = XEXP (x, 0);
5596 shift = INTVAL (XEXP (x, 1));
5598 else
5599 return false;
5601 if (!strict_p
5602 && GET_CODE (index) == SUBREG
5603 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5604 index = SUBREG_REG (index);
5606 if (aarch64_sve_data_mode_p (mode))
5608 if (type != ADDRESS_REG_REG
5609 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5610 return false;
5612 else
5614 if (shift != 0
5615 && !(IN_RANGE (shift, 1, 3)
5616 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5617 return false;
5620 if (REG_P (index)
5621 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5623 info->type = type;
5624 info->offset = index;
5625 info->shift = shift;
5626 return true;
5629 return false;
5632 /* Return true if MODE is one of the modes for which we
5633 support LDP/STP operations. */
5635 static bool
5636 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5638 return mode == SImode || mode == DImode
5639 || mode == SFmode || mode == DFmode
5640 || (aarch64_vector_mode_supported_p (mode)
5641 && known_eq (GET_MODE_SIZE (mode), 8));
5644 /* Return true if REGNO is a virtual pointer register, or an eliminable
5645 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5646 include stack_pointer or hard_frame_pointer. */
5647 static bool
5648 virt_or_elim_regno_p (unsigned regno)
5650 return ((regno >= FIRST_VIRTUAL_REGISTER
5651 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5652 || regno == FRAME_POINTER_REGNUM
5653 || regno == ARG_POINTER_REGNUM);
5656 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5657 If it is, fill in INFO appropriately. STRICT_P is true if
5658 REG_OK_STRICT is in effect. */
5660 static bool
5661 aarch64_classify_address (struct aarch64_address_info *info,
5662 rtx x, machine_mode mode, bool strict_p,
5663 aarch64_addr_query_type type = ADDR_QUERY_M)
5665 enum rtx_code code = GET_CODE (x);
5666 rtx op0, op1;
5667 poly_int64 offset;
5669 HOST_WIDE_INT const_size;
5671 /* On BE, we use load/store pair for all large int mode load/stores.
5672 TI/TFmode may also use a load/store pair. */
5673 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5674 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5675 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5676 || mode == TImode
5677 || mode == TFmode
5678 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5680 bool allow_reg_index_p = (!load_store_pair_p
5681 && (known_lt (GET_MODE_SIZE (mode), 16)
5682 || vec_flags == VEC_ADVSIMD
5683 || vec_flags == VEC_SVE_DATA));
5685 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5686 [Rn, #offset, MUL VL]. */
5687 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5688 && (code != REG && code != PLUS))
5689 return false;
5691 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5692 REG addressing. */
5693 if (advsimd_struct_p
5694 && !BYTES_BIG_ENDIAN
5695 && (code != POST_INC && code != REG))
5696 return false;
5698 gcc_checking_assert (GET_MODE (x) == VOIDmode
5699 || SCALAR_INT_MODE_P (GET_MODE (x)));
5701 switch (code)
5703 case REG:
5704 case SUBREG:
5705 info->type = ADDRESS_REG_IMM;
5706 info->base = x;
5707 info->offset = const0_rtx;
5708 info->const_offset = 0;
5709 return aarch64_base_register_rtx_p (x, strict_p);
5711 case PLUS:
5712 op0 = XEXP (x, 0);
5713 op1 = XEXP (x, 1);
5715 if (! strict_p
5716 && REG_P (op0)
5717 && virt_or_elim_regno_p (REGNO (op0))
5718 && poly_int_rtx_p (op1, &offset))
5720 info->type = ADDRESS_REG_IMM;
5721 info->base = op0;
5722 info->offset = op1;
5723 info->const_offset = offset;
5725 return true;
5728 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5729 && aarch64_base_register_rtx_p (op0, strict_p)
5730 && poly_int_rtx_p (op1, &offset))
5732 info->type = ADDRESS_REG_IMM;
5733 info->base = op0;
5734 info->offset = op1;
5735 info->const_offset = offset;
5737 /* TImode and TFmode values are allowed in both pairs of X
5738 registers and individual Q registers. The available
5739 address modes are:
5740 X,X: 7-bit signed scaled offset
5741 Q: 9-bit signed offset
5742 We conservatively require an offset representable in either mode.
5743 When performing the check for pairs of X registers i.e. LDP/STP
5744 pass down DImode since that is the natural size of the LDP/STP
5745 instruction memory accesses. */
5746 if (mode == TImode || mode == TFmode)
5747 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5748 && (offset_9bit_signed_unscaled_p (mode, offset)
5749 || offset_12bit_unsigned_scaled_p (mode, offset)));
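/* Illustrative summary, assuming the usual AArch64 immediate encodings:
   the combined requirement above accepts a multiple of 8 in [-512, 504]
   (the 7-bit signed scaled LDP/STP range for DImode) that is also either
   in [-256, 255] (the 9-bit signed unscaled range) or a multiple of 16 in
   [0, 65520] (the 12-bit unsigned scaled range for a Q register).  For
   example, offset 256 is accepted, while offset 260 is not, since it is
   not a multiple of 8.  */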
5751 /* A 7-bit offset check because OImode will emit an ldp/stp
5752 instruction (only big endian will get here).
5753 For ldp/stp instructions, the offset is scaled for the size of a
5754 single element of the pair. */
5755 if (mode == OImode)
5756 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5758 /* Three 9/12-bit offset checks because CImode will emit three
5759 ldr/str instructions (only big endian will get here). */
5760 if (mode == CImode)
5761 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5762 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5763 || offset_12bit_unsigned_scaled_p (V16QImode,
5764 offset + 32)));
5766 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5767 instructions (only big endian will get here). */
5768 if (mode == XImode)
5769 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5770 && aarch64_offset_7bit_signed_scaled_p (TImode,
5771 offset + 32));
5773 /* Make "m" use the LD1 offset range for SVE data modes, so
5774 that pre-RTL optimizers like ivopts will work to that range
5775 instead of the wider LDR/STR range. */
5776 if (vec_flags == VEC_SVE_DATA)
5777 return (type == ADDR_QUERY_M
5778 ? offset_4bit_signed_scaled_p (mode, offset)
5779 : offset_9bit_signed_scaled_p (mode, offset));
5781 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5783 poly_int64 end_offset = (offset
5784 + GET_MODE_SIZE (mode)
5785 - BYTES_PER_SVE_VECTOR);
5786 return (type == ADDR_QUERY_M
5787 ? offset_4bit_signed_scaled_p (mode, offset)
5788 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5789 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5790 end_offset)));
5793 if (vec_flags == VEC_SVE_PRED)
5794 return offset_9bit_signed_scaled_p (mode, offset);
5796 if (load_store_pair_p)
5797 return ((known_eq (GET_MODE_SIZE (mode), 4)
5798 || known_eq (GET_MODE_SIZE (mode), 8))
5799 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5800 else
5801 return (offset_9bit_signed_unscaled_p (mode, offset)
5802 || offset_12bit_unsigned_scaled_p (mode, offset));
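/* Illustrative summary, assuming the usual LDP/STP and LDR/STR encodings:
   the pair case above accepts multiples of 4 in [-256, 252] for 4-byte
   elements and multiples of 8 in [-512, 504] for 8-byte elements, while
   the single-access case accepts any offset in [-256, 255] or a multiple
   of the access size within the scaled 12-bit range.  */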
5805 if (allow_reg_index_p)
5807 /* Look for base + (scaled/extended) index register. */
5808 if (aarch64_base_register_rtx_p (op0, strict_p)
5809 && aarch64_classify_index (info, op1, mode, strict_p))
5811 info->base = op0;
5812 return true;
5814 if (aarch64_base_register_rtx_p (op1, strict_p)
5815 && aarch64_classify_index (info, op0, mode, strict_p))
5817 info->base = op1;
5818 return true;
5822 return false;
5824 case POST_INC:
5825 case POST_DEC:
5826 case PRE_INC:
5827 case PRE_DEC:
5828 info->type = ADDRESS_REG_WB;
5829 info->base = XEXP (x, 0);
5830 info->offset = NULL_RTX;
5831 return aarch64_base_register_rtx_p (info->base, strict_p);
5833 case POST_MODIFY:
5834 case PRE_MODIFY:
5835 info->type = ADDRESS_REG_WB;
5836 info->base = XEXP (x, 0);
5837 if (GET_CODE (XEXP (x, 1)) == PLUS
5838 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5839 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5840 && aarch64_base_register_rtx_p (info->base, strict_p))
5842 info->offset = XEXP (XEXP (x, 1), 1);
5843 info->const_offset = offset;
5845 /* TImode and TFmode values are allowed in both pairs of X
5846 registers and individual Q registers. The available
5847 address modes are:
5848 X,X: 7-bit signed scaled offset
5849 Q: 9-bit signed offset
5850 We conservatively require an offset representable in either mode. */
5852 if (mode == TImode || mode == TFmode)
5853 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5854 && offset_9bit_signed_unscaled_p (mode, offset));
5856 if (load_store_pair_p)
5857 return ((known_eq (GET_MODE_SIZE (mode), 4)
5858 || known_eq (GET_MODE_SIZE (mode), 8))
5859 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5860 else
5861 return offset_9bit_signed_unscaled_p (mode, offset);
5863 return false;
5865 case CONST:
5866 case SYMBOL_REF:
5867 case LABEL_REF:
5868 /* load literal: pc-relative constant pool entry. Only supported
5869 for SI mode or larger. */
5870 info->type = ADDRESS_SYMBOLIC;
5872 if (!load_store_pair_p
5873 && GET_MODE_SIZE (mode).is_constant (&const_size)
5874 && const_size >= 4)
5876 rtx sym, addend;
5878 split_const (x, &sym, &addend);
5879 return ((GET_CODE (sym) == LABEL_REF
5880 || (GET_CODE (sym) == SYMBOL_REF
5881 && CONSTANT_POOL_ADDRESS_P (sym)
5882 && aarch64_pcrelative_literal_loads)));
5884 return false;
5886 case LO_SUM:
5887 info->type = ADDRESS_LO_SUM;
5888 info->base = XEXP (x, 0);
5889 info->offset = XEXP (x, 1);
5890 if (allow_reg_index_p
5891 && aarch64_base_register_rtx_p (info->base, strict_p))
5893 rtx sym, offs;
5894 split_const (info->offset, &sym, &offs);
5895 if (GET_CODE (sym) == SYMBOL_REF
5896 && (aarch64_classify_symbol (sym, INTVAL (offs))
5897 == SYMBOL_SMALL_ABSOLUTE))
5899 /* The symbol and offset must be aligned to the access size. */
5900 unsigned int align;
5902 if (CONSTANT_POOL_ADDRESS_P (sym))
5903 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5904 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5906 tree exp = SYMBOL_REF_DECL (sym);
5907 align = TYPE_ALIGN (TREE_TYPE (exp));
5908 align = aarch64_constant_alignment (exp, align);
5910 else if (SYMBOL_REF_DECL (sym))
5911 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5912 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5913 && SYMBOL_REF_BLOCK (sym) != NULL)
5914 align = SYMBOL_REF_BLOCK (sym)->alignment;
5915 else
5916 align = BITS_PER_UNIT;
5918 poly_int64 ref_size = GET_MODE_SIZE (mode);
5919 if (known_eq (ref_size, 0))
5920 ref_size = GET_MODE_SIZE (DImode);
5922 return (multiple_p (INTVAL (offs), ref_size)
5923 && multiple_p (align / BITS_PER_UNIT, ref_size));
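/* Illustrative example: for a DFmode reference, the offset folded into the
   symbol must be a multiple of 8 and the symbol itself at least 8-byte
   aligned, which (assuming the usual :lo12: scaled-immediate encoding)
   keeps the resulting LDR/STR address within the scaled form.  */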
5926 return false;
5928 default:
5929 return false;
5933 /* Return true if the address X is valid for a PRFM instruction.
5934 STRICT_P is true if we should do strict checking with
5935 aarch64_classify_address. */
5937 bool
5938 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5940 struct aarch64_address_info addr;
5942 /* PRFM accepts the same addresses as DImode... */
5943 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5944 if (!res)
5945 return false;
5947 /* ... except writeback forms. */
5948 return addr.type != ADDRESS_REG_WB;
5951 bool
5952 aarch64_symbolic_address_p (rtx x)
5954 rtx offset;
5956 split_const (x, &x, &offset);
5957 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5960 /* Classify the base of symbolic expression X. */
5962 enum aarch64_symbol_type
5963 aarch64_classify_symbolic_expression (rtx x)
5965 rtx offset;
5967 split_const (x, &x, &offset);
5968 return aarch64_classify_symbol (x, INTVAL (offset));
5972 /* Return TRUE if X is a legitimate address for accessing memory in
5973 mode MODE. */
5974 static bool
5975 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5977 struct aarch64_address_info addr;
5979 return aarch64_classify_address (&addr, x, mode, strict_p);
5982 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5983 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5984 bool
5985 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5986 aarch64_addr_query_type type)
5988 struct aarch64_address_info addr;
5990 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5993 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5995 static bool
5996 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5997 poly_int64 orig_offset,
5998 machine_mode mode)
6000 HOST_WIDE_INT size;
6001 if (GET_MODE_SIZE (mode).is_constant (&size))
6003 HOST_WIDE_INT const_offset, second_offset;
6005 /* A general SVE offset is A * VQ + B. Remove the A component from
6006 coefficient 0 in order to get the constant B. */
6007 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6009 /* Split an out-of-range address displacement into a base and
6010 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6011 range otherwise to increase opportunities for sharing the base
6012 address of different sizes. Unaligned accesses use the signed
6013 9-bit range, TImode/TFmode use the intersection of signed
6014 scaled 7-bit and signed 9-bit offset. */
6015 if (mode == TImode || mode == TFmode)
6016 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6017 else if ((const_offset & (size - 1)) != 0)
6018 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6019 else
6020 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6022 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6023 return false;
6025 /* Split the offset into second_offset and the rest. */
6026 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6027 *offset2 = gen_int_mode (second_offset, Pmode);
6028 return true;
6030 else
6032 /* Get the mode we should use as the basis of the range. For structure
6033 modes this is the mode of one vector. */
6034 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6035 machine_mode step_mode
6036 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6038 /* Get the "mul vl" multiplier we'd like to use. */
6039 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6040 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6041 if (vec_flags & VEC_SVE_DATA)
6042 /* LDR supports a 9-bit range, but the move patterns for
6043 structure modes require all vectors to be in range of the
6044 same base. The simplest way of accommodating that while still
6045 promoting reuse of anchor points between different modes is
6046 to use an 8-bit range unconditionally. */
6047 vnum = ((vnum + 128) & 255) - 128;
6048 else
6049 /* Predicates are only handled singly, so we might as well use
6050 the full range. */
6051 vnum = ((vnum + 256) & 511) - 256;
6052 if (vnum == 0)
6053 return false;
6055 /* Convert the "mul vl" multiplier into a byte offset. */
6056 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6057 if (known_eq (second_offset, orig_offset))
6058 return false;
6060 /* Split the offset into second_offset and the rest. */
6061 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6062 *offset2 = gen_int_mode (second_offset, Pmode);
6063 return true;
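/* Illustrative worked example of the constant-size path above, assuming
   DImode: for orig_offset = 0x10010 the offset is 8-byte aligned, so
   second_offset = 0x10010 & 0x3ffc = 0x10, giving *offset1 = 0x10000 and
   *offset2 = 0x10; the anchor 0x10000 can then be shared with neighbouring
   accesses whose residual offsets also fit the scaled 12-bit range.  */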
6067 /* Return the binary representation of floating point constant VALUE in INTVAL.
6068 If the value cannot be converted, return false without setting INTVAL.
6069 The conversion is done in the given MODE. */
6070 bool
6071 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6074 /* We make a general exception for 0. */
6075 if (aarch64_float_const_zero_rtx_p (value))
6077 *intval = 0;
6078 return true;
6081 scalar_float_mode mode;
6082 if (GET_CODE (value) != CONST_DOUBLE
6083 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6084 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6085 /* Only support up to DF mode. */
6086 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6087 return false;
6089 unsigned HOST_WIDE_INT ival = 0;
6091 long res[2];
6092 real_to_target (res,
6093 CONST_DOUBLE_REAL_VALUE (value),
6094 REAL_MODE_FORMAT (mode));
6096 if (mode == DFmode)
6098 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6099 ival = zext_hwi (res[order], 32);
6100 ival |= (zext_hwi (res[1 - order], 32) << 32);
6102 else
6103 ival = zext_hwi (res[0], 32);
6105 *intval = ival;
6106 return true;
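/* Illustrative example: the DFmode constant 1.0 is returned as
   0x3ff0000000000000 and the SFmode constant 1.0 as 0x3f800000 (their
   IEEE-754 encodings).  A minimal stand-alone sketch of the DFmode case,
   kept under #if 0 as an illustration only:  */
#if 0
#include <stdio.h>
#include <string.h>
int
main (void)
{
  double d = 1.0;
  unsigned long long bits;
  memcpy (&bits, &d, sizeof bits);	/* Reinterpret the bits, don't convert.  */
  printf ("%llx\n", bits);		/* Prints 3ff0000000000000.  */
  return 0;
}
#endif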
6109 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6110 single MOV(+MOVK) followed by an FMOV. */
6111 bool
6112 aarch64_float_const_rtx_p (rtx x)
6114 machine_mode mode = GET_MODE (x);
6115 if (mode == VOIDmode)
6116 return false;
6118 /* Determine whether it's cheaper to write float constants as
6119 mov/movk pairs over ldr/adrp pairs. */
6120 unsigned HOST_WIDE_INT ival;
6122 if (GET_CODE (x) == CONST_DOUBLE
6123 && SCALAR_FLOAT_MODE_P (mode)
6124 && aarch64_reinterpret_float_as_int (x, &ival))
6126 scalar_int_mode imode = (mode == HFmode
6127 ? SImode
6128 : int_mode_for_mode (mode).require ());
6129 int num_instr = aarch64_internal_mov_immediate
6130 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6131 return num_instr < 3;
6134 return false;
6137 /* Return TRUE if rtx X is the immediate constant 0.0. */
6138 bool
6139 aarch64_float_const_zero_rtx_p (rtx x)
6141 if (GET_MODE (x) == VOIDmode)
6142 return false;
6144 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6145 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6146 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6149 /* Return TRUE if rtx X is an immediate constant that fits in a single
6150 MOVI immediate operation. */
6151 bool
6152 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6154 if (!TARGET_SIMD)
6155 return false;
6157 machine_mode vmode;
6158 scalar_int_mode imode;
6159 unsigned HOST_WIDE_INT ival;
6161 if (GET_CODE (x) == CONST_DOUBLE
6162 && SCALAR_FLOAT_MODE_P (mode))
6164 if (!aarch64_reinterpret_float_as_int (x, &ival))
6165 return false;
6167 /* We make a general exception for 0. */
6168 if (aarch64_float_const_zero_rtx_p (x))
6169 return true;
6171 imode = int_mode_for_mode (mode).require ();
6173 else if (GET_CODE (x) == CONST_INT
6174 && is_a <scalar_int_mode> (mode, &imode))
6175 ival = INTVAL (x);
6176 else
6177 return false;
6179 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
6180 a 128-bit vector mode. */
6181 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6183 vmode = aarch64_simd_container_mode (imode, width);
6184 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6186 return aarch64_simd_valid_immediate (v_op, NULL);
6190 /* Return the fixed registers used for condition codes. */
6192 static bool
6193 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6195 *p1 = CC_REGNUM;
6196 *p2 = INVALID_REGNUM;
6197 return true;
6200 /* This function is used by the call expanders of the machine description.
6201 RESULT is the register in which the result is returned. It's NULL for
6202 "call" and "sibcall".
6203 MEM is the location of the function call.
6204 SIBCALL indicates whether this function call is normal call or sibling call.
6205 It will generate different pattern accordingly. */
6207 void
6208 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6210 rtx call, callee, tmp;
6211 rtvec vec;
6212 machine_mode mode;
6214 gcc_assert (MEM_P (mem));
6215 callee = XEXP (mem, 0);
6216 mode = GET_MODE (callee);
6217 gcc_assert (mode == Pmode);
6219 /* Decide if we should generate indirect calls by loading the
6220 address of the callee into a register before performing
6221 the branch-and-link. */
6222 if (SYMBOL_REF_P (callee)
6223 ? (aarch64_is_long_call_p (callee)
6224 || aarch64_is_noplt_call_p (callee))
6225 : !REG_P (callee))
6226 XEXP (mem, 0) = force_reg (mode, callee);
6228 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6230 if (result != NULL_RTX)
6231 call = gen_rtx_SET (result, call);
6233 if (sibcall)
6234 tmp = ret_rtx;
6235 else
6236 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6238 vec = gen_rtvec (2, call, tmp);
6239 call = gen_rtx_PARALLEL (VOIDmode, vec);
6241 aarch64_emit_call_insn (call);
6244 /* Emit call insn with PAT and do aarch64-specific handling. */
6246 void
6247 aarch64_emit_call_insn (rtx pat)
6249 rtx insn = emit_call_insn (pat);
6251 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6252 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6253 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6256 machine_mode
6257 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6259 /* All floating point compares return CCFP if it is an equality
6260 comparison, and CCFPE otherwise. */
6261 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6263 switch (code)
6265 case EQ:
6266 case NE:
6267 case UNORDERED:
6268 case ORDERED:
6269 case UNLT:
6270 case UNLE:
6271 case UNGT:
6272 case UNGE:
6273 case UNEQ:
6274 return CCFPmode;
6276 case LT:
6277 case LE:
6278 case GT:
6279 case GE:
6280 case LTGT:
6281 return CCFPEmode;
6283 default:
6284 gcc_unreachable ();
6288 /* Equality comparisons of short modes against zero can be performed
6289 using the TST instruction with the appropriate bitmask. */
6290 if (y == const0_rtx && REG_P (x)
6291 && (code == EQ || code == NE)
6292 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6293 return CC_NZmode;
6295 /* Similarly, comparisons of zero_extends from shorter modes can
6296 be performed using an ANDS with an immediate mask. */
6297 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6298 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6299 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6300 && (code == EQ || code == NE))
6301 return CC_NZmode;
6303 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6304 && y == const0_rtx
6305 && (code == EQ || code == NE || code == LT || code == GE)
6306 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6307 || GET_CODE (x) == NEG
6308 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6309 && CONST_INT_P (XEXP (x, 2)))))
6310 return CC_NZmode;
6312 /* A compare with a shifted operand. Because of canonicalization,
6313 the comparison will have to be swapped when we emit the assembly
6314 code. */
6315 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6316 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6317 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6318 || GET_CODE (x) == LSHIFTRT
6319 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6320 return CC_SWPmode;
6322 /* Similarly for a negated operand, but we can only do this for
6323 equalities. */
6324 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6325 && (REG_P (y) || GET_CODE (y) == SUBREG)
6326 && (code == EQ || code == NE)
6327 && GET_CODE (x) == NEG)
6328 return CC_Zmode;
6330 /* A test for unsigned overflow. */
6331 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6332 && code == NE
6333 && GET_CODE (x) == PLUS
6334 && GET_CODE (y) == ZERO_EXTEND)
6335 return CC_Cmode;
6337 /* For everything else, return CCmode. */
6338 return CCmode;
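/* Illustrative examples of the selection above: (eq (plus x y) (const_int 0))
   in SImode selects CC_NZmode so the flags can come from an ADDS; an
   ordered float comparison such as (lt x y) selects CCFPEmode; and a
   comparison of a shifted operand such as (gt (ashift x 2) y) selects
   CC_SWPmode, since the emitted compare swaps the operands.  */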
6341 static int
6342 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6344 int
6345 aarch64_get_condition_code (rtx x)
6347 machine_mode mode = GET_MODE (XEXP (x, 0));
6348 enum rtx_code comp_code = GET_CODE (x);
6350 if (GET_MODE_CLASS (mode) != MODE_CC)
6351 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6352 return aarch64_get_condition_code_1 (mode, comp_code);
6355 static int
6356 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6358 switch (mode)
6360 case E_CCFPmode:
6361 case E_CCFPEmode:
6362 switch (comp_code)
6364 case GE: return AARCH64_GE;
6365 case GT: return AARCH64_GT;
6366 case LE: return AARCH64_LS;
6367 case LT: return AARCH64_MI;
6368 case NE: return AARCH64_NE;
6369 case EQ: return AARCH64_EQ;
6370 case ORDERED: return AARCH64_VC;
6371 case UNORDERED: return AARCH64_VS;
6372 case UNLT: return AARCH64_LT;
6373 case UNLE: return AARCH64_LE;
6374 case UNGT: return AARCH64_HI;
6375 case UNGE: return AARCH64_PL;
6376 default: return -1;
6378 break;
6380 case E_CCmode:
6381 switch (comp_code)
6383 case NE: return AARCH64_NE;
6384 case EQ: return AARCH64_EQ;
6385 case GE: return AARCH64_GE;
6386 case GT: return AARCH64_GT;
6387 case LE: return AARCH64_LE;
6388 case LT: return AARCH64_LT;
6389 case GEU: return AARCH64_CS;
6390 case GTU: return AARCH64_HI;
6391 case LEU: return AARCH64_LS;
6392 case LTU: return AARCH64_CC;
6393 default: return -1;
6395 break;
6397 case E_CC_SWPmode:
6398 switch (comp_code)
6400 case NE: return AARCH64_NE;
6401 case EQ: return AARCH64_EQ;
6402 case GE: return AARCH64_LE;
6403 case GT: return AARCH64_LT;
6404 case LE: return AARCH64_GE;
6405 case LT: return AARCH64_GT;
6406 case GEU: return AARCH64_LS;
6407 case GTU: return AARCH64_CC;
6408 case LEU: return AARCH64_CS;
6409 case LTU: return AARCH64_HI;
6410 default: return -1;
6412 break;
6414 case E_CC_NZmode:
6415 switch (comp_code)
6417 case NE: return AARCH64_NE;
6418 case EQ: return AARCH64_EQ;
6419 case GE: return AARCH64_PL;
6420 case LT: return AARCH64_MI;
6421 default: return -1;
6423 break;
6425 case E_CC_Zmode:
6426 switch (comp_code)
6428 case NE: return AARCH64_NE;
6429 case EQ: return AARCH64_EQ;
6430 default: return -1;
6432 break;
6434 case E_CC_Cmode:
6435 switch (comp_code)
6437 case NE: return AARCH64_CS;
6438 case EQ: return AARCH64_CC;
6439 default: return -1;
6441 break;
6443 default:
6444 return -1;
6447 return -1;
6450 bool
6451 aarch64_const_vec_all_same_in_range_p (rtx x,
6452 HOST_WIDE_INT minval,
6453 HOST_WIDE_INT maxval)
6455 rtx elt;
6456 return (const_vec_duplicate_p (x, &elt)
6457 && CONST_INT_P (elt)
6458 && IN_RANGE (INTVAL (elt), minval, maxval));
6461 bool
6462 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6464 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6467 /* Return true if VEC is a constant in which every element is in the range
6468 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6470 static bool
6471 aarch64_const_vec_all_in_range_p (rtx vec,
6472 HOST_WIDE_INT minval,
6473 HOST_WIDE_INT maxval)
6475 if (GET_CODE (vec) != CONST_VECTOR
6476 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6477 return false;
6479 int nunits;
6480 if (!CONST_VECTOR_STEPPED_P (vec))
6481 nunits = const_vector_encoded_nelts (vec);
6482 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6483 return false;
6485 for (int i = 0; i < nunits; i++)
6487 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6488 if (!CONST_INT_P (vec_elem)
6489 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6490 return false;
6492 return true;
6495 /* N Z C V. */
6496 #define AARCH64_CC_V 1
6497 #define AARCH64_CC_C (1 << 1)
6498 #define AARCH64_CC_Z (1 << 2)
6499 #define AARCH64_CC_N (1 << 3)
6501 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6502 static const int aarch64_nzcv_codes[] =
6504 0, /* EQ, Z == 1. */
6505 AARCH64_CC_Z, /* NE, Z == 0. */
6506 0, /* CS, C == 1. */
6507 AARCH64_CC_C, /* CC, C == 0. */
6508 0, /* MI, N == 1. */
6509 AARCH64_CC_N, /* PL, N == 0. */
6510 0, /* VS, V == 1. */
6511 AARCH64_CC_V, /* VC, V == 0. */
6512 0, /* HI, C == 1 && Z == 0. */
6513 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6514 AARCH64_CC_V, /* GE, N == V. */
6515 0, /* LT, N != V. */
6516 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6517 0, /* LE, !(Z == 0 && N == V). */
6518 0, /* AL, Any. */
6519 0 /* NV, Any. */
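/* Illustratively, each entry above appears to give flag values under which
   the indexed condition evaluates to false; e.g. GE is paired with
   AARCH64_CC_V (V set, N clear), so N != V and GE fails.  */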
6522 /* Print floating-point vector immediate operand X to F, negating it
6523 first if NEGATE is true. Return true on success, false if it isn't
6524 a constant we can handle. */
6526 static bool
6527 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6529 rtx elt;
6531 if (!const_vec_duplicate_p (x, &elt))
6532 return false;
6534 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6535 if (negate)
6536 r = real_value_negate (&r);
6538 /* We only handle the SVE single-bit immediates here. */
6539 if (real_equal (&r, &dconst0))
6540 asm_fprintf (f, "0.0");
6541 else if (real_equal (&r, &dconst1))
6542 asm_fprintf (f, "1.0");
6543 else if (real_equal (&r, &dconsthalf))
6544 asm_fprintf (f, "0.5");
6545 else
6546 return false;
6548 return true;
6551 /* Return the equivalent letter for size. */
6552 static char
6553 sizetochar (int size)
6555 switch (size)
6557 case 64: return 'd';
6558 case 32: return 's';
6559 case 16: return 'h';
6560 case 8 : return 'b';
6561 default: gcc_unreachable ();
6565 /* Print operand X to file F in a target specific manner according to CODE.
6566 The acceptable formatting commands given by CODE are:
6567 'c': An integer or symbol address without a preceding #
6568 sign.
6569 'C': Take the duplicated element in a vector constant
6570 and print it in hex.
6571 'D': Take the duplicated element in a vector constant
6572 and print it as an unsigned integer, in decimal.
6573 'e': Print the sign/zero-extend size as a character 8->b,
6574 16->h, 32->w.
6575 'p': Prints N such that 2^N == X (X must be power of 2 and
6576 const int).
6577 'P': Print the number of non-zero bits in X (a const_int).
6578 'H': Print the higher numbered register of a pair (TImode)
6579 of regs.
6580 'm': Print a condition (eq, ne, etc).
6581 'M': Same as 'm', but invert condition.
6582 'N': Take the duplicated element in a vector constant
6583 and print the negative of it in decimal.
6584 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6585 'S/T/U/V': Print a FP/SIMD register name for a register list.
6586 The register printed is the FP/SIMD register name
6587 of X + 0/1/2/3 for S/T/U/V.
6588 'R': Print a scalar FP/SIMD register name + 1.
6589 'X': Print bottom 16 bits of integer constant in hex.
6590 'w/x': Print a general register name or the zero register
6591 (32-bit or 64-bit).
6592 '0': Print a normal operand; if it's a general register,
6593 then we assume DImode.
6594 'k': Print NZCV for conditional compare instructions.
6595 'A': Output address constant representing the first
6596 argument of X, specifying a relocation offset
6597 if appropriate.
6598 'L': Output constant address specified by X
6599 with a relocation offset if appropriate.
6600 'G': Prints address of X, specifying a PC relative
6601 relocation mode if appropriate.
6602 'y': Output address of LDP or STP - this is used for
6603 some LDP/STPs which don't use a PARALLEL in their
6604 pattern (so the mode needs to be adjusted).
6605 'z': Output address of a typical LDP or STP. */
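/* As a hypothetical example, an output template such as
   "csel\t%w0, %w1, %w2, %m3" would print something like
   "csel w0, w1, w2, eq" when operand 3 is an EQ comparison and operands
   0-2 are 32-bit general registers.  */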
6607 static void
6608 aarch64_print_operand (FILE *f, rtx x, int code)
6610 rtx elt;
6611 switch (code)
6613 case 'c':
6614 switch (GET_CODE (x))
6616 case CONST_INT:
6617 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6618 break;
6620 case SYMBOL_REF:
6621 output_addr_const (f, x);
6622 break;
6624 case CONST:
6625 if (GET_CODE (XEXP (x, 0)) == PLUS
6626 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6628 output_addr_const (f, x);
6629 break;
6631 /* Fall through. */
6633 default:
6634 output_operand_lossage ("unsupported operand for code '%c'", code);
6636 break;
6638 case 'e':
6640 int n;
6642 if (!CONST_INT_P (x)
6643 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6645 output_operand_lossage ("invalid operand for '%%%c'", code);
6646 return;
6649 switch (n)
6651 case 3:
6652 fputc ('b', f);
6653 break;
6654 case 4:
6655 fputc ('h', f);
6656 break;
6657 case 5:
6658 fputc ('w', f);
6659 break;
6660 default:
6661 output_operand_lossage ("invalid operand for '%%%c'", code);
6662 return;
6665 break;
6667 case 'p':
6669 int n;
6671 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6673 output_operand_lossage ("invalid operand for '%%%c'", code);
6674 return;
6677 asm_fprintf (f, "%d", n);
6679 break;
6681 case 'P':
6682 if (!CONST_INT_P (x))
6684 output_operand_lossage ("invalid operand for '%%%c'", code);
6685 return;
6688 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6689 break;
6691 case 'H':
6692 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6694 output_operand_lossage ("invalid operand for '%%%c'", code);
6695 return;
6698 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6699 break;
6701 case 'M':
6702 case 'm':
6704 int cond_code;
6705 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6706 if (x == const_true_rtx)
6708 if (code == 'M')
6709 fputs ("nv", f);
6710 return;
6713 if (!COMPARISON_P (x))
6715 output_operand_lossage ("invalid operand for '%%%c'", code);
6716 return;
6719 cond_code = aarch64_get_condition_code (x);
6720 gcc_assert (cond_code >= 0);
6721 if (code == 'M')
6722 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6723 fputs (aarch64_condition_codes[cond_code], f);
6725 break;
6727 case 'N':
6728 if (!const_vec_duplicate_p (x, &elt))
6730 output_operand_lossage ("invalid vector constant");
6731 return;
6734 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6735 asm_fprintf (f, "%wd", -INTVAL (elt));
6736 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6737 && aarch64_print_vector_float_operand (f, x, true))
6739 else
6741 output_operand_lossage ("invalid vector constant");
6742 return;
6744 break;
6746 case 'b':
6747 case 'h':
6748 case 's':
6749 case 'd':
6750 case 'q':
6751 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6753 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6754 return;
6756 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6757 break;
6759 case 'S':
6760 case 'T':
6761 case 'U':
6762 case 'V':
6763 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6765 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6766 return;
6768 asm_fprintf (f, "%c%d",
6769 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6770 REGNO (x) - V0_REGNUM + (code - 'S'));
6771 break;
6773 case 'R':
6774 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6776 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6777 return;
6779 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6780 break;
6782 case 'X':
6783 if (!CONST_INT_P (x))
6785 output_operand_lossage ("invalid operand for '%%%c'", code);
6786 return;
6788 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6789 break;
6791 case 'C':
6793 /* Print a replicated constant in hex. */
6794 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6796 output_operand_lossage ("invalid operand for '%%%c'", code);
6797 return;
6799 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6800 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6802 break;
6804 case 'D':
6806 /* Print a replicated constant in decimal, treating it as
6807 unsigned. */
6808 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6810 output_operand_lossage ("invalid operand for '%%%c'", code);
6811 return;
6813 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6814 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6816 break;
6818 case 'w':
6819 case 'x':
6820 if (x == const0_rtx
6821 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6823 asm_fprintf (f, "%czr", code);
6824 break;
6827 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6829 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6830 break;
6833 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6835 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6836 break;
6839 /* Fall through */
6841 case 0:
6842 if (x == NULL)
6844 output_operand_lossage ("missing operand");
6845 return;
6848 switch (GET_CODE (x))
6850 case REG:
6851 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6853 if (REG_NREGS (x) == 1)
6854 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6855 else
6857 char suffix
6858 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6859 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6860 REGNO (x) - V0_REGNUM, suffix,
6861 END_REGNO (x) - V0_REGNUM - 1, suffix);
6864 else
6865 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6866 break;
6868 case MEM:
6869 output_address (GET_MODE (x), XEXP (x, 0));
6870 break;
6872 case LABEL_REF:
6873 case SYMBOL_REF:
6874 output_addr_const (asm_out_file, x);
6875 break;
6877 case CONST_INT:
6878 asm_fprintf (f, "%wd", INTVAL (x));
6879 break;
6881 case CONST:
6882 if (!VECTOR_MODE_P (GET_MODE (x)))
6884 output_addr_const (asm_out_file, x);
6885 break;
6887 /* fall through */
6889 case CONST_VECTOR:
6890 if (!const_vec_duplicate_p (x, &elt))
6892 output_operand_lossage ("invalid vector constant");
6893 return;
6896 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6897 asm_fprintf (f, "%wd", INTVAL (elt));
6898 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6899 && aarch64_print_vector_float_operand (f, x, false))
6901 else
6903 output_operand_lossage ("invalid vector constant");
6904 return;
6906 break;
6908 case CONST_DOUBLE:
6909 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6910 be getting CONST_DOUBLEs holding integers. */
6911 gcc_assert (GET_MODE (x) != VOIDmode);
6912 if (aarch64_float_const_zero_rtx_p (x))
6914 fputc ('0', f);
6915 break;
6917 else if (aarch64_float_const_representable_p (x))
6919 #define buf_size 20
6920 char float_buf[buf_size] = {'\0'};
6921 real_to_decimal_for_mode (float_buf,
6922 CONST_DOUBLE_REAL_VALUE (x),
6923 buf_size, buf_size,
6924 1, GET_MODE (x));
6925 asm_fprintf (asm_out_file, "%s", float_buf);
6926 break;
6927 #undef buf_size
6929 output_operand_lossage ("invalid constant");
6930 return;
6931 default:
6932 output_operand_lossage ("invalid operand");
6933 return;
6935 break;
6937 case 'A':
6938 if (GET_CODE (x) == HIGH)
6939 x = XEXP (x, 0);
6941 switch (aarch64_classify_symbolic_expression (x))
6943 case SYMBOL_SMALL_GOT_4G:
6944 asm_fprintf (asm_out_file, ":got:");
6945 break;
6947 case SYMBOL_SMALL_TLSGD:
6948 asm_fprintf (asm_out_file, ":tlsgd:");
6949 break;
6951 case SYMBOL_SMALL_TLSDESC:
6952 asm_fprintf (asm_out_file, ":tlsdesc:");
6953 break;
6955 case SYMBOL_SMALL_TLSIE:
6956 asm_fprintf (asm_out_file, ":gottprel:");
6957 break;
6959 case SYMBOL_TLSLE24:
6960 asm_fprintf (asm_out_file, ":tprel:");
6961 break;
6963 case SYMBOL_TINY_GOT:
6964 gcc_unreachable ();
6965 break;
6967 default:
6968 break;
6970 output_addr_const (asm_out_file, x);
6971 break;
6973 case 'L':
6974 switch (aarch64_classify_symbolic_expression (x))
6976 case SYMBOL_SMALL_GOT_4G:
6977 asm_fprintf (asm_out_file, ":lo12:");
6978 break;
6980 case SYMBOL_SMALL_TLSGD:
6981 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6982 break;
6984 case SYMBOL_SMALL_TLSDESC:
6985 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6986 break;
6988 case SYMBOL_SMALL_TLSIE:
6989 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6990 break;
6992 case SYMBOL_TLSLE12:
6993 asm_fprintf (asm_out_file, ":tprel_lo12:");
6994 break;
6996 case SYMBOL_TLSLE24:
6997 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6998 break;
7000 case SYMBOL_TINY_GOT:
7001 asm_fprintf (asm_out_file, ":got:");
7002 break;
7004 case SYMBOL_TINY_TLSIE:
7005 asm_fprintf (asm_out_file, ":gottprel:");
7006 break;
7008 default:
7009 break;
7011 output_addr_const (asm_out_file, x);
7012 break;
7014 case 'G':
7015 switch (aarch64_classify_symbolic_expression (x))
7017 case SYMBOL_TLSLE24:
7018 asm_fprintf (asm_out_file, ":tprel_hi12:");
7019 break;
7020 default:
7021 break;
7023 output_addr_const (asm_out_file, x);
7024 break;
7026 case 'k':
7028 HOST_WIDE_INT cond_code;
7030 if (!CONST_INT_P (x))
7032 output_operand_lossage ("invalid operand for '%%%c'", code);
7033 return;
7036 cond_code = INTVAL (x);
7037 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7038 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7040 break;
7042 case 'y':
7043 case 'z':
7045 machine_mode mode = GET_MODE (x);
7047 if (GET_CODE (x) != MEM
7048 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7050 output_operand_lossage ("invalid operand for '%%%c'", code);
7051 return;
7054 if (code == 'y')
7055 /* LDP/STP which uses a single double-width memory operand.
7056 Adjust the mode to appear like a typical LDP/STP.
7057 Currently this is supported for 16-byte accesses only. */
7058 mode = DFmode;
7060 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7061 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7063 break;
7065 default:
7066 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7067 return;
7071 /* Print address 'x' of a memory access with mode 'mode'.
7072 TYPE is the address query type passed to aarch64_classify_address (e.g.
7073 ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP). */
7074 static bool
7075 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7076 aarch64_addr_query_type type)
7078 struct aarch64_address_info addr;
7079 unsigned int size;
7081 /* Check all addresses are Pmode - including ILP32. */
7082 if (GET_MODE (x) != Pmode)
7083 output_operand_lossage ("invalid address mode");
7085 if (aarch64_classify_address (&addr, x, mode, true, type))
7086 switch (addr.type)
7088 case ADDRESS_REG_IMM:
7089 if (known_eq (addr.const_offset, 0))
7090 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7091 else if (aarch64_sve_data_mode_p (mode))
7093 HOST_WIDE_INT vnum
7094 = exact_div (addr.const_offset,
7095 BYTES_PER_SVE_VECTOR).to_constant ();
7096 asm_fprintf (f, "[%s, #%wd, mul vl]",
7097 reg_names[REGNO (addr.base)], vnum);
7099 else if (aarch64_sve_pred_mode_p (mode))
7101 HOST_WIDE_INT vnum
7102 = exact_div (addr.const_offset,
7103 BYTES_PER_SVE_PRED).to_constant ();
7104 asm_fprintf (f, "[%s, #%wd, mul vl]",
7105 reg_names[REGNO (addr.base)], vnum);
7107 else
7108 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7109 INTVAL (addr.offset));
7110 return true;
7112 case ADDRESS_REG_REG:
7113 if (addr.shift == 0)
7114 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7115 reg_names [REGNO (addr.offset)]);
7116 else
7117 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7118 reg_names [REGNO (addr.offset)], addr.shift);
7119 return true;
7121 case ADDRESS_REG_UXTW:
7122 if (addr.shift == 0)
7123 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7124 REGNO (addr.offset) - R0_REGNUM);
7125 else
7126 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7127 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7128 return true;
7130 case ADDRESS_REG_SXTW:
7131 if (addr.shift == 0)
7132 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7133 REGNO (addr.offset) - R0_REGNUM);
7134 else
7135 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7136 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7137 return true;
7139 case ADDRESS_REG_WB:
7140 /* Writeback is only supported for fixed-width modes. */
7141 size = GET_MODE_SIZE (mode).to_constant ();
7142 switch (GET_CODE (x))
7144 case PRE_INC:
7145 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7146 return true;
7147 case POST_INC:
7148 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7149 return true;
7150 case PRE_DEC:
7151 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7152 return true;
7153 case POST_DEC:
7154 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7155 return true;
7156 case PRE_MODIFY:
7157 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7158 INTVAL (addr.offset));
7159 return true;
7160 case POST_MODIFY:
7161 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7162 INTVAL (addr.offset));
7163 return true;
7164 default:
7165 break;
7167 break;
7169 case ADDRESS_LO_SUM:
7170 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7171 output_addr_const (f, addr.offset);
7172 asm_fprintf (f, "]");
7173 return true;
7175 case ADDRESS_SYMBOLIC:
7176 output_addr_const (f, x);
7177 return true;
7180 return false;
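/* Illustratively, the formats above produce operands such as "[x0]",
   "[x0, 16]", "[x0, x1, lsl 3]", "[x0, w1, sxtw 2]", "[x0, 16]!",
   "[x0], 16" and "[x0, #:lo12:sym]", depending on the classified
   address type.  */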
7183 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7184 static bool
7185 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7187 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7190 /* Print address 'x' of a memory access with mode 'mode'. */
7191 static void
7192 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7194 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7195 output_addr_const (f, x);
7198 bool
7199 aarch64_label_mentioned_p (rtx x)
7201 const char *fmt;
7202 int i;
7204 if (GET_CODE (x) == LABEL_REF)
7205 return true;
7207 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7208 referencing instruction, but they are constant offsets, not
7209 symbols. */
7210 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7211 return false;
7213 fmt = GET_RTX_FORMAT (GET_CODE (x));
7214 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7216 if (fmt[i] == 'E')
7218 int j;
7220 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7221 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7222 return 1;
7224 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7225 return 1;
7228 return 0;
7231 /* Implement REGNO_REG_CLASS. */
7233 enum reg_class
7234 aarch64_regno_regclass (unsigned regno)
7236 if (GP_REGNUM_P (regno))
7237 return GENERAL_REGS;
7239 if (regno == SP_REGNUM)
7240 return STACK_REG;
7242 if (regno == FRAME_POINTER_REGNUM
7243 || regno == ARG_POINTER_REGNUM)
7244 return POINTER_REGS;
7246 if (FP_REGNUM_P (regno))
7247 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7249 if (PR_REGNUM_P (regno))
7250 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7252 return NO_REGS;
7255 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7256 If OFFSET is out of range, return an offset of an anchor point
7257 that is in range. Return 0 otherwise. */
7259 static HOST_WIDE_INT
7260 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7261 machine_mode mode)
7263 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7264 if (size > 16)
7265 return (offset + 0x400) & ~0x7f0;
7267 /* For offsets that aren't a multiple of the access size, the limit is
7268 -256...255. */
7269 if (offset & (size - 1))
7271 /* BLKmode typically uses LDP of X-registers. */
7272 if (mode == BLKmode)
7273 return (offset + 512) & ~0x3ff;
7274 return (offset + 0x100) & ~0x1ff;
7277 /* Small negative offsets are supported. */
7278 if (IN_RANGE (offset, -256, 0))
7279 return 0;
7281 if (mode == TImode || mode == TFmode)
7282 return (offset + 0x100) & ~0x1ff;
7284 /* Use a 12-bit offset, scaled by the access size. */
7285 return offset & (~0xfff * size);
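/* Two illustrative worked examples of the cases above: an unaligned 4-byte
   access at offset 0x12345 anchors at (0x12345 + 0x100) & ~0x1ff = 0x12400,
   leaving a residual offset of -0xbb in the signed 9-bit range; an aligned
   4-byte access at offset 0x12344 anchors at 0x12344 & (~0xfff * 4)
   = 0x10000, leaving 0x2344 in the scaled 12-bit range 0..0x3ffc.  */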
7288 static rtx
7289 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7291 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7292 where mask is selected by alignment and size of the offset.
7293 We try to pick as large a range for the offset as possible to
7294 maximize the chance of a CSE. However, for aligned addresses
7295 we limit the range to 4k so that structures with different sized
7296 elements are likely to use the same base. We need to be careful
7297 not to split a CONST for some forms of address expression, otherwise
7298 it will generate sub-optimal code. */
7300 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7302 rtx base = XEXP (x, 0);
7303 rtx offset_rtx = XEXP (x, 1);
7304 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7306 if (GET_CODE (base) == PLUS)
7308 rtx op0 = XEXP (base, 0);
7309 rtx op1 = XEXP (base, 1);
7311 /* Force any scaling into a temp for CSE. */
7312 op0 = force_reg (Pmode, op0);
7313 op1 = force_reg (Pmode, op1);
7315 /* Let the pointer register be in op0. */
7316 if (REG_POINTER (op1))
7317 std::swap (op0, op1);
7319 /* If the pointer is virtual or frame related, then we know that
7320 virtual register instantiation or register elimination is going
7321 to apply a second constant. We want the two constants folded
7322 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7323 if (virt_or_elim_regno_p (REGNO (op0)))
7325 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7326 NULL_RTX, true, OPTAB_DIRECT);
7327 return gen_rtx_PLUS (Pmode, base, op1);
7330 /* Otherwise, in order to encourage CSE (and thence loop strength
7331 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7332 base = expand_binop (Pmode, add_optab, op0, op1,
7333 NULL_RTX, true, OPTAB_DIRECT);
7334 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7337 HOST_WIDE_INT size;
7338 if (GET_MODE_SIZE (mode).is_constant (&size))
7340 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7341 mode);
7342 if (base_offset != 0)
7344 base = plus_constant (Pmode, base, base_offset);
7345 base = force_operand (base, NULL_RTX);
7346 return plus_constant (Pmode, base, offset - base_offset);
7351 return x;
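/* Illustratively, with the anchoring above an SImode access to
   (plus (reg X) (const_int 0x12344)) becomes a temporary holding
   X + 0x10000 plus a residual displacement of 0x2344, so that nearby
   accesses can share the same anchored base.  */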
7354 /* Return the reload icode required for a constant pool in mode MODE. */
7355 static enum insn_code
7356 aarch64_constant_pool_reload_icode (machine_mode mode)
7358 switch (mode)
7360 case E_SFmode:
7361 return CODE_FOR_aarch64_reload_movcpsfdi;
7363 case E_DFmode:
7364 return CODE_FOR_aarch64_reload_movcpdfdi;
7366 case E_TFmode:
7367 return CODE_FOR_aarch64_reload_movcptfdi;
7369 case E_V8QImode:
7370 return CODE_FOR_aarch64_reload_movcpv8qidi;
7372 case E_V16QImode:
7373 return CODE_FOR_aarch64_reload_movcpv16qidi;
7375 case E_V4HImode:
7376 return CODE_FOR_aarch64_reload_movcpv4hidi;
7378 case E_V8HImode:
7379 return CODE_FOR_aarch64_reload_movcpv8hidi;
7381 case E_V2SImode:
7382 return CODE_FOR_aarch64_reload_movcpv2sidi;
7384 case E_V4SImode:
7385 return CODE_FOR_aarch64_reload_movcpv4sidi;
7387 case E_V2DImode:
7388 return CODE_FOR_aarch64_reload_movcpv2didi;
7390 case E_V2DFmode:
7391 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7393 default:
7394 gcc_unreachable ();
7397 gcc_unreachable ();
7399 static reg_class_t
7400 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7401 reg_class_t rclass,
7402 machine_mode mode,
7403 secondary_reload_info *sri)
7405 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7406 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7407 comment at the head of aarch64-sve.md for more details about the
7408 big-endian handling. */
7409 if (BYTES_BIG_ENDIAN
7410 && reg_class_subset_p (rclass, FP_REGS)
7411 && !((REG_P (x) && HARD_REGISTER_P (x))
7412 || aarch64_simd_valid_immediate (x, NULL))
7413 && aarch64_sve_data_mode_p (mode))
7415 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7416 return NO_REGS;
7419 /* If we have to disable direct literal pool loads and stores because the
7420 function is too big, then we need a scratch register. */
7421 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7422 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7423 || targetm.vector_mode_supported_p (GET_MODE (x)))
7424 && !aarch64_pcrelative_literal_loads)
7426 sri->icode = aarch64_constant_pool_reload_icode (mode);
7427 return NO_REGS;
7430 /* Without the TARGET_SIMD instructions we cannot move a Q register
7431 to a Q register directly. We need a scratch. */
7432 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7433 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7434 && reg_class_subset_p (rclass, FP_REGS))
7436 if (mode == TFmode)
7437 sri->icode = CODE_FOR_aarch64_reload_movtf;
7438 else if (mode == TImode)
7439 sri->icode = CODE_FOR_aarch64_reload_movti;
7440 return NO_REGS;
7443 /* A TFmode or TImode memory access should be handled via an FP register
7444 because AArch64 has richer addressing modes for LDR/STR instructions
7445 than LDP/STP instructions. */
7446 if (TARGET_FLOAT && rclass == GENERAL_REGS
7447 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7448 return FP_REGS;
7450 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7451 return GENERAL_REGS;
7453 return NO_REGS;
7456 static bool
7457 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7459 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7461 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7462 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7463 if (frame_pointer_needed)
7464 return to == HARD_FRAME_POINTER_REGNUM;
7465 return true;
7468 poly_int64
7469 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7471 aarch64_layout_frame ();
7473 if (to == HARD_FRAME_POINTER_REGNUM)
7475 if (from == ARG_POINTER_REGNUM)
7476 return cfun->machine->frame.hard_fp_offset;
7478 if (from == FRAME_POINTER_REGNUM)
7479 return cfun->machine->frame.hard_fp_offset
7480 - cfun->machine->frame.locals_offset;
7483 if (to == STACK_POINTER_REGNUM)
7485 if (from == FRAME_POINTER_REGNUM)
7486 return cfun->machine->frame.frame_size
7487 - cfun->machine->frame.locals_offset;
7490 return cfun->machine->frame.frame_size;
7493 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7494 previous frame. */
7496 rtx
7497 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7499 if (count != 0)
7500 return const0_rtx;
7501 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7505 static void
7506 aarch64_asm_trampoline_template (FILE *f)
7508 if (TARGET_ILP32)
7510 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7511 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7513 else
7515 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7516 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7518 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7519 assemble_aligned_integer (4, const0_rtx);
7520 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7521 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7524 static void
7525 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7527 rtx fnaddr, mem, a_tramp;
7528 const int tramp_code_sz = 16;
7530 /* Don't need to copy the trailing D-words; we fill those in below. */
7531 emit_block_move (m_tramp, assemble_trampoline_template (),
7532 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7533 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7534 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7535 if (GET_MODE (fnaddr) != ptr_mode)
7536 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7537 emit_move_insn (mem, fnaddr);
7539 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7540 emit_move_insn (mem, chain_value);
7542 /* XXX We should really define a "clear_cache" pattern and use
7543 gen_clear_cache(). */
7544 a_tramp = XEXP (m_tramp, 0);
7545 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7546 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7547 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7548 ptr_mode);
7551 static unsigned char
7552 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7554 /* ??? Logically we should only need to provide a value when
7555 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7556 can hold MODE, but at the moment we need to handle all modes.
7557 Just ignore any runtime parts for registers that can't store them. */
7558 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7559 unsigned int nregs;
7560 switch (regclass)
7562 case TAILCALL_ADDR_REGS:
7563 case POINTER_REGS:
7564 case GENERAL_REGS:
7565 case ALL_REGS:
7566 case POINTER_AND_FP_REGS:
7567 case FP_REGS:
7568 case FP_LO_REGS:
7569 if (aarch64_sve_data_mode_p (mode)
7570 && constant_multiple_p (GET_MODE_SIZE (mode),
7571 BYTES_PER_SVE_VECTOR, &nregs))
7572 return nregs;
7573 return (aarch64_vector_data_mode_p (mode)
7574 ? CEIL (lowest_size, UNITS_PER_VREG)
7575 : CEIL (lowest_size, UNITS_PER_WORD));
7576 case STACK_REG:
7577 case PR_REGS:
7578 case PR_LO_REGS:
7579 case PR_HI_REGS:
7580 return 1;
7582 case NO_REGS:
7583 return 0;
7585 default:
7586 break;
7588 gcc_unreachable ();
7591 static reg_class_t
7592 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7594 if (regclass == POINTER_REGS)
7595 return GENERAL_REGS;
7597 if (regclass == STACK_REG)
7599 if (REG_P(x)
7600 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7601 return regclass;
7603 return NO_REGS;
7606 /* Register elimination can result in a request for
7607 SP+constant->FP_REGS. We cannot support such operations, which
7608 use SP as the source and an FP_REG as the destination, so reject
7609 them outright. */
7610 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7612 rtx lhs = XEXP (x, 0);
7614 /* Look through a possible SUBREG introduced by ILP32. */
7615 if (GET_CODE (lhs) == SUBREG)
7616 lhs = SUBREG_REG (lhs);
7618 gcc_assert (REG_P (lhs));
7619 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7620 POINTER_REGS));
7621 return NO_REGS;
7624 return regclass;
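/* For example (hypothetical RTL), asking to reload
   (plus:DI (reg:DI sp) (const_int 16)) into FP_REGS is rejected with
   NO_REGS above, forcing the address computation into a general
   register first.  */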
7627 void
7628 aarch64_asm_output_labelref (FILE* f, const char *name)
7630 asm_fprintf (f, "%U%s", name);
7633 static void
7634 aarch64_elf_asm_constructor (rtx symbol, int priority)
7636 if (priority == DEFAULT_INIT_PRIORITY)
7637 default_ctor_section_asm_out_constructor (symbol, priority);
7638 else
7640 section *s;
7641 /* While the priority is known to be in the range [0, 65535], so that
7642 18 bytes would be enough, the compiler might not know that. To avoid
7643 a -Wformat-truncation false positive, use a larger size. */
7644 char buf[23];
7645 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7646 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7647 switch_to_section (s);
7648 assemble_align (POINTER_SIZE);
7649 assemble_aligned_integer (POINTER_BYTES, symbol);
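/* For instance, a constructor with priority 101 is emitted into the
   section ".init_array.00101" by the "%.5u" format above.  */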
7653 static void
7654 aarch64_elf_asm_destructor (rtx symbol, int priority)
7656 if (priority == DEFAULT_INIT_PRIORITY)
7657 default_dtor_section_asm_out_destructor (symbol, priority);
7658 else
7660 section *s;
7661 /* While the priority is known to be in the range [0, 65535], so that
7662 18 bytes would be enough, the compiler might not know that. To avoid
7663 a -Wformat-truncation false positive, use a larger size. */
7664 char buf[23];
7665 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7666 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7667 switch_to_section (s);
7668 assemble_align (POINTER_SIZE);
7669 assemble_aligned_integer (POINTER_BYTES, symbol);
7673 const char*
7674 aarch64_output_casesi (rtx *operands)
7676 char buf[100];
7677 char label[100];
7678 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7679 int index;
7680 static const char *const patterns[4][2] =
7683 "ldrb\t%w3, [%0,%w1,uxtw]",
7684 "add\t%3, %4, %w3, sxtb #2"
7687 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7688 "add\t%3, %4, %w3, sxth #2"
7691 "ldr\t%w3, [%0,%w1,uxtw #2]",
7692 "add\t%3, %4, %w3, sxtw #2"
7694 /* We assume that DImode is only generated when not optimizing and
7695 that we don't really need 64-bit address offsets. That would
7696 imply an object file with 8GB of code in a single function! */
7698 "ldr\t%w3, [%0,%w1,uxtw #2]",
7699 "add\t%3, %4, %w3, sxtw #2"
7703 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7705 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7706 index = exact_log2 (GET_MODE_SIZE (mode));
7708 gcc_assert (index >= 0 && index <= 3);
7710 /* Need to implement table size reduction by changing the code below. */
7711 output_asm_insn (patterns[index][0], operands);
7712 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7713 snprintf (buf, sizeof (buf),
7714 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7715 output_asm_insn (buf, operands);
7716 output_asm_insn (patterns[index][1], operands);
7717 output_asm_insn ("br\t%3", operands);
7718 assemble_label (asm_out_file, label);
7719 return "";
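/* For a HImode diff vector the sequence emitted above is roughly:
     ldrh  %w3, [%0, %w1, uxtw #1]
     adr   %4, .Lrtx<N>
     add   %3, %4, %w3, sxth #2
     br    %3
   where operand 0 is the table base address, operand 1 the index,
   operand 2 the table label and operands 3/4 scratch registers.  */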
7723 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7724 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7725 operator. */
7728 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7730 if (shift >= 0 && shift <= 3)
7732 int size;
7733 for (size = 8; size <= 32; size *= 2)
7735 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7736 if (mask == bits << shift)
7737 return size;
7740 return 0;
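/* Worked example: aarch64_uxt_size (1, 0x1fe) returns 8, since
   0x1fe == 0xff << 1 and so matches a UXTB combined with LSL #1;
   aarch64_uxt_size (0, 0xffff) returns 16 (UXTH); a mask that is not a
   shifted 8/16/32-bit block returns 0.  */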
7743 /* Constant pools are per function only when PC relative
7744 literal loads are true or we are in the large memory
7745 model. */
7747 static inline bool
7748 aarch64_can_use_per_function_literal_pools_p (void)
7750 return (aarch64_pcrelative_literal_loads
7751 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7754 static bool
7755 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7757 /* We can't use blocks for constants when we're using a per-function
7758 constant pool. */
7759 return !aarch64_can_use_per_function_literal_pools_p ();
7762 /* Select appropriate section for constants depending
7763 on where we place literal pools. */
7765 static section *
7766 aarch64_select_rtx_section (machine_mode mode,
7767 rtx x,
7768 unsigned HOST_WIDE_INT align)
7770 if (aarch64_can_use_per_function_literal_pools_p ())
7771 return function_section (current_function_decl);
7773 return default_elf_select_rtx_section (mode, x, align);
7776 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7777 void
7778 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7779 HOST_WIDE_INT offset)
7781 /* When using per-function literal pools, we must ensure that any code
7782 section is aligned to the minimal instruction length, lest we get
7783 errors from the assembler re "unaligned instructions". */
7784 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7785 ASM_OUTPUT_ALIGN (f, 2);
7788 /* Costs. */
7790 /* Helper function for rtx cost calculation. Strip a shift expression
7791 from X. Returns the inner operand if successful, or the original
7792 expression on failure. */
7793 static rtx
7794 aarch64_strip_shift (rtx x)
7796 rtx op = x;
7798 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7799 we can convert both to ROR during final output. */
7800 if ((GET_CODE (op) == ASHIFT
7801 || GET_CODE (op) == ASHIFTRT
7802 || GET_CODE (op) == LSHIFTRT
7803 || GET_CODE (op) == ROTATERT
7804 || GET_CODE (op) == ROTATE)
7805 && CONST_INT_P (XEXP (op, 1)))
7806 return XEXP (op, 0);
7808 if (GET_CODE (op) == MULT
7809 && CONST_INT_P (XEXP (op, 1))
7810 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7811 return XEXP (op, 0);
7813 return x;
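/* Illustrative examples (hypothetical operands): both
   (ashift:DI (reg:DI x0) (const_int 2)) and
   (mult:DI (reg:DI x0) (const_int 8)) strip down to (reg:DI x0), since a
   multiply by a power of two is treated as a shift; a shift by a
   non-constant amount is returned unchanged.  */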
7816 /* Helper function for rtx cost calculation. Strip an extend
7817 expression from X. Returns the inner operand if successful, or the
7818 original expression on failure. We deal with a number of possible
7819 canonicalization variations here. If STRIP_SHIFT is true, then
7820 we can strip off a shift also. */
7821 static rtx
7822 aarch64_strip_extend (rtx x, bool strip_shift)
7824 scalar_int_mode mode;
7825 rtx op = x;
7827 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7828 return op;
7830 /* Zero and sign extraction of a widened value. */
7831 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7832 && XEXP (op, 2) == const0_rtx
7833 && GET_CODE (XEXP (op, 0)) == MULT
7834 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7835 XEXP (op, 1)))
7836 return XEXP (XEXP (op, 0), 0);
7838 /* It can also be represented (for zero-extend) as an AND with an
7839 immediate. */
7840 if (GET_CODE (op) == AND
7841 && GET_CODE (XEXP (op, 0)) == MULT
7842 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7843 && CONST_INT_P (XEXP (op, 1))
7844 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7845 INTVAL (XEXP (op, 1))) != 0)
7846 return XEXP (XEXP (op, 0), 0);
7848 /* Now handle extended register, as this may also have an optional
7849 left shift by 1..4. */
7850 if (strip_shift
7851 && GET_CODE (op) == ASHIFT
7852 && CONST_INT_P (XEXP (op, 1))
7853 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7854 op = XEXP (op, 0);
7856 if (GET_CODE (op) == ZERO_EXTEND
7857 || GET_CODE (op) == SIGN_EXTEND)
7858 op = XEXP (op, 0);
7860 if (op != x)
7861 return op;
7863 return x;
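/* Illustrative examples (hypothetical operands):
   (zero_extend:DI (reg:SI w0)) strips to (reg:SI w0);
   (and:DI (mult:DI (reg:DI x0) (const_int 4)) (const_int 0x3fc)), the
   AND form of a zero-extended scaled index, strips to (reg:DI x0);
   with STRIP_SHIFT an outer left shift by 1..4 is removed as well.  */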
7866 /* Return true iff CODE is a shift supported in combination
7867 with arithmetic instructions. */
7869 static bool
7870 aarch64_shift_p (enum rtx_code code)
7872 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7876 /* Return true iff X is a cheap shift without a sign extend. */
7878 static bool
7879 aarch64_cheap_mult_shift_p (rtx x)
7881 rtx op0, op1;
7883 op0 = XEXP (x, 0);
7884 op1 = XEXP (x, 1);
7886 if (!(aarch64_tune_params.extra_tuning_flags
7887 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7888 return false;
7890 if (GET_CODE (op0) == SIGN_EXTEND)
7891 return false;
7893 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7894 && UINTVAL (op1) <= 4)
7895 return true;
7897 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7898 return false;
7900 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7902 if (l2 > 0 && l2 <= 4)
7903 return true;
7905 return false;
7908 /* Helper function for rtx cost calculation. Calculate the cost of
7909 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7910 Return the calculated cost of the expression, recursing manually in to
7911 operands where needed. */
7913 static int
7914 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7916 rtx op0, op1;
7917 const struct cpu_cost_table *extra_cost
7918 = aarch64_tune_params.insn_extra_cost;
7919 int cost = 0;
7920 bool compound_p = (outer == PLUS || outer == MINUS);
7921 machine_mode mode = GET_MODE (x);
7923 gcc_checking_assert (code == MULT);
7925 op0 = XEXP (x, 0);
7926 op1 = XEXP (x, 1);
7928 if (VECTOR_MODE_P (mode))
7929 mode = GET_MODE_INNER (mode);
7931 /* Integer multiply/fma. */
7932 if (GET_MODE_CLASS (mode) == MODE_INT)
7934 /* The multiply will be canonicalized as a shift, cost it as such. */
7935 if (aarch64_shift_p (GET_CODE (x))
7936 || (CONST_INT_P (op1)
7937 && exact_log2 (INTVAL (op1)) > 0))
7939 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7940 || GET_CODE (op0) == SIGN_EXTEND;
7941 if (speed)
7943 if (compound_p)
7945 /* If the shift is considered cheap,
7946 then don't add any cost. */
7947 if (aarch64_cheap_mult_shift_p (x))
7949 else if (REG_P (op1))
7950 /* ARITH + shift-by-register. */
7951 cost += extra_cost->alu.arith_shift_reg;
7952 else if (is_extend)
7953 /* ARITH + extended register. We don't have a cost field
7954 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7955 cost += extra_cost->alu.extend_arith;
7956 else
7957 /* ARITH + shift-by-immediate. */
7958 cost += extra_cost->alu.arith_shift;
7960 else
7961 /* LSL (immediate). */
7962 cost += extra_cost->alu.shift;
7965 /* Strip extends as we will have costed them in the case above. */
7966 if (is_extend)
7967 op0 = aarch64_strip_extend (op0, true);
7969 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7971 return cost;
7974 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7975 compound and let the below cases handle it. After all, MNEG is a
7976 special-case alias of MSUB. */
7977 if (GET_CODE (op0) == NEG)
7979 op0 = XEXP (op0, 0);
7980 compound_p = true;
7983 /* Integer multiplies or FMAs have zero/sign extending variants. */
7984 if ((GET_CODE (op0) == ZERO_EXTEND
7985 && GET_CODE (op1) == ZERO_EXTEND)
7986 || (GET_CODE (op0) == SIGN_EXTEND
7987 && GET_CODE (op1) == SIGN_EXTEND))
7989 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7990 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7992 if (speed)
7994 if (compound_p)
7995 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7996 cost += extra_cost->mult[0].extend_add;
7997 else
7998 /* MUL/SMULL/UMULL. */
7999 cost += extra_cost->mult[0].extend;
8002 return cost;
8005 /* This is either an integer multiply or a MADD. In both cases
8006 we want to recurse and cost the operands. */
8007 cost += rtx_cost (op0, mode, MULT, 0, speed);
8008 cost += rtx_cost (op1, mode, MULT, 1, speed);
8010 if (speed)
8012 if (compound_p)
8013 /* MADD/MSUB. */
8014 cost += extra_cost->mult[mode == DImode].add;
8015 else
8016 /* MUL. */
8017 cost += extra_cost->mult[mode == DImode].simple;
8020 return cost;
8022 else
8024 if (speed)
8026 /* Floating-point FMA/FMUL can also support negations of the
8027 operands, unless the rounding mode is upward or downward in
8028 which case FNMUL is different from FMUL with operand negation. */
8029 bool neg0 = GET_CODE (op0) == NEG;
8030 bool neg1 = GET_CODE (op1) == NEG;
8031 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8033 if (neg0)
8034 op0 = XEXP (op0, 0);
8035 if (neg1)
8036 op1 = XEXP (op1, 0);
8039 if (compound_p)
8040 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8041 cost += extra_cost->fp[mode == DFmode].fma;
8042 else
8043 /* FMUL/FNMUL. */
8044 cost += extra_cost->fp[mode == DFmode].mult;
8047 cost += rtx_cost (op0, mode, MULT, 0, speed);
8048 cost += rtx_cost (op1, mode, MULT, 1, speed);
8049 return cost;
8053 static int
8054 aarch64_address_cost (rtx x,
8055 machine_mode mode,
8056 addr_space_t as ATTRIBUTE_UNUSED,
8057 bool speed)
8059 enum rtx_code c = GET_CODE (x);
8060 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8061 struct aarch64_address_info info;
8062 int cost = 0;
8063 info.shift = 0;
8065 if (!aarch64_classify_address (&info, x, mode, false))
8067 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8069 /* This is a CONST or SYMBOL ref which will be split
8070 in a different way depending on the code model in use.
8071 Cost it through the generic infrastructure. */
8072 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8073 /* Divide through by the cost of one instruction to
8074 bring it to the same units as the address costs. */
8075 cost_symbol_ref /= COSTS_N_INSNS (1);
8076 /* The cost is then the cost of preparing the address,
8077 followed by an immediate (possibly 0) offset. */
8078 return cost_symbol_ref + addr_cost->imm_offset;
8080 else
8082 /* This is most likely a jump table from a case
8083 statement. */
8084 return addr_cost->register_offset;
8088 switch (info.type)
8090 case ADDRESS_LO_SUM:
8091 case ADDRESS_SYMBOLIC:
8092 case ADDRESS_REG_IMM:
8093 cost += addr_cost->imm_offset;
8094 break;
8096 case ADDRESS_REG_WB:
8097 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8098 cost += addr_cost->pre_modify;
8099 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8100 cost += addr_cost->post_modify;
8101 else
8102 gcc_unreachable ();
8104 break;
8106 case ADDRESS_REG_REG:
8107 cost += addr_cost->register_offset;
8108 break;
8110 case ADDRESS_REG_SXTW:
8111 cost += addr_cost->register_sextend;
8112 break;
8114 case ADDRESS_REG_UXTW:
8115 cost += addr_cost->register_zextend;
8116 break;
8118 default:
8119 gcc_unreachable ();
8123 if (info.shift > 0)
8125 /* For the sake of calculating the cost of the shifted register
8126 component, we can treat same sized modes in the same way. */
8127 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8128 cost += addr_cost->addr_scale_costs.hi;
8129 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8130 cost += addr_cost->addr_scale_costs.si;
8131 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8132 cost += addr_cost->addr_scale_costs.di;
8133 else
8134 /* We can't tell, or this is a 128-bit vector. */
8135 cost += addr_cost->addr_scale_costs.ti;
8138 return cost;
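/* For example, costing the SImode address [x0, w1, sxtw #2] classifies
   it as ADDRESS_REG_SXTW with a nonzero shift, so the result is
   register_sextend + addr_scale_costs.si from the tuning's address
   cost table.  */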
8141 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8142 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8143 to be taken. */
8146 aarch64_branch_cost (bool speed_p, bool predictable_p)
8148 /* When optimizing for speed, use the cost of unpredictable branches. */
8149 const struct cpu_branch_cost *branch_costs =
8150 aarch64_tune_params.branch_costs;
8152 if (!speed_p || predictable_p)
8153 return branch_costs->predictable;
8154 else
8155 return branch_costs->unpredictable;
8158 /* Return true if the RTX X in mode MODE is a zero or sign extract
8159 usable in an ADD or SUB (extended register) instruction. */
8160 static bool
8161 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8163 /* Catch add with a sign extract.
8164 This is add_<optab><mode>_multp2. */
8165 if (GET_CODE (x) == SIGN_EXTRACT
8166 || GET_CODE (x) == ZERO_EXTRACT)
8168 rtx op0 = XEXP (x, 0);
8169 rtx op1 = XEXP (x, 1);
8170 rtx op2 = XEXP (x, 2);
8172 if (GET_CODE (op0) == MULT
8173 && CONST_INT_P (op1)
8174 && op2 == const0_rtx
8175 && CONST_INT_P (XEXP (op0, 1))
8176 && aarch64_is_extend_from_extract (mode,
8177 XEXP (op0, 1),
8178 op1))
8180 return true;
8183 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8184 No shift. */
8185 else if (GET_CODE (x) == SIGN_EXTEND
8186 || GET_CODE (x) == ZERO_EXTEND)
8187 return REG_P (XEXP (x, 0));
8189 return false;
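/* For instance, (sign_extend:DI (reg:SI w1)) satisfies the simple case
   above and so can be folded by the callers into an ADD/SUB (extended
   register) instruction.  */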
8192 static bool
8193 aarch64_frint_unspec_p (unsigned int u)
8195 switch (u)
8197 case UNSPEC_FRINTZ:
8198 case UNSPEC_FRINTP:
8199 case UNSPEC_FRINTM:
8200 case UNSPEC_FRINTA:
8201 case UNSPEC_FRINTN:
8202 case UNSPEC_FRINTX:
8203 case UNSPEC_FRINTI:
8204 return true;
8206 default:
8207 return false;
8211 /* Return true iff X is an rtx that will match an extr instruction
8212 i.e. as described in the *extr<mode>5_insn family of patterns.
8213 OP0 and OP1 will be set to the operands of the shifts involved
8214 on success and will be NULL_RTX otherwise. */
8216 static bool
8217 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8219 rtx op0, op1;
8220 scalar_int_mode mode;
8221 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8222 return false;
8224 *res_op0 = NULL_RTX;
8225 *res_op1 = NULL_RTX;
8227 if (GET_CODE (x) != IOR)
8228 return false;
8230 op0 = XEXP (x, 0);
8231 op1 = XEXP (x, 1);
8233 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8234 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8236 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8237 if (GET_CODE (op1) == ASHIFT)
8238 std::swap (op0, op1);
8240 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8241 return false;
8243 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8244 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8246 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8247 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8249 *res_op0 = XEXP (op0, 0);
8250 *res_op1 = XEXP (op1, 0);
8251 return true;
8255 return false;
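/* Example of a match (hypothetical operands, SImode):
     (ior:SI (ashift:SI (reg:SI w0) (const_int 24))
             (lshiftrt:SI (reg:SI w1) (const_int 8)))
   is accepted because 24 + 8 == 32, and corresponds roughly to
   "extr w2, w0, w1, #8".  */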
8258 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8259 storing it in *COST. Result is true if the total cost of the operation
8260 has now been calculated. */
8261 static bool
8262 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8264 rtx inner;
8265 rtx comparator;
8266 enum rtx_code cmpcode;
8268 if (COMPARISON_P (op0))
8270 inner = XEXP (op0, 0);
8271 comparator = XEXP (op0, 1);
8272 cmpcode = GET_CODE (op0);
8274 else
8276 inner = op0;
8277 comparator = const0_rtx;
8278 cmpcode = NE;
8281 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8283 /* Conditional branch. */
8284 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8285 return true;
8286 else
8288 if (cmpcode == NE || cmpcode == EQ)
8290 if (comparator == const0_rtx)
8292 /* TBZ/TBNZ/CBZ/CBNZ. */
8293 if (GET_CODE (inner) == ZERO_EXTRACT)
8294 /* TBZ/TBNZ. */
8295 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8296 ZERO_EXTRACT, 0, speed);
8297 else
8298 /* CBZ/CBNZ. */
8299 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8301 return true;
8304 else if (cmpcode == LT || cmpcode == GE)
8306 /* TBZ/TBNZ. */
8307 if (comparator == const0_rtx)
8308 return true;
8312 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8314 /* CCMP. */
8315 if (GET_CODE (op1) == COMPARE)
8317 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8318 if (XEXP (op1, 1) == const0_rtx)
8319 *cost += 1;
8320 if (speed)
8322 machine_mode mode = GET_MODE (XEXP (op1, 0));
8323 const struct cpu_cost_table *extra_cost
8324 = aarch64_tune_params.insn_extra_cost;
8326 if (GET_MODE_CLASS (mode) == MODE_INT)
8327 *cost += extra_cost->alu.arith;
8328 else
8329 *cost += extra_cost->fp[mode == DFmode].compare;
8331 return true;
8334 /* It's a conditional operation based on the status flags,
8335 so it must be some flavor of CSEL. */
8337 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8338 if (GET_CODE (op1) == NEG
8339 || GET_CODE (op1) == NOT
8340 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8341 op1 = XEXP (op1, 0);
8342 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8344 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8345 op1 = XEXP (op1, 0);
8346 op2 = XEXP (op2, 0);
8349 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8350 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8351 return true;
8354 /* We don't know what this is, cost all operands. */
8355 return false;
8358 /* Check whether X is a bitfield operation of the form shift + extend that
8359 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8360 operand to which the bitfield operation is applied. Otherwise return
8361 NULL_RTX. */
8363 static rtx
8364 aarch64_extend_bitfield_pattern_p (rtx x)
8366 rtx_code outer_code = GET_CODE (x);
8367 machine_mode outer_mode = GET_MODE (x);
8369 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8370 && outer_mode != SImode && outer_mode != DImode)
8371 return NULL_RTX;
8373 rtx inner = XEXP (x, 0);
8374 rtx_code inner_code = GET_CODE (inner);
8375 machine_mode inner_mode = GET_MODE (inner);
8376 rtx op = NULL_RTX;
8378 switch (inner_code)
8380 case ASHIFT:
8381 if (CONST_INT_P (XEXP (inner, 1))
8382 && (inner_mode == QImode || inner_mode == HImode))
8383 op = XEXP (inner, 0);
8384 break;
8385 case LSHIFTRT:
8386 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8387 && (inner_mode == QImode || inner_mode == HImode))
8388 op = XEXP (inner, 0);
8389 break;
8390 case ASHIFTRT:
8391 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8392 && (inner_mode == QImode || inner_mode == HImode))
8393 op = XEXP (inner, 0);
8394 break;
8395 default:
8396 break;
8399 return op;
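/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI w0) (const_int 3)))
   returns the inner register, and the caller then costs the whole
   expression as a single UBFX.  */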
8402 /* Return true if the mask and a shift amount from an RTX of the form
8403 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8404 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8406 bool
8407 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8408 rtx shft_amnt)
8410 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8411 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8412 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8413 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
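/* Worked example for SImode: mask == 0xff0 and shft_amnt == 4 pass all
   the checks above ((0xff0 >> 4) + 1 == 0x100 is a power of two and the
   low 4 bits of the mask are clear), so (x << 4) & 0xff0 can become a
   single UBFIZ.  */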
8416 /* Calculate the cost of calculating X, storing it in *COST. Result
8417 is true if the total cost of the operation has now been calculated. */
8418 static bool
8419 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8420 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8422 rtx op0, op1, op2;
8423 const struct cpu_cost_table *extra_cost
8424 = aarch64_tune_params.insn_extra_cost;
8425 int code = GET_CODE (x);
8426 scalar_int_mode int_mode;
8428 /* By default, assume that everything has equivalent cost to the
8429 cheapest instruction. Any additional costs are applied as a delta
8430 above this default. */
8431 *cost = COSTS_N_INSNS (1);
8433 switch (code)
8435 case SET:
8436 /* The cost depends entirely on the operands to SET. */
8437 *cost = 0;
8438 op0 = SET_DEST (x);
8439 op1 = SET_SRC (x);
8441 switch (GET_CODE (op0))
8443 case MEM:
8444 if (speed)
8446 rtx address = XEXP (op0, 0);
8447 if (VECTOR_MODE_P (mode))
8448 *cost += extra_cost->ldst.storev;
8449 else if (GET_MODE_CLASS (mode) == MODE_INT)
8450 *cost += extra_cost->ldst.store;
8451 else if (mode == SFmode)
8452 *cost += extra_cost->ldst.storef;
8453 else if (mode == DFmode)
8454 *cost += extra_cost->ldst.stored;
8456 *cost +=
8457 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8458 0, speed));
8461 *cost += rtx_cost (op1, mode, SET, 1, speed);
8462 return true;
8464 case SUBREG:
8465 if (! REG_P (SUBREG_REG (op0)))
8466 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8468 /* Fall through. */
8469 case REG:
8470 /* The cost is one per vector-register copied. */
8471 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8473 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8474 *cost = COSTS_N_INSNS (nregs);
8476 /* const0_rtx is in general free, but we will use an
8477 instruction to set a register to 0. */
8478 else if (REG_P (op1) || op1 == const0_rtx)
8480 /* The cost is 1 per register copied. */
8481 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8482 *cost = COSTS_N_INSNS (nregs);
8484 else
8485 /* Cost is just the cost of the RHS of the set. */
8486 *cost += rtx_cost (op1, mode, SET, 1, speed);
8487 return true;
8489 case ZERO_EXTRACT:
8490 case SIGN_EXTRACT:
8491 /* Bit-field insertion. Strip any redundant widening of
8492 the RHS to meet the width of the target. */
8493 if (GET_CODE (op1) == SUBREG)
8494 op1 = SUBREG_REG (op1);
8495 if ((GET_CODE (op1) == ZERO_EXTEND
8496 || GET_CODE (op1) == SIGN_EXTEND)
8497 && CONST_INT_P (XEXP (op0, 1))
8498 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8499 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8500 op1 = XEXP (op1, 0);
8502 if (CONST_INT_P (op1))
8504 /* MOV immediate is assumed to always be cheap. */
8505 *cost = COSTS_N_INSNS (1);
8507 else
8509 /* BFM. */
8510 if (speed)
8511 *cost += extra_cost->alu.bfi;
8512 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8515 return true;
8517 default:
8518 /* We can't make sense of this, assume default cost. */
8519 *cost = COSTS_N_INSNS (1);
8520 return false;
8522 return false;
8524 case CONST_INT:
8525 /* If an instruction can incorporate a constant within the
8526 instruction, the instruction's expression avoids calling
8527 rtx_cost() on the constant. If rtx_cost() is called on a
8528 constant, then it is usually because the constant must be
8529 moved into a register by one or more instructions.
8531 The exception is constant 0, which can be expressed
8532 as XZR/WZR and is therefore free. The exception to this is
8533 if we have (set (reg) (const0_rtx)) in which case we must cost
8534 the move. However, we can catch that when we cost the SET, so
8535 we don't need to consider that here. */
8536 if (x == const0_rtx)
8537 *cost = 0;
8538 else
8540 /* To an approximation, building any other constant is
8541 proportionally expensive to the number of instructions
8542 required to build that constant. This is true whether we
8543 are compiling for SPEED or otherwise. */
8544 if (!is_a <scalar_int_mode> (mode, &int_mode))
8545 int_mode = word_mode;
8546 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8547 (NULL_RTX, x, false, int_mode));
8549 return true;
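/* For instance, a constant such as 0x12345678 that needs roughly a
   MOVZ/MOVK pair is costed here as COSTS_N_INSNS (2), while 0 costs
   nothing.  */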
8551 case CONST_DOUBLE:
8553 /* First determine number of instructions to do the move
8554 as an integer constant. */
8555 if (!aarch64_float_const_representable_p (x)
8556 && !aarch64_can_const_movi_rtx_p (x, mode)
8557 && aarch64_float_const_rtx_p (x))
8559 unsigned HOST_WIDE_INT ival;
8560 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8561 gcc_assert (succeed);
8563 scalar_int_mode imode = (mode == HFmode
8564 ? SImode
8565 : int_mode_for_mode (mode).require ());
8566 int ncost = aarch64_internal_mov_immediate
8567 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8568 *cost += COSTS_N_INSNS (ncost);
8569 return true;
8572 if (speed)
8574 /* mov[df,sf]_aarch64. */
8575 if (aarch64_float_const_representable_p (x))
8576 /* FMOV (scalar immediate). */
8577 *cost += extra_cost->fp[mode == DFmode].fpconst;
8578 else if (!aarch64_float_const_zero_rtx_p (x))
8580 /* This will be a load from memory. */
8581 if (mode == DFmode)
8582 *cost += extra_cost->ldst.loadd;
8583 else
8584 *cost += extra_cost->ldst.loadf;
8586 else
8587 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8588 or MOV v0.s[0], wzr - neither of which is modeled by the
8589 cost tables. Just use the default cost. */
8594 return true;
8596 case MEM:
8597 if (speed)
8599 /* For loads we want the base cost of a load, plus an
8600 approximation for the additional cost of the addressing
8601 mode. */
8602 rtx address = XEXP (x, 0);
8603 if (VECTOR_MODE_P (mode))
8604 *cost += extra_cost->ldst.loadv;
8605 else if (GET_MODE_CLASS (mode) == MODE_INT)
8606 *cost += extra_cost->ldst.load;
8607 else if (mode == SFmode)
8608 *cost += extra_cost->ldst.loadf;
8609 else if (mode == DFmode)
8610 *cost += extra_cost->ldst.loadd;
8612 *cost +=
8613 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8614 0, speed));
8617 return true;
8619 case NEG:
8620 op0 = XEXP (x, 0);
8622 if (VECTOR_MODE_P (mode))
8624 if (speed)
8626 /* FNEG. */
8627 *cost += extra_cost->vect.alu;
8629 return false;
8632 if (GET_MODE_CLASS (mode) == MODE_INT)
8634 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8635 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8637 /* CSETM. */
8638 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8639 return true;
8642 /* Cost this as SUB wzr, X. */
8643 op0 = CONST0_RTX (mode);
8644 op1 = XEXP (x, 0);
8645 goto cost_minus;
8648 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8650 /* Support (neg(fma...)) as a single instruction only if
8651 sign of zeros is unimportant. This matches the decision
8652 making in aarch64.md. */
8653 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8655 /* FNMADD. */
8656 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8657 return true;
8659 if (GET_CODE (op0) == MULT)
8661 /* FNMUL. */
8662 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8663 return true;
8665 if (speed)
8666 /* FNEG. */
8667 *cost += extra_cost->fp[mode == DFmode].neg;
8668 return false;
8671 return false;
8673 case CLRSB:
8674 case CLZ:
8675 if (speed)
8677 if (VECTOR_MODE_P (mode))
8678 *cost += extra_cost->vect.alu;
8679 else
8680 *cost += extra_cost->alu.clz;
8683 return false;
8685 case COMPARE:
8686 op0 = XEXP (x, 0);
8687 op1 = XEXP (x, 1);
8689 if (op1 == const0_rtx
8690 && GET_CODE (op0) == AND)
8692 x = op0;
8693 mode = GET_MODE (op0);
8694 goto cost_logic;
8697 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8699 /* TODO: A write to the CC flags possibly costs extra, this
8700 needs encoding in the cost tables. */
8702 mode = GET_MODE (op0);
8703 /* ANDS. */
8704 if (GET_CODE (op0) == AND)
8706 x = op0;
8707 goto cost_logic;
8710 if (GET_CODE (op0) == PLUS)
8712 /* ADDS (and CMN alias). */
8713 x = op0;
8714 goto cost_plus;
8717 if (GET_CODE (op0) == MINUS)
8719 /* SUBS. */
8720 x = op0;
8721 goto cost_minus;
8724 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8725 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8726 && CONST_INT_P (XEXP (op0, 2)))
8728 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8729 Handle it here directly rather than going to cost_logic
8730 since we know the immediate generated for the TST is valid
8731 so we can avoid creating an intermediate rtx for it only
8732 for costing purposes. */
8733 if (speed)
8734 *cost += extra_cost->alu.logical;
8736 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8737 ZERO_EXTRACT, 0, speed);
8738 return true;
8741 if (GET_CODE (op1) == NEG)
8743 /* CMN. */
8744 if (speed)
8745 *cost += extra_cost->alu.arith;
8747 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8748 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8749 return true;
8752 /* CMP.
8754 Compare can freely swap the order of operands, and
8755 canonicalization puts the more complex operation first.
8756 But the integer MINUS logic expects the shift/extend
8757 operation in op1. */
8758 if (! (REG_P (op0)
8759 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8761 op0 = XEXP (x, 1);
8762 op1 = XEXP (x, 0);
8764 goto cost_minus;
8767 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8769 /* FCMP. */
8770 if (speed)
8771 *cost += extra_cost->fp[mode == DFmode].compare;
8773 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8775 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8776 /* FCMP supports constant 0.0 for no extra cost. */
8777 return true;
8779 return false;
8782 if (VECTOR_MODE_P (mode))
8784 /* Vector compare. */
8785 if (speed)
8786 *cost += extra_cost->vect.alu;
8788 if (aarch64_float_const_zero_rtx_p (op1))
8790 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8791 cost. */
8792 return true;
8794 return false;
8796 return false;
8798 case MINUS:
8800 op0 = XEXP (x, 0);
8801 op1 = XEXP (x, 1);
8803 cost_minus:
8804 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8806 /* Detect valid immediates. */
8807 if ((GET_MODE_CLASS (mode) == MODE_INT
8808 || (GET_MODE_CLASS (mode) == MODE_CC
8809 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8810 && CONST_INT_P (op1)
8811 && aarch64_uimm12_shift (INTVAL (op1)))
8813 if (speed)
8814 /* SUB(S) (immediate). */
8815 *cost += extra_cost->alu.arith;
8816 return true;
8819 /* Look for SUB (extended register). */
8820 if (is_a <scalar_int_mode> (mode, &int_mode)
8821 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8823 if (speed)
8824 *cost += extra_cost->alu.extend_arith;
8826 op1 = aarch64_strip_extend (op1, true);
8827 *cost += rtx_cost (op1, VOIDmode,
8828 (enum rtx_code) GET_CODE (op1), 0, speed);
8829 return true;
8832 rtx new_op1 = aarch64_strip_extend (op1, false);
8834 /* Cost this as an FMA-alike operation. */
8835 if ((GET_CODE (new_op1) == MULT
8836 || aarch64_shift_p (GET_CODE (new_op1)))
8837 && code != COMPARE)
8839 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8840 (enum rtx_code) code,
8841 speed);
8842 return true;
8845 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8847 if (speed)
8849 if (VECTOR_MODE_P (mode))
8851 /* Vector SUB. */
8852 *cost += extra_cost->vect.alu;
8854 else if (GET_MODE_CLASS (mode) == MODE_INT)
8856 /* SUB(S). */
8857 *cost += extra_cost->alu.arith;
8859 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8861 /* FSUB. */
8862 *cost += extra_cost->fp[mode == DFmode].addsub;
8865 return true;
8868 case PLUS:
8870 rtx new_op0;
8872 op0 = XEXP (x, 0);
8873 op1 = XEXP (x, 1);
8875 cost_plus:
8876 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8877 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8879 /* CSINC. */
8880 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8881 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8882 return true;
8885 if (GET_MODE_CLASS (mode) == MODE_INT
8886 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8887 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8889 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8891 if (speed)
8892 /* ADD (immediate). */
8893 *cost += extra_cost->alu.arith;
8894 return true;
8897 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8899 /* Look for ADD (extended register). */
8900 if (is_a <scalar_int_mode> (mode, &int_mode)
8901 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8903 if (speed)
8904 *cost += extra_cost->alu.extend_arith;
8906 op0 = aarch64_strip_extend (op0, true);
8907 *cost += rtx_cost (op0, VOIDmode,
8908 (enum rtx_code) GET_CODE (op0), 0, speed);
8909 return true;
8912 /* Strip any extend, leave shifts behind as we will
8913 cost them through mult_cost. */
8914 new_op0 = aarch64_strip_extend (op0, false);
8916 if (GET_CODE (new_op0) == MULT
8917 || aarch64_shift_p (GET_CODE (new_op0)))
8919 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8920 speed);
8921 return true;
8924 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8926 if (speed)
8928 if (VECTOR_MODE_P (mode))
8930 /* Vector ADD. */
8931 *cost += extra_cost->vect.alu;
8933 else if (GET_MODE_CLASS (mode) == MODE_INT)
8935 /* ADD. */
8936 *cost += extra_cost->alu.arith;
8938 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8940 /* FADD. */
8941 *cost += extra_cost->fp[mode == DFmode].addsub;
8944 return true;
8947 case BSWAP:
8948 *cost = COSTS_N_INSNS (1);
8950 if (speed)
8952 if (VECTOR_MODE_P (mode))
8953 *cost += extra_cost->vect.alu;
8954 else
8955 *cost += extra_cost->alu.rev;
8957 return false;
8959 case IOR:
8960 if (aarch_rev16_p (x))
8962 *cost = COSTS_N_INSNS (1);
8964 if (speed)
8966 if (VECTOR_MODE_P (mode))
8967 *cost += extra_cost->vect.alu;
8968 else
8969 *cost += extra_cost->alu.rev;
8971 return true;
8974 if (aarch64_extr_rtx_p (x, &op0, &op1))
8976 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8977 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8978 if (speed)
8979 *cost += extra_cost->alu.shift;
8981 return true;
8983 /* Fall through. */
8984 case XOR:
8985 case AND:
8986 cost_logic:
8987 op0 = XEXP (x, 0);
8988 op1 = XEXP (x, 1);
8990 if (VECTOR_MODE_P (mode))
8992 if (speed)
8993 *cost += extra_cost->vect.alu;
8994 return true;
8997 if (code == AND
8998 && GET_CODE (op0) == MULT
8999 && CONST_INT_P (XEXP (op0, 1))
9000 && CONST_INT_P (op1)
9001 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9002 INTVAL (op1)) != 0)
9004 /* This is a UBFM/SBFM. */
9005 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9006 if (speed)
9007 *cost += extra_cost->alu.bfx;
9008 return true;
9011 if (is_int_mode (mode, &int_mode))
9013 if (CONST_INT_P (op1))
9015 /* We have a mask + shift version of a UBFIZ
9016 i.e. the *andim_ashift<mode>_bfiz pattern. */
9017 if (GET_CODE (op0) == ASHIFT
9018 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9019 XEXP (op0, 1)))
9021 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9022 (enum rtx_code) code, 0, speed);
9023 if (speed)
9024 *cost += extra_cost->alu.bfx;
9026 return true;
9028 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9030 /* We possibly get the immediate for free, this is not
9031 modelled. */
9032 *cost += rtx_cost (op0, int_mode,
9033 (enum rtx_code) code, 0, speed);
9034 if (speed)
9035 *cost += extra_cost->alu.logical;
9037 return true;
9040 else
9042 rtx new_op0 = op0;
9044 /* Handle ORN, EON, or BIC. */
9045 if (GET_CODE (op0) == NOT)
9046 op0 = XEXP (op0, 0);
9048 new_op0 = aarch64_strip_shift (op0);
9050 /* If we had a shift on op0 then this is a logical-shift-
9051 by-register/immediate operation. Otherwise, this is just
9052 a logical operation. */
9053 if (speed)
9055 if (new_op0 != op0)
9057 /* Shift by immediate. */
9058 if (CONST_INT_P (XEXP (op0, 1)))
9059 *cost += extra_cost->alu.log_shift;
9060 else
9061 *cost += extra_cost->alu.log_shift_reg;
9063 else
9064 *cost += extra_cost->alu.logical;
9067 /* In both cases we want to cost both operands. */
9068 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9069 0, speed);
9070 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9071 1, speed);
9073 return true;
9076 return false;
9078 case NOT:
9079 x = XEXP (x, 0);
9080 op0 = aarch64_strip_shift (x);
9082 if (VECTOR_MODE_P (mode))
9084 /* Vector NOT. */
9085 *cost += extra_cost->vect.alu;
9086 return false;
9089 /* MVN-shifted-reg. */
9090 if (op0 != x)
9092 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9094 if (speed)
9095 *cost += extra_cost->alu.log_shift;
9097 return true;
9099 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9100 Handle the second form here taking care that 'a' in the above can
9101 be a shift. */
9102 else if (GET_CODE (op0) == XOR)
9104 rtx newop0 = XEXP (op0, 0);
9105 rtx newop1 = XEXP (op0, 1);
9106 rtx op0_stripped = aarch64_strip_shift (newop0);
9108 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9109 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9111 if (speed)
9113 if (op0_stripped != newop0)
9114 *cost += extra_cost->alu.log_shift;
9115 else
9116 *cost += extra_cost->alu.logical;
9119 return true;
9121 /* MVN. */
9122 if (speed)
9123 *cost += extra_cost->alu.logical;
9125 return false;
9127 case ZERO_EXTEND:
9129 op0 = XEXP (x, 0);
9130 /* If a value is written in SI mode, then zero extended to DI
9131 mode, the operation will in general be free as a write to
9132 a 'w' register implicitly zeroes the upper bits of an 'x'
9133 register. However, if this is
9135 (set (reg) (zero_extend (reg)))
9137 we must cost the explicit register move. */
9138 if (mode == DImode
9139 && GET_MODE (op0) == SImode
9140 && outer == SET)
9142 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9144 /* If OP_COST is non-zero, then the cost of the zero extend
9145 is effectively the cost of the inner operation. Otherwise
9146 we have a MOV instruction and we take the cost from the MOV
9147 itself. This is true independently of whether we are
9148 optimizing for space or time. */
9149 if (op_cost)
9150 *cost = op_cost;
9152 return true;
9154 else if (MEM_P (op0))
9156 /* All loads can zero extend to any size for free. */
9157 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9158 return true;
9161 op0 = aarch64_extend_bitfield_pattern_p (x);
9162 if (op0)
9164 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9165 if (speed)
9166 *cost += extra_cost->alu.bfx;
9167 return true;
9170 if (speed)
9172 if (VECTOR_MODE_P (mode))
9174 /* UMOV. */
9175 *cost += extra_cost->vect.alu;
9177 else
9179 /* We generate an AND instead of UXTB/UXTH. */
9180 *cost += extra_cost->alu.logical;
9183 return false;
9185 case SIGN_EXTEND:
9186 if (MEM_P (XEXP (x, 0)))
9188 /* LDRSH. */
9189 if (speed)
9191 rtx address = XEXP (XEXP (x, 0), 0);
9192 *cost += extra_cost->ldst.load_sign_extend;
9194 *cost +=
9195 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9196 0, speed));
9198 return true;
9201 op0 = aarch64_extend_bitfield_pattern_p (x);
9202 if (op0)
9204 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9205 if (speed)
9206 *cost += extra_cost->alu.bfx;
9207 return true;
9210 if (speed)
9212 if (VECTOR_MODE_P (mode))
9213 *cost += extra_cost->vect.alu;
9214 else
9215 *cost += extra_cost->alu.extend;
9217 return false;
9219 case ASHIFT:
9220 op0 = XEXP (x, 0);
9221 op1 = XEXP (x, 1);
9223 if (CONST_INT_P (op1))
9225 if (speed)
9227 if (VECTOR_MODE_P (mode))
9229 /* Vector shift (immediate). */
9230 *cost += extra_cost->vect.alu;
9232 else
9234 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9235 aliases. */
9236 *cost += extra_cost->alu.shift;
9240 /* We can incorporate zero/sign extend for free. */
9241 if (GET_CODE (op0) == ZERO_EXTEND
9242 || GET_CODE (op0) == SIGN_EXTEND)
9243 op0 = XEXP (op0, 0);
9245 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9246 return true;
9248 else
9250 if (VECTOR_MODE_P (mode))
9252 if (speed)
9253 /* Vector shift (register). */
9254 *cost += extra_cost->vect.alu;
9256 else
9258 if (speed)
9259 /* LSLV. */
9260 *cost += extra_cost->alu.shift_reg;
9262 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9263 && CONST_INT_P (XEXP (op1, 1))
9264 && known_eq (INTVAL (XEXP (op1, 1)),
9265 GET_MODE_BITSIZE (mode) - 1))
9267 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9268 /* We already demanded XEXP (op1, 0) to be REG_P, so
9269 don't recurse into it. */
9270 return true;
9273 return false; /* All arguments need to be in registers. */
9276 case ROTATE:
9277 case ROTATERT:
9278 case LSHIFTRT:
9279 case ASHIFTRT:
9280 op0 = XEXP (x, 0);
9281 op1 = XEXP (x, 1);
9283 if (CONST_INT_P (op1))
9285 /* ASR (immediate) and friends. */
9286 if (speed)
9288 if (VECTOR_MODE_P (mode))
9289 *cost += extra_cost->vect.alu;
9290 else
9291 *cost += extra_cost->alu.shift;
9294 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9295 return true;
9297 else
9299 if (VECTOR_MODE_P (mode))
9301 if (speed)
9302 /* Vector shift (register). */
9303 *cost += extra_cost->vect.alu;
9305 else
9307 if (speed)
9308 /* ASR (register) and friends. */
9309 *cost += extra_cost->alu.shift_reg;
9311 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9312 && CONST_INT_P (XEXP (op1, 1))
9313 && known_eq (INTVAL (XEXP (op1, 1)),
9314 GET_MODE_BITSIZE (mode) - 1))
9316 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9317 /* We already demanded XEXP (op1, 0) to be REG_P, so
9318 don't recurse into it. */
9319 return true;
9322 return false; /* All arguments need to be in registers. */
9325 case SYMBOL_REF:
9327 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9328 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9330 /* LDR. */
9331 if (speed)
9332 *cost += extra_cost->ldst.load;
9334 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9335 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9337 /* ADRP, followed by ADD. */
9338 *cost += COSTS_N_INSNS (1);
9339 if (speed)
9340 *cost += 2 * extra_cost->alu.arith;
9342 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9343 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9345 /* ADR. */
9346 if (speed)
9347 *cost += extra_cost->alu.arith;
9350 if (flag_pic)
9352 /* One extra load instruction, after accessing the GOT. */
9353 *cost += COSTS_N_INSNS (1);
9354 if (speed)
9355 *cost += extra_cost->ldst.load;
9357 return true;
9359 case HIGH:
9360 case LO_SUM:
9361 /* ADRP/ADD (immediate). */
9362 if (speed)
9363 *cost += extra_cost->alu.arith;
9364 return true;
9366 case ZERO_EXTRACT:
9367 case SIGN_EXTRACT:
9368 /* UBFX/SBFX. */
9369 if (speed)
9371 if (VECTOR_MODE_P (mode))
9372 *cost += extra_cost->vect.alu;
9373 else
9374 *cost += extra_cost->alu.bfx;
9377 /* We can trust that the immediates used will be correct (there
9378 are no by-register forms), so we need only cost op0. */
9379 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9380 return true;
9382 case MULT:
9383 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9384 /* aarch64_rtx_mult_cost always handles recursion to its
9385 operands. */
9386 return true;
9388 case MOD:
9389 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9390 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9391 an unconditional negate. This case should only ever be reached through
9392 the set_smod_pow2_cheap check in expmed.c. */
9393 if (CONST_INT_P (XEXP (x, 1))
9394 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9395 && (mode == SImode || mode == DImode))
9397 /* We expand to 4 instructions. Reset the baseline. */
9398 *cost = COSTS_N_INSNS (4);
9400 if (speed)
9401 *cost += 2 * extra_cost->alu.logical
9402 + 2 * extra_cost->alu.arith;
9404 return true;
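/* For example, a signed SImode "x % 8" is expanded as the
   NEGS/AND/AND/CSNEG sequence described above, hence the 4-instruction
   baseline.  */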
9407 /* Fall-through. */
9408 case UMOD:
9409 if (speed)
9411 /* Slightly prefer UMOD over SMOD. */
9412 if (VECTOR_MODE_P (mode))
9413 *cost += extra_cost->vect.alu;
9414 else if (GET_MODE_CLASS (mode) == MODE_INT)
9415 *cost += (extra_cost->mult[mode == DImode].add
9416 + extra_cost->mult[mode == DImode].idiv
9417 + (code == MOD ? 1 : 0));
9419 return false; /* All arguments need to be in registers. */
9421 case DIV:
9422 case UDIV:
9423 case SQRT:
9424 if (speed)
9426 if (VECTOR_MODE_P (mode))
9427 *cost += extra_cost->vect.alu;
9428 else if (GET_MODE_CLASS (mode) == MODE_INT)
9429 /* There is no integer SQRT, so only DIV and UDIV can get
9430 here. */
9431 *cost += (extra_cost->mult[mode == DImode].idiv
9432 /* Slightly prefer UDIV over SDIV. */
9433 + (code == DIV ? 1 : 0));
9434 else
9435 *cost += extra_cost->fp[mode == DFmode].div;
9437 return false; /* All arguments need to be in registers. */
9439 case IF_THEN_ELSE:
9440 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9441 XEXP (x, 2), cost, speed);
9443 case EQ:
9444 case NE:
9445 case GT:
9446 case GTU:
9447 case LT:
9448 case LTU:
9449 case GE:
9450 case GEU:
9451 case LE:
9452 case LEU:
9454 return false; /* All arguments must be in registers. */
9456 case FMA:
9457 op0 = XEXP (x, 0);
9458 op1 = XEXP (x, 1);
9459 op2 = XEXP (x, 2);
9461 if (speed)
9463 if (VECTOR_MODE_P (mode))
9464 *cost += extra_cost->vect.alu;
9465 else
9466 *cost += extra_cost->fp[mode == DFmode].fma;
9469 /* FMSUB, FNMADD, and FNMSUB are free. */
9470 if (GET_CODE (op0) == NEG)
9471 op0 = XEXP (op0, 0);
9473 if (GET_CODE (op2) == NEG)
9474 op2 = XEXP (op2, 0);
9476 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9477 and the by-element operand as operand 0. */
9478 if (GET_CODE (op1) == NEG)
9479 op1 = XEXP (op1, 0);
9481 /* Catch vector-by-element operations. The by-element operand can
9482 either be (vec_duplicate (vec_select (x))) or just
9483 (vec_select (x)), depending on whether we are multiplying by
9484 a vector or a scalar.
9486 Canonicalization is not very good in these cases: FMA4 will put the
9487 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9488 if (GET_CODE (op0) == VEC_DUPLICATE)
9489 op0 = XEXP (op0, 0);
9490 else if (GET_CODE (op1) == VEC_DUPLICATE)
9491 op1 = XEXP (op1, 0);
9493 if (GET_CODE (op0) == VEC_SELECT)
9494 op0 = XEXP (op0, 0);
9495 else if (GET_CODE (op1) == VEC_SELECT)
9496 op1 = XEXP (op1, 0);
9498 /* If the remaining parameters are not registers,
9499 get the cost to put them into registers. */
9500 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9501 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9502 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9503 return true;
9505 case FLOAT:
9506 case UNSIGNED_FLOAT:
9507 if (speed)
9508 *cost += extra_cost->fp[mode == DFmode].fromint;
9509 return false;
9511 case FLOAT_EXTEND:
9512 if (speed)
9514 if (VECTOR_MODE_P (mode))
9516 /* Vector widening conversion. */
9517 *cost += extra_cost->vect.alu;
9519 else
9520 *cost += extra_cost->fp[mode == DFmode].widen;
9522 return false;
9524 case FLOAT_TRUNCATE:
9525 if (speed)
9527 if (VECTOR_MODE_P (mode))
9529 /* Vector narrowing conversion. */
9530 *cost += extra_cost->vect.alu;
9532 else
9533 *cost += extra_cost->fp[mode == DFmode].narrow;
9535 return false;
9537 case FIX:
9538 case UNSIGNED_FIX:
9539 x = XEXP (x, 0);
9540 /* Strip the rounding part. They will all be implemented
9541 by the fcvt* family of instructions anyway. */
9542 if (GET_CODE (x) == UNSPEC)
9544 unsigned int uns_code = XINT (x, 1);
9546 if (uns_code == UNSPEC_FRINTA
9547 || uns_code == UNSPEC_FRINTM
9548 || uns_code == UNSPEC_FRINTN
9549 || uns_code == UNSPEC_FRINTP
9550 || uns_code == UNSPEC_FRINTZ)
9551 x = XVECEXP (x, 0, 0);
9554 if (speed)
9556 if (VECTOR_MODE_P (mode))
9557 *cost += extra_cost->vect.alu;
9558 else
9559 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9562 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9563 fixed-point fcvt. */
9564 if (GET_CODE (x) == MULT
9565 && ((VECTOR_MODE_P (mode)
9566 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9567 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9569 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9570 0, speed);
9571 return true;
9574 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9575 return true;
9577 case ABS:
9578 if (VECTOR_MODE_P (mode))
9580 /* ABS (vector). */
9581 if (speed)
9582 *cost += extra_cost->vect.alu;
9584 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9586 op0 = XEXP (x, 0);
9588 /* FABD, which is analogous to FADD. */
9589 if (GET_CODE (op0) == MINUS)
9591 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9592 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9593 if (speed)
9594 *cost += extra_cost->fp[mode == DFmode].addsub;
9596 return true;
9598 /* Simple FABS is analogous to FNEG. */
9599 if (speed)
9600 *cost += extra_cost->fp[mode == DFmode].neg;
9602 else
9604 /* Integer ABS will either be split into
9605 two arithmetic instructions, or will be an ABS
9606 (scalar), which we don't model. */
9607 *cost = COSTS_N_INSNS (2);
9608 if (speed)
9609 *cost += 2 * extra_cost->alu.arith;
9611 return false;
9613 case SMAX:
9614 case SMIN:
9615 if (speed)
9617 if (VECTOR_MODE_P (mode))
9618 *cost += extra_cost->vect.alu;
9619 else
9621 /* FMAXNM/FMINNM/FMAX/FMIN.
9622 TODO: This may not be accurate for all implementations, but
9623 we do not model this in the cost tables. */
9624 *cost += extra_cost->fp[mode == DFmode].addsub;
9627 return false;
9629 case UNSPEC:
9630 /* The floating point round to integer frint* instructions. */
9631 if (aarch64_frint_unspec_p (XINT (x, 1)))
9633 if (speed)
9634 *cost += extra_cost->fp[mode == DFmode].roundint;
9636 return false;
9639 if (XINT (x, 1) == UNSPEC_RBIT)
9641 if (speed)
9642 *cost += extra_cost->alu.rev;
9644 return false;
9646 break;
9648 case TRUNCATE:
9650 /* Decompose <su>muldi3_highpart. */
9651 if (/* (truncate:DI */
9652 mode == DImode
9653 /* (lshiftrt:TI */
9654 && GET_MODE (XEXP (x, 0)) == TImode
9655 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9656 /* (mult:TI */
9657 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9658 /* (ANY_EXTEND:TI (reg:DI))
9659 (ANY_EXTEND:TI (reg:DI))) */
9660 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9661 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9662 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9663 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9664 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9665 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9666 /* (const_int 64) */
9667 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9668 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9670 /* UMULH/SMULH. */
9671 if (speed)
9672 *cost += extra_cost->mult[mode == DImode].extend;
9673 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9674 mode, MULT, 0, speed);
9675 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9676 mode, MULT, 1, speed);
9677 return true;
9680 /* Fall through. */
9681 default:
9682 break;
9685 if (dump_file
9686 && flag_aarch64_verbose_cost)
9687 fprintf (dump_file,
9688 "\nFailed to cost RTX. Assuming default cost.\n");
9690 return true;
9693 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9694 calculated for X. This cost is stored in *COST. Returns true
9695 if the total cost of X was calculated. */
9696 static bool
9697 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9698 int param, int *cost, bool speed)
9700 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9702 if (dump_file
9703 && flag_aarch64_verbose_cost)
9705 print_rtl_single (dump_file, x);
9706 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9707 speed ? "Hot" : "Cold",
9708 *cost, result ? "final" : "partial");
9711 return result;
9714 static int
9715 aarch64_register_move_cost (machine_mode mode,
9716 reg_class_t from_i, reg_class_t to_i)
9718 enum reg_class from = (enum reg_class) from_i;
9719 enum reg_class to = (enum reg_class) to_i;
9720 const struct cpu_regmove_cost *regmove_cost
9721 = aarch64_tune_params.regmove_cost;
9723 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9724 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9725 to = GENERAL_REGS;
9727 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9728 from = GENERAL_REGS;
9730 /* Moving between GPR and stack cost is the same as GP2GP. */
9731 if ((from == GENERAL_REGS && to == STACK_REG)
9732 || (to == GENERAL_REGS && from == STACK_REG))
9733 return regmove_cost->GP2GP;
9735 /* To/From the stack register, we move via the gprs. */
9736 if (to == STACK_REG || from == STACK_REG)
9737 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9738 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9740 if (known_eq (GET_MODE_SIZE (mode), 16))
9742 /* 128-bit operations on general registers require 2 instructions. */
9743 if (from == GENERAL_REGS && to == GENERAL_REGS)
9744 return regmove_cost->GP2GP * 2;
9745 else if (from == GENERAL_REGS)
9746 return regmove_cost->GP2FP * 2;
9747 else if (to == GENERAL_REGS)
9748 return regmove_cost->FP2GP * 2;
9750 /* When AdvSIMD instructions are disabled it is not possible to move
9751 a 128-bit value directly between Q registers. This is handled in
9752 secondary reload. A general register is used as a scratch to move
9753 the upper DI value and the lower DI value is moved directly,
9754 hence the cost is the sum of three moves. */
9755 if (! TARGET_SIMD)
9756 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9758 return regmove_cost->FP2FP;
9761 if (from == GENERAL_REGS && to == GENERAL_REGS)
9762 return regmove_cost->GP2GP;
9763 else if (from == GENERAL_REGS)
9764 return regmove_cost->GP2FP;
9765 else if (to == GENERAL_REGS)
9766 return regmove_cost->FP2GP;
9768 return regmove_cost->FP2FP;
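/* Illustration: a 16-byte (e.g. TImode) move from GENERAL_REGS to
   FP_REGS is costed as 2 * GP2FP, since it needs two 64-bit transfers,
   while the same 128-bit move between FP registers without TARGET_SIMD
   is GP2FP + FP2GP + FP2FP because it must bounce through a general
   register.  */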
9771 static int
9772 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9773 reg_class_t rclass ATTRIBUTE_UNUSED,
9774 bool in ATTRIBUTE_UNUSED)
9776 return aarch64_tune_params.memmov_cost;
9779 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9780 to optimize 1.0/sqrt. */
9782 static bool
9783 use_rsqrt_p (machine_mode mode)
9785 return (!flag_trapping_math
9786 && flag_unsafe_math_optimizations
9787 && ((aarch64_tune_params.approx_modes->recip_sqrt
9788 & AARCH64_APPROX_MODE (mode))
9789 || flag_mrecip_low_precision_sqrt));
9792 /* Function to decide when to use the approximate reciprocal square root
9793 builtin. */
9795 static tree
9796 aarch64_builtin_reciprocal (tree fndecl)
9798 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9800 if (!use_rsqrt_p (mode))
9801 return NULL_TREE;
9802 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9805 typedef rtx (*rsqrte_type) (rtx, rtx);
9807 /* Select reciprocal square root initial estimate insn depending on machine
9808 mode. */
9810 static rsqrte_type
9811 get_rsqrte_type (machine_mode mode)
9813 switch (mode)
9815 case E_DFmode: return gen_aarch64_rsqrtedf;
9816 case E_SFmode: return gen_aarch64_rsqrtesf;
9817 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9818 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9819 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9820 default: gcc_unreachable ();
9824 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9826 /* Select reciprocal square root series step insn depending on machine mode. */
9828 static rsqrts_type
9829 get_rsqrts_type (machine_mode mode)
9831 switch (mode)
9833 case E_DFmode: return gen_aarch64_rsqrtsdf;
9834 case E_SFmode: return gen_aarch64_rsqrtssf;
9835 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9836 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9837 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9838 default: gcc_unreachable ();
9842 /* Emit instruction sequence to compute either the approximate square root
9843 or its approximate reciprocal, depending on the flag RECP, and return
9844 whether the sequence was emitted or not. */
9846 bool
9847 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9849 machine_mode mode = GET_MODE (dst);
9851 if (GET_MODE_INNER (mode) == HFmode)
9853 gcc_assert (!recp);
9854 return false;
9857 if (!recp)
9859 if (!(flag_mlow_precision_sqrt
9860 || (aarch64_tune_params.approx_modes->sqrt
9861 & AARCH64_APPROX_MODE (mode))))
9862 return false;
9864 if (flag_finite_math_only
9865 || flag_trapping_math
9866 || !flag_unsafe_math_optimizations
9867 || optimize_function_for_size_p (cfun))
9868 return false;
9870 else
9871 /* Caller assumes we cannot fail. */
9872 gcc_assert (use_rsqrt_p (mode));
9874 machine_mode mmsk = mode_for_int_vector (mode).require ();
9875 rtx xmsk = gen_reg_rtx (mmsk);
9876 if (!recp)
9877 /* When calculating the approximate square root, compare the
9878 argument with 0.0 and create a mask. */
9879 emit_insn (gen_rtx_SET (xmsk,
9880 gen_rtx_NEG (mmsk,
9881 gen_rtx_EQ (mmsk, src,
9882 CONST0_RTX (mode)))));
9884 /* Estimate the approximate reciprocal square root. */
9885 rtx xdst = gen_reg_rtx (mode);
9886 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9888 /* Iterate over the series twice for SF and thrice for DF. */
9889 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9891 /* Optionally run one iteration fewer for faster performance at the
9892 cost of some accuracy. */
9893 if ((recp && flag_mrecip_low_precision_sqrt)
9894 || (!recp && flag_mlow_precision_sqrt))
9895 iterations--;
9897 /* Iterate over the series to calculate the approximate reciprocal square
9898 root. */
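/* Each pass of the loop below is, in effect, one Newton-Raphson step for
   1/sqrt(d): assuming the architectural FRSQRTS semantics of
   (3 - a * b) / 2, the step computes x' = x * (3 - d * x * x) / 2 for the
   current estimate x.  The multiply that completes the last step is
   folded into the finalization further down.  */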
9899 rtx x1 = gen_reg_rtx (mode);
9900 while (iterations--)
9902 rtx x2 = gen_reg_rtx (mode);
9903 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9905 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9907 if (iterations > 0)
9908 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9911 if (!recp)
9913 /* Qualify the approximate reciprocal square root when the argument is
9914 0.0 by squashing the intermediate result to 0.0. */
9915 rtx xtmp = gen_reg_rtx (mmsk);
9916 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9917 gen_rtx_SUBREG (mmsk, xdst, 0)));
9918 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9920 /* Calculate the approximate square root. */
9921 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9924 /* Finalize the approximation. */
9925 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9927 return true;
9930 typedef rtx (*recpe_type) (rtx, rtx);
9932 /* Select reciprocal initial estimate insn depending on machine mode. */
9934 static recpe_type
9935 get_recpe_type (machine_mode mode)
9937 switch (mode)
9939 case E_SFmode: return (gen_aarch64_frecpesf);
9940 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9941 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9942 case E_DFmode: return (gen_aarch64_frecpedf);
9943 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9944 default: gcc_unreachable ();
9948 typedef rtx (*recps_type) (rtx, rtx, rtx);
9950 /* Select reciprocal series step insn depending on machine mode. */
9952 static recps_type
9953 get_recps_type (machine_mode mode)
9955 switch (mode)
9957 case E_SFmode: return (gen_aarch64_frecpssf);
9958 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9959 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9960 case E_DFmode: return (gen_aarch64_frecpsdf);
9961 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9962 default: gcc_unreachable ();
9966 /* Emit the instruction sequence to compute the approximation for the division
9967 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9969 bool
9970 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9972 machine_mode mode = GET_MODE (quo);
9974 if (GET_MODE_INNER (mode) == HFmode)
9975 return false;
9977 bool use_approx_division_p = (flag_mlow_precision_div
9978 || (aarch64_tune_params.approx_modes->division
9979 & AARCH64_APPROX_MODE (mode)));
9981 if (!flag_finite_math_only
9982 || flag_trapping_math
9983 || !flag_unsafe_math_optimizations
9984 || optimize_function_for_size_p (cfun)
9985 || !use_approx_division_p)
9986 return false;
9988 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9989 return false;
9991 /* Estimate the approximate reciprocal. */
9992 rtx xrcp = gen_reg_rtx (mode);
9993 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9995 /* Iterate over the series twice for SF and thrice for DF. */
9996 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9998 /* Optionally run one iteration fewer for faster performance at the
9999 cost of some accuracy. */
10000 if (flag_mlow_precision_div)
10001 iterations--;
10003 /* Iterate over the series to calculate the approximate reciprocal. */
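/* Each pass of the loop below is one Newton-Raphson step for 1/den:
   assuming the architectural FRECPS semantics of (2 - a * b), the step
   computes x' = x * (2 - den * x) for the current estimate x.  As with
   the square root sequence above, the multiply that completes the last
   step is folded into the final statement.  */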
10004 rtx xtmp = gen_reg_rtx (mode);
10005 while (iterations--)
10007 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10009 if (iterations > 0)
10010 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10013 if (num != CONST1_RTX (mode))
10015 /* As the approximate reciprocal of DEN is already calculated, only
10016 calculate the approximate division when NUM is not 1.0. */
10017 rtx xnum = force_reg (mode, num);
10018 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10021 /* Finalize the approximation. */
10022 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10023 return true;
10026 /* Return the number of instructions that can be issued per cycle. */
10027 static int
10028 aarch64_sched_issue_rate (void)
10030 return aarch64_tune_params.issue_rate;
10033 static int
10034 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10036 int issue_rate = aarch64_sched_issue_rate ();
10038 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10042 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10043 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10044 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10046 static int
10047 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10048 int ready_index)
10050 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10054 /* Vectorizer cost model target hooks. */
10056 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10057 static int
10058 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10059 tree vectype,
10060 int misalign ATTRIBUTE_UNUSED)
10062 unsigned elements;
10063 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10064 bool fp = false;
10066 if (vectype != NULL)
10067 fp = FLOAT_TYPE_P (vectype);
10069 switch (type_of_cost)
10071 case scalar_stmt:
10072 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10074 case scalar_load:
10075 return costs->scalar_load_cost;
10077 case scalar_store:
10078 return costs->scalar_store_cost;
10080 case vector_stmt:
10081 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10083 case vector_load:
10084 return costs->vec_align_load_cost;
10086 case vector_store:
10087 return costs->vec_store_cost;
10089 case vec_to_scalar:
10090 return costs->vec_to_scalar_cost;
10092 case scalar_to_vec:
10093 return costs->scalar_to_vec_cost;
10095 case unaligned_load:
10096 case vector_gather_load:
10097 return costs->vec_unalign_load_cost;
10099 case unaligned_store:
10100 case vector_scatter_store:
10101 return costs->vec_unalign_store_cost;
10103 case cond_branch_taken:
10104 return costs->cond_taken_branch_cost;
10106 case cond_branch_not_taken:
10107 return costs->cond_not_taken_branch_cost;
10109 case vec_perm:
10110 return costs->vec_permute_cost;
10112 case vec_promote_demote:
10113 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10115 case vec_construct:
10116 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10117 return elements / 2 + 1;
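/* For example, constructing a V4SF vector from 4 scalar elements is
   costed as 4 / 2 + 1 = 3, and a V2DI construction as 2 / 2 + 1 = 2.
   This is purely the arithmetic of the formula above; no per-CPU cost
   table is consulted for this case.  */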
10119 default:
10120 gcc_unreachable ();
10124 /* Implement targetm.vectorize.add_stmt_cost. */
10125 static unsigned
10126 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10127 struct _stmt_vec_info *stmt_info, int misalign,
10128 enum vect_cost_model_location where)
10130 unsigned *cost = (unsigned *) data;
10131 unsigned retval = 0;
10133 if (flag_vect_cost_model)
10135 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10136 int stmt_cost =
10137 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10139 /* Statements in an inner loop relative to the loop being
10140 vectorized are weighted more heavily. The value here is
10141 arbitrary and could potentially be improved with analysis. */
10142 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10143 count *= 50; /* FIXME */
10145 retval = (unsigned) (count * stmt_cost);
10146 cost[where] += retval;
10149 return retval;
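/* For instance, a vector_load in the body of the loop being vectorized
   contributes count * vec_align_load_cost to the vect_body bucket; if the
   statement lies in an inner loop relative to the vectorized loop, the
   weighting above additionally multiplies that contribution by 50.  */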
10152 static void initialize_aarch64_code_model (struct gcc_options *);
10154 /* Parse the TO_PARSE string and put the architecture struct that it
10155 selects into RES and the architectural features into ISA_FLAGS.
10156 Return an aarch64_parse_opt_result describing the parse result.
10157 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
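/* As an illustration, given "armv8-a+crc" (the value of an -march=
   option), the string is split at the first '+': "armv8-a" is looked up
   in all_architectures and the remaining "+crc" is handed to
   aarch64_parse_extension.  The extension chosen here is only an example;
   any modifier that aarch64_parse_extension accepts is handled the same
   way.  */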
10159 static enum aarch64_parse_opt_result
10160 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10161 unsigned long *isa_flags)
10163 char *ext;
10164 const struct processor *arch;
10165 char *str = (char *) alloca (strlen (to_parse) + 1);
10166 size_t len;
10168 strcpy (str, to_parse);
10170 ext = strchr (str, '+');
10172 if (ext != NULL)
10173 len = ext - str;
10174 else
10175 len = strlen (str);
10177 if (len == 0)
10178 return AARCH64_PARSE_MISSING_ARG;
10181 /* Loop through the list of supported ARCHes to find a match. */
10182 for (arch = all_architectures; arch->name != NULL; arch++)
10184 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10186 unsigned long isa_temp = arch->flags;
10188 if (ext != NULL)
10190 /* TO_PARSE string contains at least one extension. */
10191 enum aarch64_parse_opt_result ext_res
10192 = aarch64_parse_extension (ext, &isa_temp);
10194 if (ext_res != AARCH64_PARSE_OK)
10195 return ext_res;
10197 /* Extension parsing was successful. Confirm the result
10198 arch and ISA flags. */
10199 *res = arch;
10200 *isa_flags = isa_temp;
10201 return AARCH64_PARSE_OK;
10205 /* ARCH name not found in list. */
10206 return AARCH64_PARSE_INVALID_ARG;
10209 /* Parse the TO_PARSE string and put the result tuning in RES and the
10210 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10211 describing the parse result. If there is an error parsing, RES and
10212 ISA_FLAGS are left unchanged. */
10214 static enum aarch64_parse_opt_result
10215 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10216 unsigned long *isa_flags)
10218 char *ext;
10219 const struct processor *cpu;
10220 char *str = (char *) alloca (strlen (to_parse) + 1);
10221 size_t len;
10223 strcpy (str, to_parse);
10225 ext = strchr (str, '+');
10227 if (ext != NULL)
10228 len = ext - str;
10229 else
10230 len = strlen (str);
10232 if (len == 0)
10233 return AARCH64_PARSE_MISSING_ARG;
10236 /* Loop through the list of supported CPUs to find a match. */
10237 for (cpu = all_cores; cpu->name != NULL; cpu++)
10239 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10241 unsigned long isa_temp = cpu->flags;
10244 if (ext != NULL)
10246 /* TO_PARSE string contains at least one extension. */
10247 enum aarch64_parse_opt_result ext_res
10248 = aarch64_parse_extension (ext, &isa_temp);
10250 if (ext_res != AARCH64_PARSE_OK)
10251 return ext_res;
10253 /* Extension parsing was successful. Confirm the result
10254 cpu and ISA flags. */
10255 *res = cpu;
10256 *isa_flags = isa_temp;
10257 return AARCH64_PARSE_OK;
10261 /* CPU name not found in list. */
10262 return AARCH64_PARSE_INVALID_ARG;
10265 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10266 Return an aarch64_parse_opt_result describing the parse result.
10267 If the parsing fails, RES is left unchanged. */
10269 static enum aarch64_parse_opt_result
10270 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10272 const struct processor *cpu;
10273 char *str = (char *) alloca (strlen (to_parse) + 1);
10275 strcpy (str, to_parse);
10277 /* Loop through the list of supported CPUs to find a match. */
10278 for (cpu = all_cores; cpu->name != NULL; cpu++)
10280 if (strcmp (cpu->name, str) == 0)
10282 *res = cpu;
10283 return AARCH64_PARSE_OK;
10287 /* CPU name not found in list. */
10288 return AARCH64_PARSE_INVALID_ARG;
10291 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10292 described in FLAG. If it is, return the index bit for that fusion type.
10293 If not, error (printing OPTION_NAME) and return zero. */
10295 static unsigned int
10296 aarch64_parse_one_option_token (const char *token,
10297 size_t length,
10298 const struct aarch64_flag_desc *flag,
10299 const char *option_name)
10301 for (; flag->name != NULL; flag++)
10303 if (length == strlen (flag->name)
10304 && !strncmp (flag->name, token, length))
10305 return flag->flag;
10308 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10309 return 0;
10312 /* Parse OPTION which is a comma-separated list of flags to enable.
10313 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10314 default state we inherit from the CPU tuning structures. OPTION_NAME
10315 gives the top-level option we are parsing in the -moverride string,
10316 for use in error messages. */
10318 static unsigned int
10319 aarch64_parse_boolean_options (const char *option,
10320 const struct aarch64_flag_desc *flags,
10321 unsigned int initial_state,
10322 const char *option_name)
10324 const char separator = '.';
10325 const char* specs = option;
10326 const char* ntoken = option;
10327 unsigned int found_flags = initial_state;
10329 while ((ntoken = strchr (specs, separator)))
10331 size_t token_length = ntoken - specs;
10332 unsigned token_ops = aarch64_parse_one_option_token (specs,
10333 token_length,
10334 flags,
10335 option_name);
10336 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10337 in the token stream, reset the supported operations. So:
10339 adrp+add.cmp+branch.none.adrp+add
10341 would turn on only adrp+add fusion. */
10342 if (!token_ops)
10343 found_flags = 0;
10345 found_flags |= token_ops;
10346 specs = ++ntoken;
10349 /* The string ended with a trailing separator, which is ill-formed. */
10350 if (!(*specs))
10352 error ("%s string ill-formed\n", option_name);
10353 return 0;
10356 /* We still have one more token to parse. */
10357 size_t token_length = strlen (specs);
10358 unsigned token_ops = aarch64_parse_one_option_token (specs,
10359 token_length,
10360 flags,
10361 option_name);
10362 if (!token_ops)
10363 found_flags = 0;
10365 found_flags |= token_ops;
10366 return found_flags;
10369 /* Support for overriding instruction fusion. */
10371 static void
10372 aarch64_parse_fuse_string (const char *fuse_string,
10373 struct tune_params *tune)
10375 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10376 aarch64_fusible_pairs,
10377 tune->fusible_ops,
10378 "fuse=");
10381 /* Support for overriding other tuning flags. */
10383 static void
10384 aarch64_parse_tune_string (const char *tune_string,
10385 struct tune_params *tune)
10387 tune->extra_tuning_flags
10388 = aarch64_parse_boolean_options (tune_string,
10389 aarch64_tuning_flags,
10390 tune->extra_tuning_flags,
10391 "tune=");
10394 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10395 we understand. If it is, extract the option string and hand it off to
10396 the appropriate function. */
10398 void
10399 aarch64_parse_one_override_token (const char* token,
10400 size_t length,
10401 struct tune_params *tune)
10403 const struct aarch64_tuning_override_function *fn
10404 = aarch64_tuning_override_functions;
10406 const char *option_part = strchr (token, '=');
10407 if (!option_part)
10409 error ("tuning string missing in option (%s)", token);
10410 return;
10413 /* Get the length of the option name. */
10414 length = option_part - token;
10415 /* Skip the '=' to get to the option string. */
10416 option_part++;
10418 for (; fn->name != NULL; fn++)
10420 if (!strncmp (fn->name, token, length))
10422 fn->parse_override (option_part, tune);
10423 return;
10427 error ("unknown tuning option (%s)",token);
10428 return;
10431 /* Validate the chosen TLS size against the selected code model, clamping it where necessary. */
10433 static void
10434 initialize_aarch64_tls_size (struct gcc_options *opts)
10436 if (aarch64_tls_size == 0)
10437 aarch64_tls_size = 24;
10439 switch (opts->x_aarch64_cmodel_var)
10441 case AARCH64_CMODEL_TINY:
10442 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
10443 needs two instructions to address, so we clamp the size to 24. */
10444 if (aarch64_tls_size > 24)
10445 aarch64_tls_size = 24;
10446 break;
10447 case AARCH64_CMODEL_SMALL:
10448 /* The maximum TLS size allowed under small is 4G. */
10449 if (aarch64_tls_size > 32)
10450 aarch64_tls_size = 32;
10451 break;
10452 case AARCH64_CMODEL_LARGE:
10453 /* The maximum TLS size allowed under large is 16E.
10454 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset for now. */
10455 if (aarch64_tls_size > 48)
10456 aarch64_tls_size = 48;
10457 break;
10458 default:
10459 gcc_unreachable ();
10462 return;
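/* For example, with the clamping above, an explicit -mtls-size=48 is kept
   as-is under the large model, clamped to 32 under the small model and
   clamped to 24 under tiny.  */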
10465 /* Parse STRING looking for options in the format:
10466 string :: option:string
10467 option :: name=substring
10468 name :: {a-z}
10469 substring :: defined by option. */
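/* As an illustrative example, a string such as

     fuse=adrp+add.cmp+branch:tune=<flag>

   is split at ':' into two options, "fuse=..." and "tune=...", each of
   which aarch64_parse_one_override_token dispatches to
   aarch64_parse_fuse_string or aarch64_parse_tune_string respectively.
   "<flag>" here is a placeholder for any name listed in
   aarch64_tuning_flags.  */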
10471 static void
10472 aarch64_parse_override_string (const char* input_string,
10473 struct tune_params* tune)
10475 const char separator = ':';
10476 size_t string_length = strlen (input_string) + 1;
10477 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10478 char *string = string_root;
10479 strncpy (string, input_string, string_length);
10480 string[string_length - 1] = '\0';
10482 char* ntoken = string;
10484 while ((ntoken = strchr (string, separator)))
10486 size_t token_length = ntoken - string;
10487 /* Make this substring look like a string. */
10488 *ntoken = '\0';
10489 aarch64_parse_one_override_token (string, token_length, tune);
10490 string = ++ntoken;
10493 /* One last option to parse. */
10494 aarch64_parse_one_override_token (string, strlen (string), tune);
10495 free (string_root);
10499 static void
10500 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10502 /* PR 70044: We have to be careful about being called multiple times for the
10503 same function. This means all changes should be repeatable. */
10505 /* If the frame pointer is enabled, set it to a special value that behaves
10506 similar to frame pointer omission. If we don't do this all leaf functions
10507 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10508 If flag_omit_frame_pointer has this special value, we must force the
10509 frame pointer if not in a leaf function. We also need to force it in a
10510 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10511 if (opts->x_flag_omit_frame_pointer == 0)
10512 opts->x_flag_omit_frame_pointer = 2;
10514 /* If not optimizing for size, set the default
10515 alignment to what the target wants. */
10516 if (!opts->x_optimize_size)
10518 if (opts->x_align_loops <= 0)
10519 opts->x_align_loops = aarch64_tune_params.loop_align;
10520 if (opts->x_align_jumps <= 0)
10521 opts->x_align_jumps = aarch64_tune_params.jump_align;
10522 if (opts->x_align_functions <= 0)
10523 opts->x_align_functions = aarch64_tune_params.function_align;
10526 /* We default to no pc-relative literal loads. */
10528 aarch64_pcrelative_literal_loads = false;
10530 /* If -mpc-relative-literal-loads is set on the command line, this
10531 implies that the user asked for PC relative literal loads. */
10532 if (opts->x_pcrelative_literal_loads == 1)
10533 aarch64_pcrelative_literal_loads = true;
10535 /* In the tiny memory model it makes no sense to disallow PC relative
10536 literal pool loads. */
10537 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10538 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10539 aarch64_pcrelative_literal_loads = true;
10541 /* When enabling the lower precision Newton series for the square root, also
10542 enable it for the reciprocal square root, since the latter is an
10543 intermediary step for the former. */
10544 if (flag_mlow_precision_sqrt)
10545 flag_mrecip_low_precision_sqrt = true;
10548 /* 'Unpack' the internal tuning structs and update the options
10549 in OPTS. The caller must have set up selected_tune and selected_arch
10550 as all the other target-specific codegen decisions are
10551 derived from them. */
10553 void
10554 aarch64_override_options_internal (struct gcc_options *opts)
10556 aarch64_tune_flags = selected_tune->flags;
10557 aarch64_tune = selected_tune->sched_core;
10558 /* Make a copy of the tuning parameters attached to the core, which
10559 we may later overwrite. */
10560 aarch64_tune_params = *(selected_tune->tune);
10561 aarch64_architecture_version = selected_arch->architecture_version;
10563 if (opts->x_aarch64_override_tune_string)
10564 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10565 &aarch64_tune_params);
10567 /* This target defaults to strict volatile bitfields. */
10568 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10569 opts->x_flag_strict_volatile_bitfields = 1;
10571 initialize_aarch64_code_model (opts);
10572 initialize_aarch64_tls_size (opts);
10574 int queue_depth = 0;
10575 switch (aarch64_tune_params.autoprefetcher_model)
10577 case tune_params::AUTOPREFETCHER_OFF:
10578 queue_depth = -1;
10579 break;
10580 case tune_params::AUTOPREFETCHER_WEAK:
10581 queue_depth = 0;
10582 break;
10583 case tune_params::AUTOPREFETCHER_STRONG:
10584 queue_depth = max_insn_queue_index + 1;
10585 break;
10586 default:
10587 gcc_unreachable ();
10590 /* We don't mind passing in global_options_set here as we don't use
10591 the *options_set structs anyway. */
10592 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10593 queue_depth,
10594 opts->x_param_values,
10595 global_options_set.x_param_values);
10597 /* Set up parameters to be used in prefetching algorithm. Do not
10598 override the defaults unless we are tuning for a core we have
10599 researched values for. */
10600 if (aarch64_tune_params.prefetch->num_slots > 0)
10601 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10602 aarch64_tune_params.prefetch->num_slots,
10603 opts->x_param_values,
10604 global_options_set.x_param_values);
10605 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10606 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10607 aarch64_tune_params.prefetch->l1_cache_size,
10608 opts->x_param_values,
10609 global_options_set.x_param_values);
10610 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10611 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10612 aarch64_tune_params.prefetch->l1_cache_line_size,
10613 opts->x_param_values,
10614 global_options_set.x_param_values);
10615 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10616 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10617 aarch64_tune_params.prefetch->l2_cache_size,
10618 opts->x_param_values,
10619 global_options_set.x_param_values);
10621 /* Use the alternative scheduling-pressure algorithm by default. */
10622 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10623 opts->x_param_values,
10624 global_options_set.x_param_values);
10626 /* Enable software prefetching at the specified optimization level for
10627 CPUs that have prefetch tuning parameters. Lower the optimization level
10628 threshold by 1 when profiling is enabled. */
10629 if (opts->x_flag_prefetch_loop_arrays < 0
10630 && !opts->x_optimize_size
10631 && aarch64_tune_params.prefetch->default_opt_level >= 0
10632 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10633 opts->x_flag_prefetch_loop_arrays = 1;
10635 aarch64_override_options_after_change_1 (opts);
10638 /* Print a hint with a suggestion for a core or architecture name that
10639 most closely resembles what the user passed in STR. ARCH is true if
10640 the user is asking for an architecture name. ARCH is false if the user
10641 is asking for a core name. */
10643 static void
10644 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10646 auto_vec<const char *> candidates;
10647 const struct processor *entry = arch ? all_architectures : all_cores;
10648 for (; entry->name != NULL; entry++)
10649 candidates.safe_push (entry->name);
10651 #ifdef HAVE_LOCAL_CPU_DETECT
10652 /* Add also "native" as possible value. */
10653 if (arch)
10654 candidates.safe_push ("native");
10655 #endif
10657 char *s;
10658 const char *hint = candidates_list_and_hint (str, s, candidates);
10659 if (hint)
10660 inform (input_location, "valid arguments are: %s;"
10661 " did you mean %qs?", s, hint);
10662 else
10663 inform (input_location, "valid arguments are: %s", s);
10665 XDELETEVEC (s);
10668 /* Print a hint with a suggestion for a core name that most closely resembles
10669 what the user passed in STR. */
10671 inline static void
10672 aarch64_print_hint_for_core (const char *str)
10674 aarch64_print_hint_for_core_or_arch (str, false);
10677 /* Print a hint with a suggestion for an architecture name that most closely
10678 resembles what the user passed in STR. */
10680 inline static void
10681 aarch64_print_hint_for_arch (const char *str)
10683 aarch64_print_hint_for_core_or_arch (str, true);
10686 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10687 specified in STR and throw errors if appropriate. Put the results if
10688 they are valid in RES and ISA_FLAGS. Return whether the option is
10689 valid. */
10691 static bool
10692 aarch64_validate_mcpu (const char *str, const struct processor **res,
10693 unsigned long *isa_flags)
10695 enum aarch64_parse_opt_result parse_res
10696 = aarch64_parse_cpu (str, res, isa_flags);
10698 if (parse_res == AARCH64_PARSE_OK)
10699 return true;
10701 switch (parse_res)
10703 case AARCH64_PARSE_MISSING_ARG:
10704 error ("missing cpu name in %<-mcpu=%s%>", str);
10705 break;
10706 case AARCH64_PARSE_INVALID_ARG:
10707 error ("unknown value %qs for -mcpu", str);
10708 aarch64_print_hint_for_core (str);
10709 break;
10710 case AARCH64_PARSE_INVALID_FEATURE:
10711 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10712 break;
10713 default:
10714 gcc_unreachable ();
10717 return false;
10720 /* Validate a command-line -march option. Parse the arch and extensions
10721 (if any) specified in STR and throw errors if appropriate. Put the
10722 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10723 option is valid. */
10725 static bool
10726 aarch64_validate_march (const char *str, const struct processor **res,
10727 unsigned long *isa_flags)
10729 enum aarch64_parse_opt_result parse_res
10730 = aarch64_parse_arch (str, res, isa_flags);
10732 if (parse_res == AARCH64_PARSE_OK)
10733 return true;
10735 switch (parse_res)
10737 case AARCH64_PARSE_MISSING_ARG:
10738 error ("missing arch name in %<-march=%s%>", str);
10739 break;
10740 case AARCH64_PARSE_INVALID_ARG:
10741 error ("unknown value %qs for -march", str);
10742 aarch64_print_hint_for_arch (str);
10743 break;
10744 case AARCH64_PARSE_INVALID_FEATURE:
10745 error ("invalid feature modifier in %<-march=%s%>", str);
10746 break;
10747 default:
10748 gcc_unreachable ();
10751 return false;
10754 /* Validate a command-line -mtune option. Parse the cpu
10755 specified in STR and throw errors if appropriate. Put the
10756 result, if it is valid, in RES. Return whether the option is
10757 valid. */
10759 static bool
10760 aarch64_validate_mtune (const char *str, const struct processor **res)
10762 enum aarch64_parse_opt_result parse_res
10763 = aarch64_parse_tune (str, res);
10765 if (parse_res == AARCH64_PARSE_OK)
10766 return true;
10768 switch (parse_res)
10770 case AARCH64_PARSE_MISSING_ARG:
10771 error ("missing cpu name in %<-mtune=%s%>", str);
10772 break;
10773 case AARCH64_PARSE_INVALID_ARG:
10774 error ("unknown value %qs for -mtune", str);
10775 aarch64_print_hint_for_core (str);
10776 break;
10777 default:
10778 gcc_unreachable ();
10780 return false;
10783 /* Return the CPU corresponding to the enum CPU.
10784 If it doesn't specify a cpu, return the default. */
10786 static const struct processor *
10787 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10789 if (cpu != aarch64_none)
10790 return &all_cores[cpu];
10792 /* The & 0x3f is to extract the bottom 6 bits that encode the
10793 default cpu as selected by the --with-cpu GCC configure option
10794 in config.gcc.
10795 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10796 flags mechanism should be reworked to make it more sane. */
10797 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10800 /* Return the architecture corresponding to the enum ARCH.
10801 If it doesn't specify a valid architecture, return the default. */
10803 static const struct processor *
10804 aarch64_get_arch (enum aarch64_arch arch)
10806 if (arch != aarch64_no_arch)
10807 return &all_architectures[arch];
10809 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10811 return &all_architectures[cpu->arch];
10814 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10816 static poly_uint16
10817 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10819 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10820 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10821 deciding which .md file patterns to use and when deciding whether
10822 something is a legitimate address or constant. */
10823 if (value == SVE_SCALABLE || value == SVE_128)
10824 return poly_uint16 (2, 2);
10825 else
10826 return (int) value / 64;
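/* As a rough sketch of the mapping: -msve-vector-bits=256 gives VG == 4
   (four 64-bit granules) and 512 gives VG == 8, while "scalable" -- and,
   per the comment above, 128 -- give the runtime-variable
   poly_uint16 (2, 2), i.e. 2 + 2 * N granules for some non-negative N.  */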
10829 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10830 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10831 tuning structs. In particular it must set selected_tune and
10832 aarch64_isa_flags that define the available ISA features and tuning
10833 decisions. It must also set selected_arch as this will be used to
10834 output the .arch asm tags for each function. */
10836 static void
10837 aarch64_override_options (void)
10839 unsigned long cpu_isa = 0;
10840 unsigned long arch_isa = 0;
10841 aarch64_isa_flags = 0;
10843 bool valid_cpu = true;
10844 bool valid_tune = true;
10845 bool valid_arch = true;
10847 selected_cpu = NULL;
10848 selected_arch = NULL;
10849 selected_tune = NULL;
10851 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10852 If either of -march or -mtune is given, they override their
10853 respective component of -mcpu. */
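/* For instance, the (deliberately conflicting) pair
   "-mcpu=cortex-a53 -march=armv8.1-a" keeps cortex-a53 as the tuning
   target but takes its ISA flags from armv8.1-a, after the mismatch
   warning issued below.  */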
10854 if (aarch64_cpu_string)
10855 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10856 &cpu_isa);
10858 if (aarch64_arch_string)
10859 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10860 &arch_isa);
10862 if (aarch64_tune_string)
10863 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10865 /* If the user did not specify a processor, choose the default
10866 one for them. This will be the CPU set during configuration using
10867 --with-cpu, otherwise it is "generic". */
10868 if (!selected_cpu)
10870 if (selected_arch)
10872 selected_cpu = &all_cores[selected_arch->ident];
10873 aarch64_isa_flags = arch_isa;
10874 explicit_arch = selected_arch->arch;
10876 else
10878 /* Get default configure-time CPU. */
10879 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10880 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10883 if (selected_tune)
10884 explicit_tune_core = selected_tune->ident;
10886 /* If both -mcpu and -march are specified check that they are architecturally
10887 compatible, warn if they're not and prefer the -march ISA flags. */
10888 else if (selected_arch)
10890 if (selected_arch->arch != selected_cpu->arch)
10892 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10893 all_architectures[selected_cpu->arch].name,
10894 selected_arch->name);
10896 aarch64_isa_flags = arch_isa;
10897 explicit_arch = selected_arch->arch;
10898 explicit_tune_core = selected_tune ? selected_tune->ident
10899 : selected_cpu->ident;
10901 else
10903 /* -mcpu but no -march. */
10904 aarch64_isa_flags = cpu_isa;
10905 explicit_tune_core = selected_tune ? selected_tune->ident
10906 : selected_cpu->ident;
10907 gcc_assert (selected_cpu);
10908 selected_arch = &all_architectures[selected_cpu->arch];
10909 explicit_arch = selected_arch->arch;
10912 /* Set the arch as well, as we will need it when outputting
10913 the .arch directive in assembly. */
10914 if (!selected_arch)
10916 gcc_assert (selected_cpu);
10917 selected_arch = &all_architectures[selected_cpu->arch];
10920 if (!selected_tune)
10921 selected_tune = selected_cpu;
10923 #ifndef HAVE_AS_MABI_OPTION
10924 /* The compiler may have been configured with 2.23.* binutils, which does
10925 not have support for ILP32. */
10926 if (TARGET_ILP32)
10927 error ("assembler does not support -mabi=ilp32");
10928 #endif
10930 /* Convert -msve-vector-bits to a VG count. */
10931 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10933 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10934 sorry ("return address signing is only supported for -mabi=lp64");
10936 /* Make sure we properly set up the explicit options. */
10937 if ((aarch64_cpu_string && valid_cpu)
10938 || (aarch64_tune_string && valid_tune))
10939 gcc_assert (explicit_tune_core != aarch64_none);
10941 if ((aarch64_cpu_string && valid_cpu)
10942 || (aarch64_arch_string && valid_arch))
10943 gcc_assert (explicit_arch != aarch64_no_arch);
10945 aarch64_override_options_internal (&global_options);
10947 /* Save these options as the default ones in case we push and pop them later
10948 while processing functions with potential target attributes. */
10949 target_option_default_node = target_option_current_node
10950 = build_target_option_node (&global_options);
10953 /* Implement targetm.override_options_after_change. */
10955 static void
10956 aarch64_override_options_after_change (void)
10958 aarch64_override_options_after_change_1 (&global_options);
10961 static struct machine_function *
10962 aarch64_init_machine_status (void)
10964 struct machine_function *machine;
10965 machine = ggc_cleared_alloc<machine_function> ();
10966 return machine;
10969 void
10970 aarch64_init_expanders (void)
10972 init_machine_status = aarch64_init_machine_status;
10975 /* Work out which code model to use, adjusting for PIC where necessary. */
10976 static void
10977 initialize_aarch64_code_model (struct gcc_options *opts)
10979 if (opts->x_flag_pic)
10981 switch (opts->x_aarch64_cmodel_var)
10983 case AARCH64_CMODEL_TINY:
10984 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10985 break;
10986 case AARCH64_CMODEL_SMALL:
10987 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10988 aarch64_cmodel = (flag_pic == 2
10989 ? AARCH64_CMODEL_SMALL_PIC
10990 : AARCH64_CMODEL_SMALL_SPIC);
10991 #else
10992 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10993 #endif
10994 break;
10995 case AARCH64_CMODEL_LARGE:
10996 sorry ("code model %qs with -f%s", "large",
10997 opts->x_flag_pic > 1 ? "PIC" : "pic");
10998 break;
10999 default:
11000 gcc_unreachable ();
11003 else
11004 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11007 /* Implement TARGET_OPTION_SAVE. */
11009 static void
11010 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11012 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11015 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11016 using the information saved in PTR. */
11018 static void
11019 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11021 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11022 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11023 opts->x_explicit_arch = ptr->x_explicit_arch;
11024 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11025 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11027 aarch64_override_options_internal (opts);
11030 /* Implement TARGET_OPTION_PRINT. */
11032 static void
11033 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11035 const struct processor *cpu
11036 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11037 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11038 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11039 std::string extension
11040 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11042 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11043 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11044 arch->name, extension.c_str ());
11047 static GTY(()) tree aarch64_previous_fndecl;
11049 void
11050 aarch64_reset_previous_fndecl (void)
11052 aarch64_previous_fndecl = NULL;
11055 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11056 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11057 make sure optab availability predicates are recomputed when necessary. */
11059 void
11060 aarch64_save_restore_target_globals (tree new_tree)
11062 if (TREE_TARGET_GLOBALS (new_tree))
11063 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11064 else if (new_tree == target_option_default_node)
11065 restore_target_globals (&default_target_globals);
11066 else
11067 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11070 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11071 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11072 of the function, if such exists. This function may be called multiple
11073 times on a single function so use aarch64_previous_fndecl to avoid
11074 setting up identical state. */
11076 static void
11077 aarch64_set_current_function (tree fndecl)
11079 if (!fndecl || fndecl == aarch64_previous_fndecl)
11080 return;
11082 tree old_tree = (aarch64_previous_fndecl
11083 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11084 : NULL_TREE);
11086 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11088 /* If current function has no attributes but the previous one did,
11089 use the default node. */
11090 if (!new_tree && old_tree)
11091 new_tree = target_option_default_node;
11093 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11094 the default have been handled by aarch64_save_restore_target_globals from
11095 aarch64_pragma_target_parse. */
11096 if (old_tree == new_tree)
11097 return;
11099 aarch64_previous_fndecl = fndecl;
11101 /* First set the target options. */
11102 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11104 aarch64_save_restore_target_globals (new_tree);
11107 /* Enum describing the various ways we can handle attributes.
11108 In many cases we can reuse the generic option handling machinery. */
11110 enum aarch64_attr_opt_type
11112 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11113 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11114 aarch64_attr_enum, /* Attribute sets an enum variable. */
11115 aarch64_attr_custom /* Attribute requires a custom handling function. */
11118 /* All the information needed to handle a target attribute.
11119 NAME is the name of the attribute.
11120 ATTR_TYPE specifies the type of behavior of the attribute as described
11121 in the definition of enum aarch64_attr_opt_type.
11122 ALLOW_NEG is true if the attribute supports a "no-" form.
11123 HANDLER is the function that takes the attribute string as an argument.
11124 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11125 OPT_NUM is the enum specifying the option that the attribute modifies.
11126 This is needed for attributes that mirror the behavior of a command-line
11127 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11128 aarch64_attr_enum. */
11130 struct aarch64_attribute_info
11132 const char *name;
11133 enum aarch64_attr_opt_type attr_type;
11134 bool allow_neg;
11135 bool (*handler) (const char *);
11136 enum opt_code opt_num;
11139 /* Handle the ARCH_STR argument to the arch= target attribute. */
11141 static bool
11142 aarch64_handle_attr_arch (const char *str)
11144 const struct processor *tmp_arch = NULL;
11145 enum aarch64_parse_opt_result parse_res
11146 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11148 if (parse_res == AARCH64_PARSE_OK)
11150 gcc_assert (tmp_arch);
11151 selected_arch = tmp_arch;
11152 explicit_arch = selected_arch->arch;
11153 return true;
11156 switch (parse_res)
11158 case AARCH64_PARSE_MISSING_ARG:
11159 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11160 break;
11161 case AARCH64_PARSE_INVALID_ARG:
11162 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11163 aarch64_print_hint_for_arch (str);
11164 break;
11165 case AARCH64_PARSE_INVALID_FEATURE:
11166 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11167 break;
11168 default:
11169 gcc_unreachable ();
11172 return false;
11175 /* Handle the argument CPU_STR to the cpu= target attribute. */
11177 static bool
11178 aarch64_handle_attr_cpu (const char *str)
11180 const struct processor *tmp_cpu = NULL;
11181 enum aarch64_parse_opt_result parse_res
11182 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11184 if (parse_res == AARCH64_PARSE_OK)
11186 gcc_assert (tmp_cpu);
11187 selected_tune = tmp_cpu;
11188 explicit_tune_core = selected_tune->ident;
11190 selected_arch = &all_architectures[tmp_cpu->arch];
11191 explicit_arch = selected_arch->arch;
11192 return true;
11195 switch (parse_res)
11197 case AARCH64_PARSE_MISSING_ARG:
11198 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11199 break;
11200 case AARCH64_PARSE_INVALID_ARG:
11201 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11202 aarch64_print_hint_for_core (str);
11203 break;
11204 case AARCH64_PARSE_INVALID_FEATURE:
11205 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11206 break;
11207 default:
11208 gcc_unreachable ();
11211 return false;
11214 /* Handle the argument STR to the tune= target attribute. */
11216 static bool
11217 aarch64_handle_attr_tune (const char *str)
11219 const struct processor *tmp_tune = NULL;
11220 enum aarch64_parse_opt_result parse_res
11221 = aarch64_parse_tune (str, &tmp_tune);
11223 if (parse_res == AARCH64_PARSE_OK)
11225 gcc_assert (tmp_tune);
11226 selected_tune = tmp_tune;
11227 explicit_tune_core = selected_tune->ident;
11228 return true;
11231 switch (parse_res)
11233 case AARCH64_PARSE_INVALID_ARG:
11234 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11235 aarch64_print_hint_for_core (str);
11236 break;
11237 default:
11238 gcc_unreachable ();
11241 return false;
11244 /* Parse an architecture extensions target attribute string specified in STR.
11245 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11246 if successful. Update aarch64_isa_flags to reflect the ISA features
11247 modified. */
11249 static bool
11250 aarch64_handle_attr_isa_flags (char *str)
11252 enum aarch64_parse_opt_result parse_res;
11253 unsigned long isa_flags = aarch64_isa_flags;
11255 /* We allow "+nothing" in the beginning to clear out all architectural
11256 features if the user wants to handpick specific features. */
11257 if (strncmp ("+nothing", str, 8) == 0)
11259 isa_flags = 0;
11260 str += 8;
11263 parse_res = aarch64_parse_extension (str, &isa_flags);
11265 if (parse_res == AARCH64_PARSE_OK)
11267 aarch64_isa_flags = isa_flags;
11268 return true;
11271 switch (parse_res)
11273 case AARCH64_PARSE_MISSING_ARG:
11274 error ("missing value in %<target()%> pragma or attribute");
11275 break;
11277 case AARCH64_PARSE_INVALID_FEATURE:
11278 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11279 break;
11281 default:
11282 gcc_unreachable ();
11285 return false;
11288 /* The target attributes that we support. On top of these we also support just
11289 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11290 handled explicitly in aarch64_process_one_target_attr. */
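/* A sketch of how these are typically spelled in source, using attribute
   names from the table below (the arch, cmodel and extension strings are
   only examples):

     __attribute__ ((target ("arch=armv8-a+crc")))
     __attribute__ ((target ("no-omit-leaf-frame-pointer,cmodel=small")))
     __attribute__ ((target ("+fp+nosimd")))

   The last form is the bare ISA-extension case mentioned above.  */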
11292 static const struct aarch64_attribute_info aarch64_attributes[] =
11294 { "general-regs-only", aarch64_attr_mask, false, NULL,
11295 OPT_mgeneral_regs_only },
11296 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11297 OPT_mfix_cortex_a53_835769 },
11298 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11299 OPT_mfix_cortex_a53_843419 },
11300 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11301 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11302 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11303 OPT_momit_leaf_frame_pointer },
11304 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11305 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11306 OPT_march_ },
11307 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11308 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11309 OPT_mtune_ },
11310 { "sign-return-address", aarch64_attr_enum, false, NULL,
11311 OPT_msign_return_address_ },
11312 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11315 /* Parse ARG_STR which contains the definition of one target attribute.
11316 Show appropriate errors if any or return true if the attribute is valid. */
11318 static bool
11319 aarch64_process_one_target_attr (char *arg_str)
11321 bool invert = false;
11323 size_t len = strlen (arg_str);
11325 if (len == 0)
11327 error ("malformed %<target()%> pragma or attribute");
11328 return false;
11331 char *str_to_check = (char *) alloca (len + 1);
11332 strcpy (str_to_check, arg_str);
11334 /* Skip leading whitespace. */
11335 while (*str_to_check == ' ' || *str_to_check == '\t')
11336 str_to_check++;
11338 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11339 It is easier to detect and handle it explicitly here rather than going
11340 through the machinery for the rest of the target attributes in this
11341 function. */
11342 if (*str_to_check == '+')
11343 return aarch64_handle_attr_isa_flags (str_to_check);
11345 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11347 invert = true;
11348 str_to_check += 3;
11350 char *arg = strchr (str_to_check, '=');
11352 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11353 and point ARG to "foo". */
11354 if (arg)
11356 *arg = '\0';
11357 arg++;
11359 const struct aarch64_attribute_info *p_attr;
11360 bool found = false;
11361 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11363 /* If the names don't match up, or the user has given an argument
11364 to an attribute that doesn't accept one, or didn't give an argument
11365 to an attribute that expects one, fail to match. */
11366 if (strcmp (str_to_check, p_attr->name) != 0)
11367 continue;
11369 found = true;
11370 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11371 || p_attr->attr_type == aarch64_attr_enum;
11373 if (attr_need_arg_p ^ (arg != NULL))
11375 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11376 return false;
11379 /* If the name matches but the attribute does not allow "no-" versions
11380 then we can't match. */
11381 if (invert && !p_attr->allow_neg)
11383 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11384 return false;
11387 switch (p_attr->attr_type)
11389 /* Has a custom handler registered.
11390 For example, cpu=, arch=, tune=. */
11391 case aarch64_attr_custom:
11392 gcc_assert (p_attr->handler);
11393 if (!p_attr->handler (arg))
11394 return false;
11395 break;
11397 /* Either set or unset a boolean option. */
11398 case aarch64_attr_bool:
11400 struct cl_decoded_option decoded;
11402 generate_option (p_attr->opt_num, NULL, !invert,
11403 CL_TARGET, &decoded);
11404 aarch64_handle_option (&global_options, &global_options_set,
11405 &decoded, input_location);
11406 break;
11408 /* Set or unset a bit in the target_flags. aarch64_handle_option
11409 should know what mask to apply given the option number. */
11410 case aarch64_attr_mask:
11412 struct cl_decoded_option decoded;
11413 /* We only need to specify the option number.
11414 aarch64_handle_option will know which mask to apply. */
11415 decoded.opt_index = p_attr->opt_num;
11416 decoded.value = !invert;
11417 aarch64_handle_option (&global_options, &global_options_set,
11418 &decoded, input_location);
11419 break;
11421 /* Use the option setting machinery to set an option to an enum. */
11422 case aarch64_attr_enum:
11424 gcc_assert (arg);
11425 bool valid;
11426 int value;
11427 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11428 &value, CL_TARGET);
11429 if (valid)
11431 set_option (&global_options, NULL, p_attr->opt_num, value,
11432 NULL, DK_UNSPECIFIED, input_location,
11433 global_dc);
11435 else
11437 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11439 break;
11441 default:
11442 gcc_unreachable ();
11446 /* If we reached here we either have found an attribute and validated
11447 it or didn't match any. If we matched an attribute but its arguments
11448 were malformed we will have returned false already. */
11449 return found;
11452 /* Count how many times the character C appears in
11453 NULL-terminated string STR. */
11455 static unsigned int
11456 num_occurences_in_str (char c, char *str)
11458 unsigned int res = 0;
11459 while (*str != '\0')
11461 if (*str == c)
11462 res++;
11464 str++;
11467 return res;
11470 /* Parse the tree in ARGS that contains the target attribute information
11471 and update the global target options space. */
11473 bool
11474 aarch64_process_target_attr (tree args)
11476 if (TREE_CODE (args) == TREE_LIST)
11480 tree head = TREE_VALUE (args);
11481 if (head)
11483 if (!aarch64_process_target_attr (head))
11484 return false;
11486 args = TREE_CHAIN (args);
11487 } while (args);
11489 return true;
11492 if (TREE_CODE (args) != STRING_CST)
11494 error ("attribute %<target%> argument not a string");
11495 return false;
11498 size_t len = strlen (TREE_STRING_POINTER (args));
11499 char *str_to_check = (char *) alloca (len + 1);
11500 strcpy (str_to_check, TREE_STRING_POINTER (args));
11502 if (len == 0)
11504 error ("malformed %<target()%> pragma or attribute");
11505 return false;
11508 /* Used to catch empty entries between commas, e.g.
11509 attribute ((target ("attr1,,attr2"))). */
11510 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11512 /* Handle multiple target attributes separated by ','. */
11513 char *token = strtok (str_to_check, ",");
11515 unsigned int num_attrs = 0;
11516 while (token)
11518 num_attrs++;
11519 if (!aarch64_process_one_target_attr (token))
11521 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11522 return false;
11525 token = strtok (NULL, ",");
11528 if (num_attrs != num_commas + 1)
11530 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11531 return false;
11534 return true;
11537 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11538 process attribute ((target ("..."))). */
11540 static bool
11541 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11543 struct cl_target_option cur_target;
11544 bool ret;
11545 tree old_optimize;
11546 tree new_target, new_optimize;
11547 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11549 /* If what we're processing is the current pragma string then the
11550 target option node is already stored in target_option_current_node
11551 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11552 having to re-parse the string. This is especially useful to keep
11553 arm_neon.h compile times down since that header contains a lot
11554 of intrinsics enclosed in pragmas. */
11555 if (!existing_target && args == current_target_pragma)
11557 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11558 return true;
11560 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11562 old_optimize = build_optimization_node (&global_options);
11563 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11565 /* If the function changed the optimization levels as well as setting
11566 target options, start with the optimizations specified. */
11567 if (func_optimize && func_optimize != old_optimize)
11568 cl_optimization_restore (&global_options,
11569 TREE_OPTIMIZATION (func_optimize));
11571 /* Save the current target options to restore at the end. */
11572 cl_target_option_save (&cur_target, &global_options);
11574 /* If fndecl already has some target attributes applied to it, unpack
11575 them so that we add this attribute on top of them, rather than
11576 overwriting them. */
11577 if (existing_target)
11579 struct cl_target_option *existing_options
11580 = TREE_TARGET_OPTION (existing_target);
11582 if (existing_options)
11583 cl_target_option_restore (&global_options, existing_options);
11585 else
11586 cl_target_option_restore (&global_options,
11587 TREE_TARGET_OPTION (target_option_current_node));
11589 ret = aarch64_process_target_attr (args);
11591 /* Set up any additional state. */
11592 if (ret)
11594 aarch64_override_options_internal (&global_options);
11595 /* Initialize SIMD builtins if we haven't already.
11596 Set current_target_pragma to NULL for the duration so that
11597 the builtin initialization code doesn't try to tag the functions
11598 being built with the attributes specified by any current pragma, thus
11599 going into an infinite recursion. */
11600 if (TARGET_SIMD)
11602 tree saved_current_target_pragma = current_target_pragma;
11603 current_target_pragma = NULL;
11604 aarch64_init_simd_builtins ();
11605 current_target_pragma = saved_current_target_pragma;
11607 new_target = build_target_option_node (&global_options);
11609 else
11610 new_target = NULL;
11612 new_optimize = build_optimization_node (&global_options);
11614 if (fndecl && ret)
11616 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11618 if (old_optimize != new_optimize)
11619 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11622 cl_target_option_restore (&global_options, &cur_target);
11624 if (old_optimize != new_optimize)
11625 cl_optimization_restore (&global_options,
11626 TREE_OPTIMIZATION (old_optimize));
11627 return ret;
11630 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11631 tri-bool options (yes, no, don't care) and the default value is
11632 DEF, determine whether to reject inlining. */
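/* With the tri-bool encoding used by the callers below (0 = off,
   1 = on, 2 = unspecified/don't care), this means for example that a
   callee which leaves an option unspecified can always be inlined,
   while a callee that explicitly enables a workaround can only be
   inlined into a caller that also enables it (or when enabling it is
   the default).  */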
11634 static bool
11635 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11636 int dont_care, int def)
11638 /* If the callee doesn't care, always allow inlining. */
11639 if (callee == dont_care)
11640 return true;
11642 /* If the caller doesn't care, always allow inlining. */
11643 if (caller == dont_care)
11644 return true;
11646 /* Otherwise, allow inlining if either the callee and caller values
11647 agree, or if the callee is using the default value. */
11648 return (callee == caller || callee == def);
11651 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11652 to inline CALLEE into CALLER based on target-specific info.
11653 Make sure that the caller and callee have compatible architectural
11654 features. Then go through the other possible target attributes
11655 and see if they can block inlining. Try not to reject always_inline
11656 callees unless they are incompatible architecturally. */
11658 static bool
11659 aarch64_can_inline_p (tree caller, tree callee)
11661 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11662 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11664 /* If callee has no option attributes, then it is ok to inline. */
11665 if (!callee_tree)
11666 return true;
11668 struct cl_target_option *caller_opts
11669 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11670 : target_option_default_node);
11672 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11675 /* Callee's ISA flags should be a subset of the caller's. */
11676 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11677 != callee_opts->x_aarch64_isa_flags)
11678 return false;
11680 /* Allow non-strict-aligned functions to be inlined into
11681 strict-aligned ones, but not the other way around. */
11682 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11683 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11684 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11685 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11686 return false;
11688 bool always_inline = lookup_attribute ("always_inline",
11689 DECL_ATTRIBUTES (callee));
11691 /* If the architectural features match up and the callee is always_inline
11692 then the other attributes don't matter. */
11693 if (always_inline)
11694 return true;
11696 if (caller_opts->x_aarch64_cmodel_var
11697 != callee_opts->x_aarch64_cmodel_var)
11698 return false;
11700 if (caller_opts->x_aarch64_tls_dialect
11701 != callee_opts->x_aarch64_tls_dialect)
11702 return false;
11704 /* Honour explicit requests to workaround errata. */
11705 if (!aarch64_tribools_ok_for_inlining_p (
11706 caller_opts->x_aarch64_fix_a53_err835769,
11707 callee_opts->x_aarch64_fix_a53_err835769,
11708 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11709 return false;
11711 if (!aarch64_tribools_ok_for_inlining_p (
11712 caller_opts->x_aarch64_fix_a53_err843419,
11713 callee_opts->x_aarch64_fix_a53_err843419,
11714 2, TARGET_FIX_ERR_A53_843419))
11715 return false;
11717 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11718 caller and callee and they don't match up, reject inlining. */
11719 if (!aarch64_tribools_ok_for_inlining_p (
11720 caller_opts->x_flag_omit_leaf_frame_pointer,
11721 callee_opts->x_flag_omit_leaf_frame_pointer,
11722 2, 1))
11723 return false;
11725 /* If the callee has specific tuning overrides, respect them. */
11726 if (callee_opts->x_aarch64_override_tune_string != NULL
11727 && caller_opts->x_aarch64_override_tune_string == NULL)
11728 return false;
11730 /* If the user specified tuning override strings for the
11731 caller and callee and they don't match up, reject inlining.
11732 We just do a string compare here, we don't analyze the meaning
11733 of the string, as it would be too costly for little gain. */
11734 if (callee_opts->x_aarch64_override_tune_string
11735 && caller_opts->x_aarch64_override_tune_string
11736 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11737 caller_opts->x_aarch64_override_tune_string) != 0))
11738 return false;
11740 return true;
11743 /* Return true if SYMBOL_REF X binds locally. */
11745 static bool
11746 aarch64_symbol_binds_local_p (const_rtx x)
11748 return (SYMBOL_REF_DECL (x)
11749 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11750 : SYMBOL_REF_LOCAL_P (x));
11753 /* Return true if SYMBOL_REF X is thread local */
11754 static bool
11755 aarch64_tls_symbol_p (rtx x)
11757 if (! TARGET_HAVE_TLS)
11758 return false;
11760 if (GET_CODE (x) != SYMBOL_REF)
11761 return false;
11763 return SYMBOL_REF_TLS_MODEL (x) != 0;
11766 /* Classify a TLS symbol into one of the TLS kinds. */
11767 enum aarch64_symbol_type
11768 aarch64_classify_tls_symbol (rtx x)
11770 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11772 switch (tls_kind)
11774 case TLS_MODEL_GLOBAL_DYNAMIC:
11775 case TLS_MODEL_LOCAL_DYNAMIC:
11776 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11778 case TLS_MODEL_INITIAL_EXEC:
11779 switch (aarch64_cmodel)
11781 case AARCH64_CMODEL_TINY:
11782 case AARCH64_CMODEL_TINY_PIC:
11783 return SYMBOL_TINY_TLSIE;
11784 default:
11785 return SYMBOL_SMALL_TLSIE;
11788 case TLS_MODEL_LOCAL_EXEC:
11789 if (aarch64_tls_size == 12)
11790 return SYMBOL_TLSLE12;
11791 else if (aarch64_tls_size == 24)
11792 return SYMBOL_TLSLE24;
11793 else if (aarch64_tls_size == 32)
11794 return SYMBOL_TLSLE32;
11795 else if (aarch64_tls_size == 48)
11796 return SYMBOL_TLSLE48;
11797 else
11798 gcc_unreachable ();
11800 case TLS_MODEL_EMULATED:
11801 case TLS_MODEL_NONE:
11802 return SYMBOL_FORCE_TO_MEM;
11804 default:
11805 gcc_unreachable ();
11809 /* Return the correct method for accessing X + OFFSET, where X is either
11810 a SYMBOL_REF or LABEL_REF. */
11812 enum aarch64_symbol_type
11813 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11815 if (GET_CODE (x) == LABEL_REF)
11817 switch (aarch64_cmodel)
11819 case AARCH64_CMODEL_LARGE:
11820 return SYMBOL_FORCE_TO_MEM;
11822 case AARCH64_CMODEL_TINY_PIC:
11823 case AARCH64_CMODEL_TINY:
11824 return SYMBOL_TINY_ABSOLUTE;
11826 case AARCH64_CMODEL_SMALL_SPIC:
11827 case AARCH64_CMODEL_SMALL_PIC:
11828 case AARCH64_CMODEL_SMALL:
11829 return SYMBOL_SMALL_ABSOLUTE;
11831 default:
11832 gcc_unreachable ();
11836 if (GET_CODE (x) == SYMBOL_REF)
11838 if (aarch64_tls_symbol_p (x))
11839 return aarch64_classify_tls_symbol (x);
11841 switch (aarch64_cmodel)
11843 case AARCH64_CMODEL_TINY:
11844 /* When we retrieve symbol + offset address, we have to make sure
11845 the offset does not cause overflow of the final address. But
11846 we have no way of knowing the address of the symbol at compile time
11847 so we can't accurately say if the distance between the PC and
11848 symbol + offset is outside the addressable range of +/-1M in the
11849 TINY code model. So we rely on images not being greater than
11850 1M, cap the offset at 1M, and require anything beyond 1M to
11851 be loaded using an alternative mechanism. Furthermore if the
11852 symbol is a weak reference to something that isn't known to
11853 resolve to a symbol in this module, then force to memory. */
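/* For example, under -mcmodel=tiny a reference such as sym + 0x80000
   stays within the 1M cap and is classified as SYMBOL_TINY_ABSOLUTE,
   whereas sym + 0x200000 exceeds it and is forced to memory, as is a
   weak symbol that might resolve outside this module.  */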
11854 if ((SYMBOL_REF_WEAK (x)
11855 && !aarch64_symbol_binds_local_p (x))
11856 || !IN_RANGE (offset, -1048575, 1048575))
11857 return SYMBOL_FORCE_TO_MEM;
11858 return SYMBOL_TINY_ABSOLUTE;
11860 case AARCH64_CMODEL_SMALL:
11861 /* Same reasoning as the tiny code model, but the offset cap here is
11862 4G. */
11863 if ((SYMBOL_REF_WEAK (x)
11864 && !aarch64_symbol_binds_local_p (x))
11865 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11866 HOST_WIDE_INT_C (4294967264)))
11867 return SYMBOL_FORCE_TO_MEM;
11868 return SYMBOL_SMALL_ABSOLUTE;
11870 case AARCH64_CMODEL_TINY_PIC:
11871 if (!aarch64_symbol_binds_local_p (x))
11872 return SYMBOL_TINY_GOT;
11873 return SYMBOL_TINY_ABSOLUTE;
11875 case AARCH64_CMODEL_SMALL_SPIC:
11876 case AARCH64_CMODEL_SMALL_PIC:
11877 if (!aarch64_symbol_binds_local_p (x))
11878 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11879 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11880 return SYMBOL_SMALL_ABSOLUTE;
11882 case AARCH64_CMODEL_LARGE:
11883 /* This is alright even in PIC code as the constant
11884 pool reference is always PC relative and within
11885 the same translation unit. */
11886 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11887 return SYMBOL_SMALL_ABSOLUTE;
11888 else
11889 return SYMBOL_FORCE_TO_MEM;
11891 default:
11892 gcc_unreachable ();
11896 /* By default push everything into the constant pool. */
11897 return SYMBOL_FORCE_TO_MEM;
11900 bool
11901 aarch64_constant_address_p (rtx x)
11903 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11906 bool
11907 aarch64_legitimate_pic_operand_p (rtx x)
11909 if (GET_CODE (x) == SYMBOL_REF
11910 || (GET_CODE (x) == CONST
11911 && GET_CODE (XEXP (x, 0)) == PLUS
11912 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11913 return false;
11915 return true;
11918 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11919 that should be rematerialized rather than spilled. */
11921 static bool
11922 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11924 /* Support CSE and rematerialization of common constants. */
11925 if (CONST_INT_P (x)
11926 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11927 || GET_CODE (x) == CONST_VECTOR)
11928 return true;
11930 /* Do not allow vector struct mode constants for Advanced SIMD.
11931 We could support 0 and -1 easily, but they need support in
11932 aarch64-simd.md. */
11933 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11934 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11935 return false;
11937 /* Only accept variable-length vector constants if they can be
11938 handled directly.
11940 ??? It would be possible to handle rematerialization of other
11941 constants via secondary reloads. */
11942 if (vec_flags & VEC_ANY_SVE)
11943 return aarch64_simd_valid_immediate (x, NULL);
11945 if (GET_CODE (x) == HIGH)
11946 x = XEXP (x, 0);
11948 /* Accept polynomial constants that can be calculated by using the
11949 destination of a move as the sole temporary. Constants that
11950 require a second temporary cannot be rematerialized (they can't be
11951 forced to memory and also aren't legitimate constants). */
11952 poly_int64 offset;
11953 if (poly_int_rtx_p (x, &offset))
11954 return aarch64_offset_temporaries (false, offset) <= 1;
11956 /* If an offset is being added to something else, we need to allow the
11957 base to be moved into the destination register, meaning that there
11958 are no free temporaries for the offset. */
11959 x = strip_offset (x, &offset);
11960 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11961 return false;
11963 /* Do not allow const (plus (anchor_symbol, const_int)). */
11964 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11965 return false;
11967 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11968 so spilling them is better than rematerialization. */
11969 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11970 return true;
11972 /* Label references are always constant. */
11973 if (GET_CODE (x) == LABEL_REF)
11974 return true;
11976 return false;
static rtx
11980 aarch64_load_tp (rtx target)
11982 if (!target
11983 || GET_MODE (target) != Pmode
11984 || !register_operand (target, Pmode))
11985 target = gen_reg_rtx (Pmode);
11987 /* Can return in any reg. */
11988 emit_insn (gen_aarch64_load_tp_hard (target));
11989 return target;
11992 /* On AAPCS systems, this is the "struct __va_list". */
11993 static GTY(()) tree va_list_type;
11995 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11996 Return the type to use as __builtin_va_list.
11998 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12000 struct __va_list
12002 void *__stack;
12003 void *__gr_top;
12004 void *__vr_top;
12005 int __gr_offs;
12006 int __vr_offs;
12007 }; */
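/* Roughly, a va_arg of an integer-class type T is then resolved as
   (simplified sketch; alignment and big-endian adjustments omitted):

     int offs = ap->__gr_offs;
     if (offs >= 0)
       goto on_stack;                  // GP save area already exhausted
     ap->__gr_offs = offs + round_up (sizeof (T), 8);
     if (ap->__gr_offs > 0)
       goto on_stack;                  // T does not fit in the save area
     result = *(T *) (ap->__gr_top + offs);

   The __vr_* fields play the same role for FP/SIMD arguments, using
   16-byte slots.  aarch64_gimplify_va_arg_expr below builds the
   corresponding trees.  */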
12009 static tree
12010 aarch64_build_builtin_va_list (void)
12012 tree va_list_name;
12013 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12015 /* Create the type. */
12016 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12017 /* Give it the required name. */
12018 va_list_name = build_decl (BUILTINS_LOCATION,
12019 TYPE_DECL,
12020 get_identifier ("__va_list"),
12021 va_list_type);
12022 DECL_ARTIFICIAL (va_list_name) = 1;
12023 TYPE_NAME (va_list_type) = va_list_name;
12024 TYPE_STUB_DECL (va_list_type) = va_list_name;
12026 /* Create the fields. */
12027 f_stack = build_decl (BUILTINS_LOCATION,
12028 FIELD_DECL, get_identifier ("__stack"),
12029 ptr_type_node);
12030 f_grtop = build_decl (BUILTINS_LOCATION,
12031 FIELD_DECL, get_identifier ("__gr_top"),
12032 ptr_type_node);
12033 f_vrtop = build_decl (BUILTINS_LOCATION,
12034 FIELD_DECL, get_identifier ("__vr_top"),
12035 ptr_type_node);
12036 f_groff = build_decl (BUILTINS_LOCATION,
12037 FIELD_DECL, get_identifier ("__gr_offs"),
12038 integer_type_node);
12039 f_vroff = build_decl (BUILTINS_LOCATION,
12040 FIELD_DECL, get_identifier ("__vr_offs"),
12041 integer_type_node);
12043 /* Tell tree-stdarg pass about our internal offset fields.
12044 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12045 purposes, to identify whether the code is updating the va_list internal
12046 offset fields in an irregular way. */
12047 va_list_gpr_counter_field = f_groff;
12048 va_list_fpr_counter_field = f_vroff;
12050 DECL_ARTIFICIAL (f_stack) = 1;
12051 DECL_ARTIFICIAL (f_grtop) = 1;
12052 DECL_ARTIFICIAL (f_vrtop) = 1;
12053 DECL_ARTIFICIAL (f_groff) = 1;
12054 DECL_ARTIFICIAL (f_vroff) = 1;
12056 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12057 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12058 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12059 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12060 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12062 TYPE_FIELDS (va_list_type) = f_stack;
12063 DECL_CHAIN (f_stack) = f_grtop;
12064 DECL_CHAIN (f_grtop) = f_vrtop;
12065 DECL_CHAIN (f_vrtop) = f_groff;
12066 DECL_CHAIN (f_groff) = f_vroff;
12068 /* Compute its layout. */
12069 layout_type (va_list_type);
12071 return va_list_type;
12074 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12075 static void
12076 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12078 const CUMULATIVE_ARGS *cum;
12079 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12080 tree stack, grtop, vrtop, groff, vroff;
12081 tree t;
12082 int gr_save_area_size = cfun->va_list_gpr_size;
12083 int vr_save_area_size = cfun->va_list_fpr_size;
12084 int vr_offset;
12086 cum = &crtl->args.info;
12087 if (cfun->va_list_gpr_size)
12088 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12089 cfun->va_list_gpr_size);
12090 if (cfun->va_list_fpr_size)
12091 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12092 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12094 if (!TARGET_FLOAT)
12096 gcc_assert (cum->aapcs_nvrn == 0);
12097 vr_save_area_size = 0;
12100 f_stack = TYPE_FIELDS (va_list_type_node);
12101 f_grtop = DECL_CHAIN (f_stack);
12102 f_vrtop = DECL_CHAIN (f_grtop);
12103 f_groff = DECL_CHAIN (f_vrtop);
12104 f_vroff = DECL_CHAIN (f_groff);
12106 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12107 NULL_TREE);
12108 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12109 NULL_TREE);
12110 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12111 NULL_TREE);
12112 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12113 NULL_TREE);
12114 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12115 NULL_TREE);
12117 /* Emit code to initialize STACK, which points to the next varargs stack
12118 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12119 by named arguments. STACK is 8-byte aligned. */
12120 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12121 if (cum->aapcs_stack_size > 0)
12122 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12123 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12124 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12126 /* Emit code to initialize GRTOP, the top of the GR save area.
12127 virtual_incoming_args_rtx should have been 16 byte aligned. */
12128 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12129 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12130 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12132 /* Emit code to initialize VRTOP, the top of the VR save area.
12133 This address is gr_save_area_bytes below GRTOP, rounded
12134 down to the next 16-byte boundary. */
12135 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12136 vr_offset = ROUND_UP (gr_save_area_size,
12137 STACK_BOUNDARY / BITS_PER_UNIT);
12139 if (vr_offset)
12140 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12141 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12142 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12144 /* Emit code to initialize GROFF, the offset from GRTOP of the
12145 next GPR argument. */
12146 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12147 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12148 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12150 /* Likewise emit code to initialize VROFF, the offset from FTOP
12151 of the next VR argument. */
12152 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12153 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12154 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
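/* As a concrete illustration, for "void f (int n, ...)" with only the
   named argument passed in registers, va_start ends up with roughly:

     __stack   = address of the first stack-passed vararg
     __gr_top  = top of the 56-byte GP save area (x1..x7)
     __vr_top  = top of the 128-byte FP/SIMD save area (q0..q7)
     __gr_offs = -56
     __vr_offs = -128

   The exact sizes depend on how many argument registers the named
   parameters consumed, on TARGET_FLOAT and on the tree-stdarg limits
   in cfun->va_list_gpr_size/va_list_fpr_size.  */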
12157 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12159 static tree
12160 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12161 gimple_seq *post_p ATTRIBUTE_UNUSED)
12163 tree addr;
12164 bool indirect_p;
12165 bool is_ha; /* is HFA or HVA. */
12166 bool dw_align; /* double-word align. */
12167 machine_mode ag_mode = VOIDmode;
12168 int nregs;
12169 machine_mode mode;
12171 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12172 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12173 HOST_WIDE_INT size, rsize, adjust, align;
12174 tree t, u, cond1, cond2;
12176 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12177 if (indirect_p)
12178 type = build_pointer_type (type);
12180 mode = TYPE_MODE (type);
12182 f_stack = TYPE_FIELDS (va_list_type_node);
12183 f_grtop = DECL_CHAIN (f_stack);
12184 f_vrtop = DECL_CHAIN (f_grtop);
12185 f_groff = DECL_CHAIN (f_vrtop);
12186 f_vroff = DECL_CHAIN (f_groff);
12188 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12189 f_stack, NULL_TREE);
12190 size = int_size_in_bytes (type);
12191 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12193 dw_align = false;
12194 adjust = 0;
12195 if (aarch64_vfp_is_call_or_return_candidate (mode,
12196 type,
12197 &ag_mode,
12198 &nregs,
12199 &is_ha))
12201 /* No frontends can create types with variable-sized modes, so we
12202 shouldn't be asked to pass or return them. */
12203 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12205 /* TYPE passed in fp/simd registers. */
12206 if (!TARGET_FLOAT)
12207 aarch64_err_no_fpadvsimd (mode, "varargs");
12209 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12210 unshare_expr (valist), f_vrtop, NULL_TREE);
12211 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12212 unshare_expr (valist), f_vroff, NULL_TREE);
12214 rsize = nregs * UNITS_PER_VREG;
12216 if (is_ha)
12218 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12219 adjust = UNITS_PER_VREG - ag_size;
12221 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12222 && size < UNITS_PER_VREG)
12224 adjust = UNITS_PER_VREG - size;
12227 else
12229 /* TYPE passed in general registers. */
12230 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12231 unshare_expr (valist), f_grtop, NULL_TREE);
12232 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12233 unshare_expr (valist), f_groff, NULL_TREE);
12234 rsize = ROUND_UP (size, UNITS_PER_WORD);
12235 nregs = rsize / UNITS_PER_WORD;
12237 if (align > 8)
12238 dw_align = true;
12240 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12241 && size < UNITS_PER_WORD)
12243 adjust = UNITS_PER_WORD - size;
12247 /* Get a local temporary for the field value. */
12248 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12250 /* Emit code to branch if off >= 0. */
12251 t = build2 (GE_EXPR, boolean_type_node, off,
12252 build_int_cst (TREE_TYPE (off), 0));
12253 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12255 if (dw_align)
12257 /* Emit: offs = (offs + 15) & -16. */
12258 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12259 build_int_cst (TREE_TYPE (off), 15));
12260 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12261 build_int_cst (TREE_TYPE (off), -16));
12262 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12264 else
12265 roundup = NULL;
12267 /* Update ap.__[g|v]r_offs */
12268 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12269 build_int_cst (TREE_TYPE (off), rsize));
12270 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12272 /* String up. */
12273 if (roundup)
12274 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12276 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12277 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12278 build_int_cst (TREE_TYPE (f_off), 0));
12279 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12281 /* String up: make sure the assignment happens before the use. */
12282 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12283 COND_EXPR_ELSE (cond1) = t;
12285 /* Prepare the trees handling the argument that is passed on the stack;
12286 the top-level node will be stored in ON_STACK. */
12287 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12288 if (align > 8)
12290 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12291 t = fold_build_pointer_plus_hwi (arg, 15);
12292 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12293 build_int_cst (TREE_TYPE (t), -16));
12294 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12296 else
12297 roundup = NULL;
12298 /* Advance ap.__stack */
12299 t = fold_build_pointer_plus_hwi (arg, size + 7);
12300 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12301 build_int_cst (TREE_TYPE (t), -8));
12302 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12303 /* String up roundup and advance. */
12304 if (roundup)
12305 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12306 /* String up with arg */
12307 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12308 /* Big-endianness related address adjustment. */
12309 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12310 && size < UNITS_PER_WORD)
12312 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12313 size_int (UNITS_PER_WORD - size));
12314 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12317 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12318 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12320 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12321 t = off;
12322 if (adjust)
12323 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12324 build_int_cst (TREE_TYPE (off), adjust));
12326 t = fold_convert (sizetype, t);
12327 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12329 if (is_ha)
12331 /* type ha; // treat as "struct {ftype field[n];}"
12332 ... [computing offs]
12333 for (i = 0; i < nregs; ++i, offs += 16)
12334 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12335 return ha; */
12336 int i;
12337 tree tmp_ha, field_t, field_ptr_t;
12339 /* Declare a local variable. */
12340 tmp_ha = create_tmp_var_raw (type, "ha");
12341 gimple_add_tmp_var (tmp_ha);
12343 /* Establish the base type. */
12344 switch (ag_mode)
12346 case E_SFmode:
12347 field_t = float_type_node;
12348 field_ptr_t = float_ptr_type_node;
12349 break;
12350 case E_DFmode:
12351 field_t = double_type_node;
12352 field_ptr_t = double_ptr_type_node;
12353 break;
12354 case E_TFmode:
12355 field_t = long_double_type_node;
12356 field_ptr_t = long_double_ptr_type_node;
12357 break;
12358 case E_HFmode:
12359 field_t = aarch64_fp16_type_node;
12360 field_ptr_t = aarch64_fp16_ptr_type_node;
12361 break;
12362 case E_V2SImode:
12363 case E_V4SImode:
12365 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12366 field_t = build_vector_type_for_mode (innertype, ag_mode);
12367 field_ptr_t = build_pointer_type (field_t);
12369 break;
12370 default:
12371 gcc_assert (0);
12374 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12375 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12376 addr = t;
12377 t = fold_convert (field_ptr_t, addr);
12378 t = build2 (MODIFY_EXPR, field_t,
12379 build1 (INDIRECT_REF, field_t, tmp_ha),
12380 build1 (INDIRECT_REF, field_t, t));
12382 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12383 for (i = 1; i < nregs; ++i)
12385 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12386 u = fold_convert (field_ptr_t, addr);
12387 u = build2 (MODIFY_EXPR, field_t,
12388 build2 (MEM_REF, field_t, tmp_ha,
12389 build_int_cst (field_ptr_t,
12390 (i *
12391 int_size_in_bytes (field_t)))),
12392 build1 (INDIRECT_REF, field_t, u));
12393 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12396 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12397 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12400 COND_EXPR_ELSE (cond2) = t;
12401 addr = fold_convert (build_pointer_type (type), cond1);
12402 addr = build_va_arg_indirect_ref (addr);
12404 if (indirect_p)
12405 addr = build_va_arg_indirect_ref (addr);
12407 return addr;
12410 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12412 static void
12413 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12414 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12415 int no_rtl)
12417 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12418 CUMULATIVE_ARGS local_cum;
12419 int gr_saved = cfun->va_list_gpr_size;
12420 int vr_saved = cfun->va_list_fpr_size;
12422 /* The caller has advanced CUM up to, but not beyond, the last named
12423 argument. Advance a local copy of CUM past the last "real" named
12424 argument, to find out how many registers are left over. */
12425 local_cum = *cum;
12426 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12428 /* Find out how many registers we need to save.
12429 Honor tree-stdarg analysis results. */
12430 if (cfun->va_list_gpr_size)
12431 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12432 cfun->va_list_gpr_size / UNITS_PER_WORD);
12433 if (cfun->va_list_fpr_size)
12434 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12435 cfun->va_list_fpr_size / UNITS_PER_VREG);
12437 if (!TARGET_FLOAT)
12439 gcc_assert (local_cum.aapcs_nvrn == 0);
12440 vr_saved = 0;
12443 if (!no_rtl)
12445 if (gr_saved > 0)
12447 rtx ptr, mem;
12449 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12450 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12451 - gr_saved * UNITS_PER_WORD);
12452 mem = gen_frame_mem (BLKmode, ptr);
12453 set_mem_alias_set (mem, get_varargs_alias_set ());
12455 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12456 mem, gr_saved);
12458 if (vr_saved > 0)
12460 /* We can't use move_block_from_reg, because it will use
12461 the wrong mode, storing D regs only. */
12462 machine_mode mode = TImode;
12463 int off, i, vr_start;
12465 /* Set OFF to the offset from virtual_incoming_args_rtx of
12466 the first vector register. The VR save area lies below
12467 the GR one, and is aligned to 16 bytes. */
12468 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12469 STACK_BOUNDARY / BITS_PER_UNIT);
12470 off -= vr_saved * UNITS_PER_VREG;
12472 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12473 for (i = 0; i < vr_saved; ++i)
12475 rtx ptr, mem;
12477 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12478 mem = gen_frame_mem (mode, ptr);
12479 set_mem_alias_set (mem, get_varargs_alias_set ());
12480 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12481 off += UNITS_PER_VREG;
12486 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12487 any complication of having crtl->args.pretend_args_size changed. */
12488 cfun->machine->frame.saved_varargs_size
12489 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12490 STACK_BOUNDARY / BITS_PER_UNIT)
12491 + vr_saved * UNITS_PER_VREG);
12494 static void
12495 aarch64_conditional_register_usage (void)
12497 int i;
12498 if (!TARGET_FLOAT)
12500 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12502 fixed_regs[i] = 1;
12503 call_used_regs[i] = 1;
12506 if (!TARGET_SVE)
12507 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12509 fixed_regs[i] = 1;
12510 call_used_regs[i] = 1;
12514 /* Walk down the type tree of TYPE counting consecutive base elements.
12515 If *MODEP is VOIDmode, then set it to the first valid floating point
12516 type. If a non-floating point type is found, or if a floating point
12517 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12518 otherwise return the count in the sub-tree. */
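/* For example:
     struct { float x, y, z; }          -> 3, *MODEP == SFmode
     struct { double r; double i[2]; }  -> 3, *MODEP == DFmode
     _Complex double                    -> 2, *MODEP == DFmode
     struct { float f; double d; }      -> -1 (mixed base types)  */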
12519 static int
12520 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12522 machine_mode mode;
12523 HOST_WIDE_INT size;
12525 switch (TREE_CODE (type))
12527 case REAL_TYPE:
12528 mode = TYPE_MODE (type);
12529 if (mode != DFmode && mode != SFmode
12530 && mode != TFmode && mode != HFmode)
12531 return -1;
12533 if (*modep == VOIDmode)
12534 *modep = mode;
12536 if (*modep == mode)
12537 return 1;
12539 break;
12541 case COMPLEX_TYPE:
12542 mode = TYPE_MODE (TREE_TYPE (type));
12543 if (mode != DFmode && mode != SFmode
12544 && mode != TFmode && mode != HFmode)
12545 return -1;
12547 if (*modep == VOIDmode)
12548 *modep = mode;
12550 if (*modep == mode)
12551 return 2;
12553 break;
12555 case VECTOR_TYPE:
12556 /* Use V2SImode and V4SImode as representatives of all 64-bit
12557 and 128-bit vector types. */
12558 size = int_size_in_bytes (type);
12559 switch (size)
12561 case 8:
12562 mode = V2SImode;
12563 break;
12564 case 16:
12565 mode = V4SImode;
12566 break;
12567 default:
12568 return -1;
12571 if (*modep == VOIDmode)
12572 *modep = mode;
12574 /* Vector modes are considered to be opaque: two vectors are
12575 equivalent for the purposes of being homogeneous aggregates
12576 if they are the same size. */
12577 if (*modep == mode)
12578 return 1;
12580 break;
12582 case ARRAY_TYPE:
12584 int count;
12585 tree index = TYPE_DOMAIN (type);
12587 /* Can't handle incomplete types nor sizes that are not
12588 fixed. */
12589 if (!COMPLETE_TYPE_P (type)
12590 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12591 return -1;
12593 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12594 if (count == -1
12595 || !index
12596 || !TYPE_MAX_VALUE (index)
12597 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12598 || !TYPE_MIN_VALUE (index)
12599 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12600 || count < 0)
12601 return -1;
12603 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12604 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12606 /* There must be no padding. */
12607 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12608 count * GET_MODE_BITSIZE (*modep)))
12609 return -1;
12611 return count;
12614 case RECORD_TYPE:
12616 int count = 0;
12617 int sub_count;
12618 tree field;
12620 /* Can't handle incomplete types nor sizes that are not
12621 fixed. */
12622 if (!COMPLETE_TYPE_P (type)
12623 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12624 return -1;
12626 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12628 if (TREE_CODE (field) != FIELD_DECL)
12629 continue;
12631 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12632 if (sub_count < 0)
12633 return -1;
12634 count += sub_count;
12637 /* There must be no padding. */
12638 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12639 count * GET_MODE_BITSIZE (*modep)))
12640 return -1;
12642 return count;
12645 case UNION_TYPE:
12646 case QUAL_UNION_TYPE:
12648 /* These aren't very interesting except in a degenerate case. */
12649 int count = 0;
12650 int sub_count;
12651 tree field;
12653 /* Can't handle incomplete types nor sizes that are not
12654 fixed. */
12655 if (!COMPLETE_TYPE_P (type)
12656 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12657 return -1;
12659 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12661 if (TREE_CODE (field) != FIELD_DECL)
12662 continue;
12664 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12665 if (sub_count < 0)
12666 return -1;
12667 count = count > sub_count ? count : sub_count;
12670 /* There must be no padding. */
12671 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12672 count * GET_MODE_BITSIZE (*modep)))
12673 return -1;
12675 return count;
12678 default:
12679 break;
12682 return -1;
12685 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12686 type as described in AAPCS64 \S 4.1.2.
12688 See the comment above aarch64_composite_type_p for the notes on MODE. */
12690 static bool
12691 aarch64_short_vector_p (const_tree type,
12692 machine_mode mode)
12694 poly_int64 size = -1;
12696 if (type && TREE_CODE (type) == VECTOR_TYPE)
12697 size = int_size_in_bytes (type);
12698 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12699 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12700 size = GET_MODE_SIZE (mode);
12702 return known_eq (size, 8) || known_eq (size, 16);
12705 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12706 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12707 array types. The C99 floating-point complex types are also considered
12708 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12709 types, which are GCC extensions and out of the scope of AAPCS64, are
12710 treated as composite types here as well.
12712 Note that MODE itself is not sufficient in determining whether a type
12713 is such a composite type or not. This is because
12714 stor-layout.c:compute_record_mode may have already changed the MODE
12715 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12716 structure with only one field may have its MODE set to the mode of the
12717 field. Also an integer mode whose size matches the size of the
12718 RECORD_TYPE type may be used to substitute the original mode
12719 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12720 solely relied on. */
12722 static bool
12723 aarch64_composite_type_p (const_tree type,
12724 machine_mode mode)
12726 if (aarch64_short_vector_p (type, mode))
12727 return false;
12729 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12730 return true;
12732 if (mode == BLKmode
12733 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12734 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12735 return true;
12737 return false;
12740 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12741 shall be passed or returned in simd/fp register(s) (providing these
12742 parameter passing registers are available).
12744 Upon successful return, *COUNT returns the number of needed registers,
12745 *BASE_MODE returns the mode of the individual register and when IS_HA
12746 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12747 floating-point aggregate or a homogeneous short-vector aggregate. */
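/* Examples (illustrative):
     double                        -> *COUNT 1, *BASE_MODE DFmode
     _Complex float                -> *COUNT 2, *BASE_MODE SFmode, *IS_HA true
     struct { float32x4_t v[2]; }  -> *COUNT 2, *BASE_MODE V4SImode, *IS_HA true
     struct { double d; int i; }   -> not a candidate (returns false)  */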
12749 static bool
12750 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12751 const_tree type,
12752 machine_mode *base_mode,
12753 int *count,
12754 bool *is_ha)
12756 machine_mode new_mode = VOIDmode;
12757 bool composite_p = aarch64_composite_type_p (type, mode);
12759 if (is_ha != NULL) *is_ha = false;
12761 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12762 || aarch64_short_vector_p (type, mode))
12764 *count = 1;
12765 new_mode = mode;
12767 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12769 if (is_ha != NULL) *is_ha = true;
12770 *count = 2;
12771 new_mode = GET_MODE_INNER (mode);
12773 else if (type && composite_p)
12775 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12777 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12779 if (is_ha != NULL) *is_ha = true;
12780 *count = ag_count;
12782 else
12783 return false;
12785 else
12786 return false;
12788 *base_mode = new_mode;
12789 return true;
12792 /* Implement TARGET_STRUCT_VALUE_RTX. */
12794 static rtx
12795 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12796 int incoming ATTRIBUTE_UNUSED)
12798 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12801 /* Implements target hook vector_mode_supported_p. */
12802 static bool
12803 aarch64_vector_mode_supported_p (machine_mode mode)
12805 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12806 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12809 /* Return appropriate SIMD container
12810 for MODE within a vector of WIDTH bits. */
12811 static machine_mode
12812 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12814 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12815 switch (mode)
12817 case E_DFmode:
12818 return VNx2DFmode;
12819 case E_SFmode:
12820 return VNx4SFmode;
12821 case E_HFmode:
12822 return VNx8HFmode;
12823 case E_DImode:
12824 return VNx2DImode;
12825 case E_SImode:
12826 return VNx4SImode;
12827 case E_HImode:
12828 return VNx8HImode;
12829 case E_QImode:
12830 return VNx16QImode;
12831 default:
12832 return word_mode;
12835 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12836 if (TARGET_SIMD)
12838 if (known_eq (width, 128))
12839 switch (mode)
12841 case E_DFmode:
12842 return V2DFmode;
12843 case E_SFmode:
12844 return V4SFmode;
12845 case E_HFmode:
12846 return V8HFmode;
12847 case E_SImode:
12848 return V4SImode;
12849 case E_HImode:
12850 return V8HImode;
12851 case E_QImode:
12852 return V16QImode;
12853 case E_DImode:
12854 return V2DImode;
12855 default:
12856 break;
12858 else
12859 switch (mode)
12861 case E_SFmode:
12862 return V2SFmode;
12863 case E_HFmode:
12864 return V4HFmode;
12865 case E_SImode:
12866 return V2SImode;
12867 case E_HImode:
12868 return V4HImode;
12869 case E_QImode:
12870 return V8QImode;
12871 default:
12872 break;
12875 return word_mode;
12878 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12879 static machine_mode
12880 aarch64_preferred_simd_mode (scalar_mode mode)
12882 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12883 return aarch64_simd_container_mode (mode, bits);
12886 /* Return a list of possible vector sizes for the vectorizer
12887 to iterate over. */
12888 static void
12889 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12891 if (TARGET_SVE)
12892 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12893 sizes->safe_push (16);
12894 sizes->safe_push (8);
12897 /* Implement TARGET_MANGLE_TYPE. */
12899 static const char *
12900 aarch64_mangle_type (const_tree type)
12902 /* The AArch64 ABI documents say that "__va_list" has to be
12903 mangled as if it is in the "std" namespace. */
12904 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12905 return "St9__va_list";
12907 /* Half-precision float. */
12908 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12909 return "Dh";
12911 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12912 builtin types. */
12913 if (TYPE_NAME (type) != NULL)
12914 return aarch64_mangle_builtin_type (type);
12916 /* Use the default mangling. */
12917 return NULL;
12920 /* Find the first rtx_insn before insn that will generate an assembly
12921 instruction. */
12923 static rtx_insn *
12924 aarch64_prev_real_insn (rtx_insn *insn)
12926 if (!insn)
12927 return NULL;
12931 insn = prev_real_insn (insn);
12933 while (insn && recog_memoized (insn) < 0);
12935 return insn;
12938 static bool
12939 is_madd_op (enum attr_type t1)
12941 unsigned int i;
12942 /* A number of these may be AArch32 only. */
12943 enum attr_type mlatypes[] = {
12944 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12945 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12946 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12949 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12951 if (t1 == mlatypes[i])
12952 return true;
12955 return false;
12958 /* Check if there is a register dependency between a load and the insn
12959 for which we hold recog_data. */
12961 static bool
12962 dep_between_memop_and_curr (rtx memop)
12964 rtx load_reg;
12965 int opno;
12967 gcc_assert (GET_CODE (memop) == SET);
12969 if (!REG_P (SET_DEST (memop)))
12970 return false;
12972 load_reg = SET_DEST (memop);
12973 for (opno = 1; opno < recog_data.n_operands; opno++)
12975 rtx operand = recog_data.operand[opno];
12976 if (REG_P (operand)
12977 && reg_overlap_mentioned_p (load_reg, operand))
12978 return true;
12981 return false;
12985 /* When working around the Cortex-A53 erratum 835769,
12986 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12987 instruction and has a preceding memory instruction such that a NOP
12988 should be inserted between them. */
12990 bool
12991 aarch64_madd_needs_nop (rtx_insn* insn)
12993 enum attr_type attr_type;
12994 rtx_insn *prev;
12995 rtx body;
12997 if (!TARGET_FIX_ERR_A53_835769)
12998 return false;
13000 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13001 return false;
13003 attr_type = get_attr_type (insn);
13004 if (!is_madd_op (attr_type))
13005 return false;
13007 prev = aarch64_prev_real_insn (insn);
13008 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13009 Restore recog state to INSN to avoid state corruption. */
13010 extract_constrain_insn_cached (insn);
13012 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13013 return false;
13015 body = single_set (prev);
13017 /* If the previous insn is a memory op and there is no dependency between
13018 it and the DImode madd, emit a NOP between them. If body is NULL then we
13019 have a complex memory operation, probably a load/store pair.
13020 Be conservative for now and emit a NOP. */
13021 if (GET_MODE (recog_data.operand[0]) == DImode
13022 && (!body || !dep_between_memop_and_curr (body)))
13023 return true;
13025 return false;
13030 /* Implement FINAL_PRESCAN_INSN. */
13032 void
13033 aarch64_final_prescan_insn (rtx_insn *insn)
13035 if (aarch64_madd_needs_nop (insn))
13036 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
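/* With -mfix-cortex-a53-835769 the emitted assembly therefore looks
   roughly like:

       ldr     x1, [x2]
       nop     // between mem op and mult-accumulate
       madd    x0, x3, x4, x0

   so that the 64-bit multiply-accumulate never directly follows the
   memory operation.  */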
13040 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13041 instruction. */
13043 bool
13044 aarch64_sve_index_immediate_p (rtx base_or_step)
13046 return (CONST_INT_P (base_or_step)
13047 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13050 /* Return true if X is a valid immediate for the SVE ADD and SUB
13051 instructions. Negate X first if NEGATE_P is true. */
13053 bool
13054 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13056 rtx elt;
13058 if (!const_vec_duplicate_p (x, &elt)
13059 || !CONST_INT_P (elt))
13060 return false;
13062 HOST_WIDE_INT val = INTVAL (elt);
13063 if (negate_p)
13064 val = -val;
13065 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13067 if (val & 0xff)
13068 return IN_RANGE (val, 0, 0xff);
13069 return IN_RANGE (val, 0, 0xff00);
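/* That is, splatted constants 0..255 and multiples of 256 up to 65280
   are accepted, matching the instruction's 8-bit immediate with an
   optional LSL #8.  */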
13072 /* Return true if X is a valid immediate operand for an SVE logical
13073 instruction such as AND. */
13075 bool
13076 aarch64_sve_bitmask_immediate_p (rtx x)
13078 rtx elt;
13080 return (const_vec_duplicate_p (x, &elt)
13081 && CONST_INT_P (elt)
13082 && aarch64_bitmask_imm (INTVAL (elt),
13083 GET_MODE_INNER (GET_MODE (x))));
13086 /* Return true if X is a valid immediate for the SVE DUP and CPY
13087 instructions. */
13089 bool
13090 aarch64_sve_dup_immediate_p (rtx x)
13092 rtx elt;
13094 if (!const_vec_duplicate_p (x, &elt)
13095 || !CONST_INT_P (elt))
13096 return false;
13098 HOST_WIDE_INT val = INTVAL (elt);
13099 if (val & 0xff)
13100 return IN_RANGE (val, -0x80, 0x7f);
13101 return IN_RANGE (val, -0x8000, 0x7f00);
13104 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13105 SIGNED_P says whether the operand is signed rather than unsigned. */
13107 bool
13108 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13110 rtx elt;
13112 return (const_vec_duplicate_p (x, &elt)
13113 && CONST_INT_P (elt)
13114 && (signed_p
13115 ? IN_RANGE (INTVAL (elt), -16, 15)
13116 : IN_RANGE (INTVAL (elt), 0, 127)));
13119 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13120 instruction. Negate X first if NEGATE_P is true. */
13122 bool
13123 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13125 rtx elt;
13126 REAL_VALUE_TYPE r;
13128 if (!const_vec_duplicate_p (x, &elt)
13129 || GET_CODE (elt) != CONST_DOUBLE)
13130 return false;
13132 r = *CONST_DOUBLE_REAL_VALUE (elt);
13134 if (negate_p)
13135 r = real_value_negate (&r);
13137 if (real_equal (&r, &dconst1))
13138 return true;
13139 if (real_equal (&r, &dconsthalf))
13140 return true;
13141 return false;
13144 /* Return true if X is a valid immediate operand for an SVE FMUL
13145 instruction. */
13147 bool
13148 aarch64_sve_float_mul_immediate_p (rtx x)
13150 rtx elt;
13152 /* GCC will never generate a multiply with an immediate of 2, so there is no
13153 point testing for it (even though it is a valid constant). */
13154 return (const_vec_duplicate_p (x, &elt)
13155 && GET_CODE (elt) == CONST_DOUBLE
13156 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13159 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13160 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13161 is nonnull, use it to describe valid immediates. */
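/* E.g. replicating 0x00004500 is matched as the SImode value 0x45 with
   LSL #8, while 0x000045ff (low bits all ones) is matched as 0x45 with
   MSL #8 when a MOV-class immediate is being checked.  */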
13162 static bool
13163 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13164 simd_immediate_info *info,
13165 enum simd_immediate_check which,
13166 simd_immediate_info::insn_type insn)
13168 /* Try a 4-byte immediate with LSL. */
13169 for (unsigned int shift = 0; shift < 32; shift += 8)
13170 if ((val32 & (0xff << shift)) == val32)
13172 if (info)
13173 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13174 simd_immediate_info::LSL, shift);
13175 return true;
13178 /* Try a 2-byte immediate with LSL. */
13179 unsigned int imm16 = val32 & 0xffff;
13180 if (imm16 == (val32 >> 16))
13181 for (unsigned int shift = 0; shift < 16; shift += 8)
13182 if ((imm16 & (0xff << shift)) == imm16)
13184 if (info)
13185 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13186 simd_immediate_info::LSL, shift);
13187 return true;
13190 /* Try a 4-byte immediate with MSL, except for cases that MVN
13191 can handle. */
13192 if (which == AARCH64_CHECK_MOV)
13193 for (unsigned int shift = 8; shift < 24; shift += 8)
13195 unsigned int low = (1 << shift) - 1;
13196 if (((val32 & (0xff << shift)) | low) == val32)
13198 if (info)
13199 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13200 simd_immediate_info::MSL, shift);
13201 return true;
13205 return false;
13208 /* Return true if replicating VAL64 is a valid immediate for the
13209 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13210 use it to describe valid immediates. */
13211 static bool
13212 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13213 simd_immediate_info *info,
13214 enum simd_immediate_check which)
13216 unsigned int val32 = val64 & 0xffffffff;
13217 unsigned int val16 = val64 & 0xffff;
13218 unsigned int val8 = val64 & 0xff;
13220 if (val32 == (val64 >> 32))
13222 if ((which & AARCH64_CHECK_ORR) != 0
13223 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13224 simd_immediate_info::MOV))
13225 return true;
13227 if ((which & AARCH64_CHECK_BIC) != 0
13228 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13229 simd_immediate_info::MVN))
13230 return true;
13232 /* Try using a replicated byte. */
13233 if (which == AARCH64_CHECK_MOV
13234 && val16 == (val32 >> 16)
13235 && val8 == (val16 >> 8))
13237 if (info)
13238 *info = simd_immediate_info (QImode, val8);
13239 return true;
13243 /* Try using a bit-to-bytemask. */
13244 if (which == AARCH64_CHECK_MOV)
13246 unsigned int i;
13247 for (i = 0; i < 64; i += 8)
13249 unsigned char byte = (val64 >> i) & 0xff;
13250 if (byte != 0 && byte != 0xff)
13251 break;
13253 if (i == 64)
13255 if (info)
13256 *info = simd_immediate_info (DImode, val64);
13257 return true;
13260 return false;
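/* The bit-to-bytemask case above accepts 64-bit patterns whose bytes
   are each either 0x00 or 0xff, e.g. 0xff00ff00ff00ff00, which map to
   the 64-bit MOVI byte-mask immediate encoding.  */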
13263 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13264 instruction. If INFO is nonnull, use it to describe valid immediates. */
13266 static bool
13267 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13268 simd_immediate_info *info)
13270 scalar_int_mode mode = DImode;
13271 unsigned int val32 = val64 & 0xffffffff;
13272 if (val32 == (val64 >> 32))
13274 mode = SImode;
13275 unsigned int val16 = val32 & 0xffff;
13276 if (val16 == (val32 >> 16))
13278 mode = HImode;
13279 unsigned int val8 = val16 & 0xff;
13280 if (val8 == (val16 >> 8))
13281 mode = QImode;
13284 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13285 if (IN_RANGE (val, -0x80, 0x7f))
13287 /* DUP with no shift. */
13288 if (info)
13289 *info = simd_immediate_info (mode, val);
13290 return true;
13292 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13294 /* DUP with LSL #8. */
13295 if (info)
13296 *info = simd_immediate_info (mode, val);
13297 return true;
13299 if (aarch64_bitmask_imm (val64, mode))
13301 /* DUPM. */
13302 if (info)
13303 *info = simd_immediate_info (mode, val);
13304 return true;
13306 return false;
13309 /* Return true if OP is a valid SIMD immediate for the operation
13310 described by WHICH. If INFO is nonnull, use it to describe valid
13311 immediates. */
13312 bool
13313 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13314 enum simd_immediate_check which)
13316 machine_mode mode = GET_MODE (op);
13317 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13318 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13319 return false;
13321 scalar_mode elt_mode = GET_MODE_INNER (mode);
13322 rtx base, step;
13323 unsigned int n_elts;
13324 if (GET_CODE (op) == CONST_VECTOR
13325 && CONST_VECTOR_DUPLICATE_P (op))
13326 n_elts = CONST_VECTOR_NPATTERNS (op);
13327 else if ((vec_flags & VEC_SVE_DATA)
13328 && const_vec_series_p (op, &base, &step))
13330 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13331 if (!aarch64_sve_index_immediate_p (base)
13332 || !aarch64_sve_index_immediate_p (step))
13333 return false;
13335 if (info)
13336 *info = simd_immediate_info (elt_mode, base, step);
13337 return true;
13339 else if (GET_CODE (op) == CONST_VECTOR
13340 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13341 /* N_ELTS set above. */;
13342 else
13343 return false;
13345 /* Handle PFALSE and PTRUE. */
13346 if (vec_flags & VEC_SVE_PRED)
13347 return (op == CONST0_RTX (mode)
13348 || op == CONSTM1_RTX (mode));
13350 scalar_float_mode elt_float_mode;
13351 if (n_elts == 1
13352 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13354 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13355 if (aarch64_float_const_zero_rtx_p (elt)
13356 || aarch64_float_const_representable_p (elt))
13358 if (info)
13359 *info = simd_immediate_info (elt_float_mode, elt);
13360 return true;
13364 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13365 if (elt_size > 8)
13366 return false;
13368 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13370 /* Expand the vector constant out into a byte vector, with the least
13371 significant byte of the register first. */
13372 auto_vec<unsigned char, 16> bytes;
13373 bytes.reserve (n_elts * elt_size);
13374 for (unsigned int i = 0; i < n_elts; i++)
13376 /* The vector is provided in gcc endian-neutral fashion.
13377 For aarch64_be Advanced SIMD, it must be laid out in the vector
13378 register in reverse order. */
13379 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13380 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13382 if (elt_mode != elt_int_mode)
13383 elt = gen_lowpart (elt_int_mode, elt);
13385 if (!CONST_INT_P (elt))
13386 return false;
13388 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13389 for (unsigned int byte = 0; byte < elt_size; byte++)
13391 bytes.quick_push (elt_val & 0xff);
13392 elt_val >>= BITS_PER_UNIT;
13396 /* The immediate must repeat every eight bytes. */
13397 unsigned int nbytes = bytes.length ();
13398 for (unsigned i = 8; i < nbytes; ++i)
13399 if (bytes[i] != bytes[i - 8])
13400 return false;
13402 /* Get the repeating 8-byte value as an integer. No endian correction
13403 is needed here because bytes is already in lsb-first order. */
13404 unsigned HOST_WIDE_INT val64 = 0;
13405 for (unsigned int i = 0; i < 8; i++)
13406 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13407 << (i * BITS_PER_UNIT));
13409 if (vec_flags & VEC_SVE_DATA)
13410 return aarch64_sve_valid_immediate (val64, info);
13411 else
13412 return aarch64_advsimd_valid_immediate (val64, info, which);
13415 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13416 has a step in the range of INDEX. Return the index expression if so,
13417 otherwise return null. */
rtx
13419 aarch64_check_zero_based_sve_index_immediate (rtx x)
13421 rtx base, step;
13422 if (const_vec_series_p (x, &base, &step)
13423 && base == const0_rtx
13424 && aarch64_sve_index_immediate_p (step))
13425 return step;
13426 return NULL_RTX;
13429 /* Check if immediate shift constants are within range. */
13430 bool
13431 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13433 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13434 if (left)
13435 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13436 else
13437 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13440 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13441 operation of width WIDTH at bit position POS. */
rtx
13444 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13446 gcc_assert (CONST_INT_P (width));
13447 gcc_assert (CONST_INT_P (pos));
13449 unsigned HOST_WIDE_INT mask
13450 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13451 return GEN_INT (mask << UINTVAL (pos));
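/* For instance, WIDTH == 8 and POS == 16 give the mask 0x00ff0000,
   selecting the third byte of the source register.  */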
13454 bool
13455 aarch64_mov_operand_p (rtx x, machine_mode mode)
13457 if (GET_CODE (x) == HIGH
13458 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13459 return true;
13461 if (CONST_INT_P (x))
13462 return true;
13464 if (VECTOR_MODE_P (GET_MODE (x)))
13465 return aarch64_simd_valid_immediate (x, NULL);
13467 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13468 return true;
13470 if (aarch64_sve_cnt_immediate_p (x))
13471 return true;
13473 return aarch64_classify_symbolic_expression (x)
13474 == SYMBOL_TINY_ABSOLUTE;
13477 /* Return a const_int vector of VAL. */
rtx
13479 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13481 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13482 return gen_const_vec_duplicate (mode, c);
13485 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13487 bool
13488 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13490 machine_mode vmode;
13492 vmode = aarch64_simd_container_mode (mode, 64);
13493 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13494 return aarch64_simd_valid_immediate (op_v, NULL);
13497 /* Construct and return a PARALLEL RTX vector with elements numbering the
13498 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13499 the vector - from the perspective of the architecture. This does not
13500 line up with GCC's perspective on lane numbers, so we end up with
13501 different masks depending on our target endian-ness. The diagram
13502 below may help. We must draw the distinction when building masks
13503 which select one half of the vector. An instruction selecting
13504 architectural low-lanes for a big-endian target, must be described using
13505 a mask selecting GCC high-lanes.
13507 Big-Endian Little-Endian
13509 GCC 0 1 2 3 3 2 1 0
13510 | x | x | x | x | | x | x | x | x |
13511 Architecture 3 2 1 0 3 2 1 0
13513 Low Mask: { 2, 3 } { 0, 1 }
13514 High Mask: { 0, 1 } { 2, 3 }
13516 MODE Is the mode of the vector and NUNITS is the number of units in it. */
rtx
13519 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13521 rtvec v = rtvec_alloc (nunits / 2);
13522 int high_base = nunits / 2;
13523 int low_base = 0;
13524 int base;
13525 rtx t1;
13526 int i;
13528 if (BYTES_BIG_ENDIAN)
13529 base = high ? low_base : high_base;
13530 else
13531 base = high ? high_base : low_base;
13533 for (i = 0; i < nunits / 2; i++)
13534 RTVEC_ELT (v, i) = GEN_INT (base + i);
13536 t1 = gen_rtx_PARALLEL (mode, v);
13537 return t1;
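/* As a concrete instance of the diagram above: for V4SImode (NUNITS == 4)
   this returns (parallel [0 1]) for the low half and (parallel [2 3]) for
   the high half on little-endian targets, with the two swapped on
   big-endian targets so that the mask still selects the architectural
   half that was asked for.  */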
13540 /* Check OP for validity as a PARALLEL RTX vector with elements
13541 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13542 from the perspective of the architecture. See the diagram above
13543 aarch64_simd_vect_par_cnst_half for more details. */
13545 bool
13546 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13547 bool high)
13549 int nelts;
13550 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13551 return false;
13553 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13554 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13555 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13556 int i = 0;
13558 if (count_op != count_ideal)
13559 return false;
13561 for (i = 0; i < count_ideal; i++)
13563 rtx elt_op = XVECEXP (op, 0, i);
13564 rtx elt_ideal = XVECEXP (ideal, 0, i);
13566 if (!CONST_INT_P (elt_op)
13567 || INTVAL (elt_ideal) != INTVAL (elt_op))
13568 return false;
13570 return true;
13573 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13574 HIGH (exclusive). */
13575 void
13576 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13577 const_tree exp)
13579 HOST_WIDE_INT lane;
13580 gcc_assert (CONST_INT_P (operand));
13581 lane = INTVAL (operand);
13583 if (lane < low || lane >= high)
13585 if (exp)
13586 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13587 else
13588 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13592 /* Perform endian correction on lane number N, which indexes a vector
13593 of mode MODE, and return the result as an SImode rtx. */
13596 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13598 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
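/* For example, lane 1 of a V4SImode vector stays lane 1 on little-endian
   but becomes lane 2 on big-endian, assuming ENDIAN_LANE_N maps N to
   NUNITS - 1 - N there; this matches the lane-numbering diagram above
   aarch64_simd_vect_par_cnst_half.  */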
13601 /* Return TRUE if OP is a valid vector addressing mode. */
13603 bool
13604 aarch64_simd_mem_operand_p (rtx op)
13606 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13607 || REG_P (XEXP (op, 0)));
13610 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13612 bool
13613 aarch64_sve_ld1r_operand_p (rtx op)
13615 struct aarch64_address_info addr;
13616 scalar_mode mode;
13618 return (MEM_P (op)
13619 && is_a <scalar_mode> (GET_MODE (op), &mode)
13620 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13621 && addr.type == ADDRESS_REG_IMM
13622 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13625 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13626 The conditions for STR are the same. */
13627 bool
13628 aarch64_sve_ldr_operand_p (rtx op)
13630 struct aarch64_address_info addr;
13632 return (MEM_P (op)
13633 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13634 false, ADDR_QUERY_ANY)
13635 && addr.type == ADDRESS_REG_IMM);
13638 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13639 We need to be able to access the individual pieces, so the range
13640 is different from LD[234] and ST[234]. */
13641 bool
13642 aarch64_sve_struct_memory_operand_p (rtx op)
13644 if (!MEM_P (op))
13645 return false;
13647 machine_mode mode = GET_MODE (op);
13648 struct aarch64_address_info addr;
13649 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13650 ADDR_QUERY_ANY)
13651 || addr.type != ADDRESS_REG_IMM)
13652 return false;
13654 poly_int64 first = addr.const_offset;
13655 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13656 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13657 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
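/* A worked example, assuming offset_4bit_signed_scaled_p accepts
   multiples of the vector length in the signed range [-8, 7] vectors:
   a three-vector tuple is only addressable this way when its first
   vector sits at an offset of -8..+5 vector lengths from the base, so
   that its last vector still falls within -8..+7.  */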
13660 /* Emit a register copy from operand to operand, taking care not to
13661 early-clobber source registers in the process.
13663 COUNT is the number of components into which the copy needs to be
13664 decomposed. */
13665 void
13666 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13667 unsigned int count)
13669 unsigned int i;
13670 int rdest = REGNO (operands[0]);
13671 int rsrc = REGNO (operands[1]);
13673 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13674 || rdest < rsrc)
13675 for (i = 0; i < count; i++)
13676 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13677 gen_rtx_REG (mode, rsrc + i));
13678 else
13679 for (i = 0; i < count; i++)
13680 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13681 gen_rtx_REG (mode, rsrc + count - i - 1));
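/* A worked example of the ordering above: copying a two-register group
   from {v1, v2} to {v2, v3} must be emitted as v3 = v2 followed by
   v2 = v1; emitting the copies in the forward order would overwrite v2
   before it had been read.  Non-overlapping copies, or copies that move
   the group downwards, can safely use the forward loop.  */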
13684 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13685    one of the VSTRUCT modes: OI, CI, or XI. */
13687 aarch64_simd_attr_length_rglist (machine_mode mode)
13689 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13690 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13693 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13694 alignment of a vector to 128 bits. SVE predicates have an alignment of
13695 16 bits. */
13696 static HOST_WIDE_INT
13697 aarch64_simd_vector_alignment (const_tree type)
13699 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13700 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13701 be set for non-predicate vectors of booleans. Modes are the most
13702 direct way we have of identifying real SVE predicate types. */
13703 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13704 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13705 return MIN (align, 128);
13708 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13709 static HOST_WIDE_INT
13710 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13712 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13714 /* If the length of the vector is fixed, try to align to that length,
13715 otherwise don't try to align at all. */
13716 HOST_WIDE_INT result;
13717 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13718 result = TYPE_ALIGN (TREE_TYPE (type));
13719 return result;
13721 return TYPE_ALIGN (type);
13724 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13725 static bool
13726 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13728 if (is_packed)
13729 return false;
13731 /* For fixed-length vectors, check that the vectorizer will aim for
13732 full-vector alignment. This isn't true for generic GCC vectors
13733 that are wider than the ABI maximum of 128 bits. */
13734 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13735 && (wi::to_widest (TYPE_SIZE (type))
13736 != aarch64_vectorize_preferred_vector_alignment (type)))
13737 return false;
13739 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13740 return true;
13743 /* Return true if the vector misalignment factor is supported by the
13744 target. */
13745 static bool
13746 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13747 const_tree type, int misalignment,
13748 bool is_packed)
13750 if (TARGET_SIMD && STRICT_ALIGNMENT)
13752 /* Return false if the movmisalign pattern is not supported for this mode. */
13753 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13754 return false;
13756 /* Misalignment factor is unknown at compile time. */
13757 if (misalignment == -1)
13758 return false;
13760 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13761 is_packed);
13764 /* If VALS is a vector constant that can be loaded into a register
13765 using DUP, generate instructions to do so and return an RTX to
13766 assign to the register. Otherwise return NULL_RTX. */
13767 static rtx
13768 aarch64_simd_dup_constant (rtx vals)
13770 machine_mode mode = GET_MODE (vals);
13771 machine_mode inner_mode = GET_MODE_INNER (mode);
13772 rtx x;
13774 if (!const_vec_duplicate_p (vals, &x))
13775 return NULL_RTX;
13777 /* We can load this constant by using DUP and a constant in a
13778 single ARM register. This will be cheaper than a vector
13779 load. */
13780 x = copy_to_mode_reg (inner_mode, x);
13781 return gen_vec_duplicate (mode, x);
13785 /* Generate code to load VALS, which is a PARALLEL containing only
13786 constants (for vec_init) or CONST_VECTOR, efficiently into a
13787 register. Returns an RTX to copy into the register, or NULL_RTX
13788    for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13789 static rtx
13790 aarch64_simd_make_constant (rtx vals)
13792 machine_mode mode = GET_MODE (vals);
13793 rtx const_dup;
13794 rtx const_vec = NULL_RTX;
13795 int n_const = 0;
13796 int i;
13798 if (GET_CODE (vals) == CONST_VECTOR)
13799 const_vec = vals;
13800 else if (GET_CODE (vals) == PARALLEL)
13802 /* A CONST_VECTOR must contain only CONST_INTs and
13803 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13804 Only store valid constants in a CONST_VECTOR. */
13805 int n_elts = XVECLEN (vals, 0);
13806 for (i = 0; i < n_elts; ++i)
13808 rtx x = XVECEXP (vals, 0, i);
13809 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13810 n_const++;
13812 if (n_const == n_elts)
13813 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13815 else
13816 gcc_unreachable ();
13818 if (const_vec != NULL_RTX
13819 && aarch64_simd_valid_immediate (const_vec, NULL))
13820 /* Load using MOVI/MVNI. */
13821 return const_vec;
13822 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13823 /* Loaded using DUP. */
13824 return const_dup;
13825 else if (const_vec != NULL_RTX)
13826 /* Load from constant pool. We cannot take advantage of single-cycle
13827 LD1 because we need a PC-relative addressing mode. */
13828 return const_vec;
13829 else
13830 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13831    We cannot construct an initializer. */
13832 return NULL_RTX;
13835 /* Expand a vector initialisation sequence, such that TARGET is
13836 initialised to contain VALS. */
13838 void
13839 aarch64_expand_vector_init (rtx target, rtx vals)
13841 machine_mode mode = GET_MODE (target);
13842 scalar_mode inner_mode = GET_MODE_INNER (mode);
13843 /* The number of vector elements. */
13844 int n_elts = XVECLEN (vals, 0);
13845 /* The number of vector elements which are not constant. */
13846 int n_var = 0;
13847 rtx any_const = NULL_RTX;
13848 /* The first element of vals. */
13849 rtx v0 = XVECEXP (vals, 0, 0);
13850 bool all_same = true;
13852 /* Count the number of variable elements to initialise. */
13853 for (int i = 0; i < n_elts; ++i)
13855 rtx x = XVECEXP (vals, 0, i);
13856 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13857 ++n_var;
13858 else
13859 any_const = x;
13861 all_same &= rtx_equal_p (x, v0);
13864 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13865 how best to handle this. */
13866 if (n_var == 0)
13868 rtx constant = aarch64_simd_make_constant (vals);
13869 if (constant != NULL_RTX)
13871 emit_move_insn (target, constant);
13872 return;
13876 /* Splat a single non-constant element if we can. */
13877 if (all_same)
13879 rtx x = copy_to_mode_reg (inner_mode, v0);
13880 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13881 return;
13884 enum insn_code icode = optab_handler (vec_set_optab, mode);
13885 gcc_assert (icode != CODE_FOR_nothing);
13887 /* If there are only variable elements, try to optimize
13888 the insertion using dup for the most common element
13889 followed by insertions. */
13891 /* The algorithm will fill matches[*][0] with the earliest matching element,
13892 and matches[X][1] with the count of duplicate elements (if X is the
13893 earliest element which has duplicates). */
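/* For example, for VALS == {a, b, a, a} the loops below produce
   matches[0] == {0, 3}, matches[1] == {1, 1} and matches[2] ==
   matches[3] == {0, 0}: element 0 is the earliest element with the most
   duplicates (maxv == 3), so the expansion becomes a DUP of a followed
   by a single lane insert of b.  */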
13895 if (n_var == n_elts && n_elts <= 16)
13897 int matches[16][2] = {0};
13898 for (int i = 0; i < n_elts; i++)
13900 for (int j = 0; j <= i; j++)
13902 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13904 matches[i][0] = j;
13905 matches[j][1]++;
13906 break;
13910 int maxelement = 0;
13911 int maxv = 0;
13912 for (int i = 0; i < n_elts; i++)
13913 if (matches[i][1] > maxv)
13915 maxelement = i;
13916 maxv = matches[i][1];
13919 /* Create a duplicate of the most common element, unless all elements
13920 are equally useless to us, in which case just immediately set the
13921 vector register using the first element. */
13923 if (maxv == 1)
13925 /* For vectors of two 64-bit elements, we can do even better. */
13926 if (n_elts == 2
13927 && (inner_mode == E_DImode
13928 || inner_mode == E_DFmode))
13931 rtx x0 = XVECEXP (vals, 0, 0);
13932 rtx x1 = XVECEXP (vals, 0, 1);
13933 /* Combine can pick up this case, but handling it directly
13934 here leaves clearer RTL.
13936 This is load_pair_lanes<mode>, and also gives us a clean-up
13937 for store_pair_lanes<mode>. */
13938 if (memory_operand (x0, inner_mode)
13939 && memory_operand (x1, inner_mode)
13940 && !STRICT_ALIGNMENT
13941 && rtx_equal_p (XEXP (x1, 0),
13942 plus_constant (Pmode,
13943 XEXP (x0, 0),
13944 GET_MODE_SIZE (inner_mode))))
13946 rtx t;
13947 if (inner_mode == DFmode)
13948 t = gen_load_pair_lanesdf (target, x0, x1);
13949 else
13950 t = gen_load_pair_lanesdi (target, x0, x1);
13951 emit_insn (t);
13952 return;
13955 /* The subreg-move sequence below will move into lane zero of the
13956 vector register. For big-endian we want that position to hold
13957 the last element of VALS. */
13958 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13959 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13960 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13962 else
13964 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13965 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13968 /* Insert the rest. */
13969 for (int i = 0; i < n_elts; i++)
13971 rtx x = XVECEXP (vals, 0, i);
13972 if (matches[i][0] == maxelement)
13973 continue;
13974 x = copy_to_mode_reg (inner_mode, x);
13975 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13977 return;
13980 /* Initialise a vector which is part-variable. We want to first try
13981 to build those lanes which are constant in the most efficient way we
13982 can. */
13983 if (n_var != n_elts)
13985 rtx copy = copy_rtx (vals);
13987 /* Load constant part of vector. We really don't care what goes into the
13988 parts we will overwrite, but we're more likely to be able to load the
13989 constant efficiently if it has fewer, larger, repeating parts
13990 (see aarch64_simd_valid_immediate). */
13991 for (int i = 0; i < n_elts; i++)
13993 rtx x = XVECEXP (vals, 0, i);
13994 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13995 continue;
13996 rtx subst = any_const;
13997 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13999 /* Look in the copied vector, as more elements are const. */
14000 rtx test = XVECEXP (copy, 0, i ^ bit);
14001 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14003 subst = test;
14004 break;
14007 XVECEXP (copy, 0, i) = subst;
14009 aarch64_expand_vector_init (target, copy);
14012 /* Insert the variable lanes directly. */
14013 for (int i = 0; i < n_elts; i++)
14015 rtx x = XVECEXP (vals, 0, i);
14016 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14017 continue;
14018 x = copy_to_mode_reg (inner_mode, x);
14019 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14023 static unsigned HOST_WIDE_INT
14024 aarch64_shift_truncation_mask (machine_mode mode)
14026 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14027 return 0;
14028 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14031 /* Select a format to encode pointers in exception handling data. */
14033 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14035 int type;
14036 switch (aarch64_cmodel)
14038 case AARCH64_CMODEL_TINY:
14039 case AARCH64_CMODEL_TINY_PIC:
14040 case AARCH64_CMODEL_SMALL:
14041 case AARCH64_CMODEL_SMALL_PIC:
14042 case AARCH64_CMODEL_SMALL_SPIC:
14043 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14044 for everything. */
14045 type = DW_EH_PE_sdata4;
14046 break;
14047 default:
14048 /* No assumptions here. 8-byte relocs required. */
14049 type = DW_EH_PE_sdata8;
14050 break;
14052 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14055 /* The last .arch and .tune assembly strings that we printed. */
14056 static std::string aarch64_last_printed_arch_string;
14057 static std::string aarch64_last_printed_tune_string;
14059 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14060 by the function fndecl. */
14062 void
14063 aarch64_declare_function_name (FILE *stream, const char* name,
14064 tree fndecl)
14066 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14068 struct cl_target_option *targ_options;
14069 if (target_parts)
14070 targ_options = TREE_TARGET_OPTION (target_parts);
14071 else
14072 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14073 gcc_assert (targ_options);
14075 const struct processor *this_arch
14076 = aarch64_get_arch (targ_options->x_explicit_arch);
14078 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14079 std::string extension
14080 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14081 this_arch->flags);
14082 /* Only update the assembler .arch string if it is distinct from the last
14083 such string we printed. */
14084 std::string to_print = this_arch->name + extension;
14085 if (to_print != aarch64_last_printed_arch_string)
14087 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14088 aarch64_last_printed_arch_string = to_print;
14091 /* Print the cpu name we're tuning for in the comments; this might be
14092    useful to readers of the generated asm. Do it only when it changes
14093 from function to function and verbose assembly is requested. */
14094 const struct processor *this_tune
14095 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14097 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14099 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14100 this_tune->name);
14101 aarch64_last_printed_tune_string = this_tune->name;
14104 /* Don't forget the type directive for ELF. */
14105 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14106 ASM_OUTPUT_LABEL (stream, name);
14109 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14111 static void
14112 aarch64_start_file (void)
14114 struct cl_target_option *default_options
14115 = TREE_TARGET_OPTION (target_option_default_node);
14117 const struct processor *default_arch
14118 = aarch64_get_arch (default_options->x_explicit_arch);
14119 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14120 std::string extension
14121 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14122 default_arch->flags);
14124 aarch64_last_printed_arch_string = default_arch->name + extension;
14125 aarch64_last_printed_tune_string = "";
14126 asm_fprintf (asm_out_file, "\t.arch %s\n",
14127 aarch64_last_printed_arch_string.c_str ());
14129 default_file_start ();
14132 /* Emit load exclusive. */
14134 static void
14135 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14136 rtx mem, rtx model_rtx)
14138 rtx (*gen) (rtx, rtx, rtx);
14140 switch (mode)
14142 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14143 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14144 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14145 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14146 default:
14147 gcc_unreachable ();
14150 emit_insn (gen (rval, mem, model_rtx));
14153 /* Emit store exclusive. */
14155 static void
14156 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14157 rtx rval, rtx mem, rtx model_rtx)
14159 rtx (*gen) (rtx, rtx, rtx, rtx);
14161 switch (mode)
14163 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14164 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14165 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14166 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14167 default:
14168 gcc_unreachable ();
14171 emit_insn (gen (bval, rval, mem, model_rtx));
14174 /* Mark the previous jump instruction as unlikely. */
14176 static void
14177 aarch64_emit_unlikely_jump (rtx insn)
14179 rtx_insn *jump = emit_jump_insn (insn);
14180 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14183 /* Expand a compare and swap pattern. */
14185 void
14186 aarch64_expand_compare_and_swap (rtx operands[])
14188 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14189 machine_mode mode, cmp_mode;
14190 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14191 int idx;
14192 gen_cas_fn gen;
14193 const gen_cas_fn split_cas[] =
14195 gen_aarch64_compare_and_swapqi,
14196 gen_aarch64_compare_and_swaphi,
14197 gen_aarch64_compare_and_swapsi,
14198 gen_aarch64_compare_and_swapdi
14200 const gen_cas_fn atomic_cas[] =
14202 gen_aarch64_compare_and_swapqi_lse,
14203 gen_aarch64_compare_and_swaphi_lse,
14204 gen_aarch64_compare_and_swapsi_lse,
14205 gen_aarch64_compare_and_swapdi_lse
14208 bval = operands[0];
14209 rval = operands[1];
14210 mem = operands[2];
14211 oldval = operands[3];
14212 newval = operands[4];
14213 is_weak = operands[5];
14214 mod_s = operands[6];
14215 mod_f = operands[7];
14216 mode = GET_MODE (mem);
14217 cmp_mode = mode;
14219 /* Normally the succ memory model must be stronger than fail, but in the
14220 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14221 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14223 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14224 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14225 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14227 switch (mode)
14229 case E_QImode:
14230 case E_HImode:
14231 /* For short modes, we're going to perform the comparison in SImode,
14232 so do the zero-extension now. */
14233 cmp_mode = SImode;
14234 rval = gen_reg_rtx (SImode);
14235 oldval = convert_modes (SImode, mode, oldval, true);
14236 /* Fall through. */
14238 case E_SImode:
14239 case E_DImode:
14240 /* Force the value into a register if needed. */
14241 if (!aarch64_plus_operand (oldval, mode))
14242 oldval = force_reg (cmp_mode, oldval);
14243 break;
14245 default:
14246 gcc_unreachable ();
14249 switch (mode)
14251 case E_QImode: idx = 0; break;
14252 case E_HImode: idx = 1; break;
14253 case E_SImode: idx = 2; break;
14254 case E_DImode: idx = 3; break;
14255 default:
14256 gcc_unreachable ();
14258 if (TARGET_LSE)
14259 gen = atomic_cas[idx];
14260 else
14261 gen = split_cas[idx];
14263 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14265 if (mode == QImode || mode == HImode)
14266 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14268 x = gen_rtx_REG (CCmode, CC_REGNUM);
14269 x = gen_rtx_EQ (SImode, x, const0_rtx);
14270 emit_insn (gen_rtx_SET (bval, x));
14273 /* Test whether the target supports using an atomic load-operate instruction
14274    for operation CODE. Returns FALSE if the operation isn't supported by the
14275    architecture. */
14279 bool
14280 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14282 if (!TARGET_LSE)
14283 return false;
14285 switch (code)
14287 case SET:
14288 case AND:
14289 case IOR:
14290 case XOR:
14291 case MINUS:
14292 case PLUS:
14293 return true;
14294 default:
14295 return false;
14299 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14300 sequence implementing an atomic operation. */
14302 static void
14303 aarch64_emit_post_barrier (enum memmodel model)
14305 const enum memmodel base_model = memmodel_base (model);
14307 if (is_mm_sync (model)
14308 && (base_model == MEMMODEL_ACQUIRE
14309 || base_model == MEMMODEL_ACQ_REL
14310 || base_model == MEMMODEL_SEQ_CST))
14312 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14316 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14317 for the data in memory. EXPECTED is the value expected to be in memory.
14318 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14319 is the memory ordering to use. */
14321 void
14322 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14323 rtx expected, rtx desired,
14324 rtx model)
14326 rtx (*gen) (rtx, rtx, rtx, rtx);
14327 machine_mode mode;
14329 mode = GET_MODE (mem);
14331 switch (mode)
14333 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14334 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14335 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14336 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14337 default:
14338 gcc_unreachable ();
14341 /* Move the expected value into the CAS destination register. */
14342 emit_insn (gen_rtx_SET (rval, expected));
14344 /* Emit the CAS. */
14345 emit_insn (gen (rval, mem, desired, model));
14347 /* Compare the expected value with the value loaded by the CAS, to establish
14348 whether the swap was made. */
14349 aarch64_gen_compare_reg (EQ, rval, expected);
14352 /* Split a compare and swap pattern. */
14354 void
14355 aarch64_split_compare_and_swap (rtx operands[])
14357 rtx rval, mem, oldval, newval, scratch;
14358 machine_mode mode;
14359 bool is_weak;
14360 rtx_code_label *label1, *label2;
14361 rtx x, cond;
14362 enum memmodel model;
14363 rtx model_rtx;
14365 rval = operands[0];
14366 mem = operands[1];
14367 oldval = operands[2];
14368 newval = operands[3];
14369 is_weak = (operands[4] != const0_rtx);
14370 model_rtx = operands[5];
14371 scratch = operands[7];
14372 mode = GET_MODE (mem);
14373 model = memmodel_from_int (INTVAL (model_rtx));
14375 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14376 loop:
14377 .label1:
14378 LD[A]XR rval, [mem]
14379 CBNZ rval, .label2
14380 ST[L]XR scratch, newval, [mem]
14381 CBNZ scratch, .label1
14382 .label2:
14383 CMP rval, 0. */
14384 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14386 label1 = NULL;
14387 if (!is_weak)
14389 label1 = gen_label_rtx ();
14390 emit_label (label1);
14392 label2 = gen_label_rtx ();
14394 /* The initial load can be relaxed for a __sync operation since a final
14395 barrier will be emitted to stop code hoisting. */
14396 if (is_mm_sync (model))
14397 aarch64_emit_load_exclusive (mode, rval, mem,
14398 GEN_INT (MEMMODEL_RELAXED));
14399 else
14400 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14402 if (strong_zero_p)
14404 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14405 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14406 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14407 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14409 else
14411 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14412 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14413 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14414 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14415 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14418 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14420 if (!is_weak)
14422 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14423 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14424 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14425 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14427 else
14429 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14430 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14431 emit_insn (gen_rtx_SET (cond, x));
14434 emit_label (label2);
14435 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14436 to set the condition flags. If this is not used it will be removed by
14437 later passes. */
14438 if (strong_zero_p)
14440 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14441 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14442 emit_insn (gen_rtx_SET (cond, x));
14444 /* Emit any final barrier needed for a __sync operation. */
14445 if (is_mm_sync (model))
14446 aarch64_emit_post_barrier (model);
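/* Schematically, the general (non-zero OLDVAL) strong form of the loop
   emitted above is:
     .label1:
       LD[A]XR  rval, [mem]
       CMP      rval, oldval
       B.NE     .label2
       ST[L]XR  scratch, newval, [mem]
       CBNZ     scratch, .label1
     .label2:
   while the weak form drops label1 and the retry branch and instead
   copies the store-exclusive result into the condition flags.  */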
14449 /* Emit a BIC instruction. */
14451 static void
14452 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14454 rtx shift_rtx = GEN_INT (shift);
14455 rtx (*gen) (rtx, rtx, rtx, rtx);
14457 switch (mode)
14459 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14460 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14461 default:
14462 gcc_unreachable ();
14465 emit_insn (gen (dst, s2, shift_rtx, s1));
14468 /* Emit an atomic swap. */
14470 static void
14471 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14472 rtx mem, rtx model)
14474 rtx (*gen) (rtx, rtx, rtx, rtx);
14476 switch (mode)
14478 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14479 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14480 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14481 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14482 default:
14483 gcc_unreachable ();
14486 emit_insn (gen (dst, mem, value, model));
14489 /* Operations supported by aarch64_emit_atomic_load_op. */
14491 enum aarch64_atomic_load_op_code
14493 AARCH64_LDOP_PLUS, /* A + B */
14494 AARCH64_LDOP_XOR, /* A ^ B */
14495 AARCH64_LDOP_OR, /* A | B */
14496 AARCH64_LDOP_BIC /* A & ~B */
14499 /* Emit an atomic load-operate. */
14501 static void
14502 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14503 machine_mode mode, rtx dst, rtx src,
14504 rtx mem, rtx model)
14506 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14507 const aarch64_atomic_load_op_fn plus[] =
14509 gen_aarch64_atomic_loadaddqi,
14510 gen_aarch64_atomic_loadaddhi,
14511 gen_aarch64_atomic_loadaddsi,
14512 gen_aarch64_atomic_loadadddi
14514 const aarch64_atomic_load_op_fn eor[] =
14516 gen_aarch64_atomic_loadeorqi,
14517 gen_aarch64_atomic_loadeorhi,
14518 gen_aarch64_atomic_loadeorsi,
14519 gen_aarch64_atomic_loadeordi
14521 const aarch64_atomic_load_op_fn ior[] =
14523 gen_aarch64_atomic_loadsetqi,
14524 gen_aarch64_atomic_loadsethi,
14525 gen_aarch64_atomic_loadsetsi,
14526 gen_aarch64_atomic_loadsetdi
14528 const aarch64_atomic_load_op_fn bic[] =
14530 gen_aarch64_atomic_loadclrqi,
14531 gen_aarch64_atomic_loadclrhi,
14532 gen_aarch64_atomic_loadclrsi,
14533 gen_aarch64_atomic_loadclrdi
14535 aarch64_atomic_load_op_fn gen;
14536 int idx = 0;
14538 switch (mode)
14540 case E_QImode: idx = 0; break;
14541 case E_HImode: idx = 1; break;
14542 case E_SImode: idx = 2; break;
14543 case E_DImode: idx = 3; break;
14544 default:
14545 gcc_unreachable ();
14548 switch (code)
14550 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14551 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14552 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14553 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14554 default:
14555 gcc_unreachable ();
14558 emit_insn (gen (dst, mem, src, model));
14561 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14562 location to store the data read from memory. OUT_RESULT is the location to
14563 store the result of the operation. MEM is the memory location to read and
14564 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14565 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14566 be NULL. */
14568 void
14569 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14570 rtx mem, rtx value, rtx model_rtx)
14572 machine_mode mode = GET_MODE (mem);
14573 machine_mode wmode = (mode == DImode ? DImode : SImode);
14574 const bool short_mode = (mode < SImode);
14575 aarch64_atomic_load_op_code ldop_code;
14576 rtx src;
14577 rtx x;
14579 if (out_data)
14580 out_data = gen_lowpart (mode, out_data);
14582 if (out_result)
14583 out_result = gen_lowpart (mode, out_result);
14585 /* Make sure the value is in a register, putting it into a destination
14586 register if it needs to be manipulated. */
14587 if (!register_operand (value, mode)
14588 || code == AND || code == MINUS)
14590 src = out_result ? out_result : out_data;
14591 emit_move_insn (src, gen_lowpart (mode, value));
14593 else
14594 src = value;
14595 gcc_assert (register_operand (src, mode));
14597 /* Preprocess the data for the operation as necessary. If the operation is
14598 a SET then emit a swap instruction and finish. */
14599 switch (code)
14601 case SET:
14602 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14603 return;
14605 case MINUS:
14606 /* Negate the value and treat it as a PLUS. */
14608 rtx neg_src;
14610 /* Resize the value if necessary. */
14611 if (short_mode)
14612 src = gen_lowpart (wmode, src);
14614 neg_src = gen_rtx_NEG (wmode, src);
14615 emit_insn (gen_rtx_SET (src, neg_src));
14617 if (short_mode)
14618 src = gen_lowpart (mode, src);
14620 /* Fall-through. */
14621 case PLUS:
14622 ldop_code = AARCH64_LDOP_PLUS;
14623 break;
14625 case IOR:
14626 ldop_code = AARCH64_LDOP_OR;
14627 break;
14629 case XOR:
14630 ldop_code = AARCH64_LDOP_XOR;
14631 break;
14633 case AND:
14635 rtx not_src;
14637 /* Resize the value if necessary. */
14638 if (short_mode)
14639 src = gen_lowpart (wmode, src);
14641 not_src = gen_rtx_NOT (wmode, src);
14642 emit_insn (gen_rtx_SET (src, not_src));
14644 if (short_mode)
14645 src = gen_lowpart (mode, src);
14647 ldop_code = AARCH64_LDOP_BIC;
14648 break;
14650 default:
14651 /* The operation can't be done with atomic instructions. */
14652 gcc_unreachable ();
14655 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14657 /* If necessary, calculate the data in memory after the update by redoing the
14658 operation from values in registers. */
14659 if (!out_result)
14660 return;
14662 if (short_mode)
14664 src = gen_lowpart (wmode, src);
14665 out_data = gen_lowpart (wmode, out_data);
14666 out_result = gen_lowpart (wmode, out_result);
14669 x = NULL_RTX;
14671 switch (code)
14673 case MINUS:
14674 case PLUS:
14675 x = gen_rtx_PLUS (wmode, out_data, src);
14676 break;
14677 case IOR:
14678 x = gen_rtx_IOR (wmode, out_data, src);
14679 break;
14680 case XOR:
14681 x = gen_rtx_XOR (wmode, out_data, src);
14682 break;
14683 case AND:
14684 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14685 return;
14686 default:
14687 gcc_unreachable ();
14690 emit_set_insn (out_result, x);
14692 return;
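/* As a concrete illustration of the mapping above, assuming the
   load-operate patterns expand to the LSE LDADD/LDCLR/LDEOR/LDSET
   instructions: an atomic fetch-and-sub of 5 becomes an LDADD of -5,
   and an atomic fetch-and-and with mask M becomes an LDCLR of ~M.
   When the caller also wants the post-update value it is recomputed
   from registers afterwards: an add for PLUS/MINUS, or a BIC with the
   already-complemented mask for AND.  */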
14695 /* Split an atomic operation. */
14697 void
14698 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14699 rtx value, rtx model_rtx, rtx cond)
14701 machine_mode mode = GET_MODE (mem);
14702 machine_mode wmode = (mode == DImode ? DImode : SImode);
14703 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14704 const bool is_sync = is_mm_sync (model);
14705 rtx_code_label *label;
14706 rtx x;
14708 /* Split the atomic operation into a sequence. */
14709 label = gen_label_rtx ();
14710 emit_label (label);
14712 if (new_out)
14713 new_out = gen_lowpart (wmode, new_out);
14714 if (old_out)
14715 old_out = gen_lowpart (wmode, old_out);
14716 else
14717 old_out = new_out;
14718 value = simplify_gen_subreg (wmode, value, mode, 0);
14720 /* The initial load can be relaxed for a __sync operation since a final
14721 barrier will be emitted to stop code hoisting. */
14722 if (is_sync)
14723 aarch64_emit_load_exclusive (mode, old_out, mem,
14724 GEN_INT (MEMMODEL_RELAXED));
14725 else
14726 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14728 switch (code)
14730 case SET:
14731 new_out = value;
14732 break;
14734 case NOT:
14735 x = gen_rtx_AND (wmode, old_out, value);
14736 emit_insn (gen_rtx_SET (new_out, x));
14737 x = gen_rtx_NOT (wmode, new_out);
14738 emit_insn (gen_rtx_SET (new_out, x));
14739 break;
14741 case MINUS:
14742 if (CONST_INT_P (value))
14744 value = GEN_INT (-INTVAL (value));
14745 code = PLUS;
14747 /* Fall through. */
14749 default:
14750 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14751 emit_insn (gen_rtx_SET (new_out, x));
14752 break;
14755 aarch64_emit_store_exclusive (mode, cond, mem,
14756 gen_lowpart (mode, new_out), model_rtx);
14758 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14759 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14760 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14761 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14763 /* Emit any final barrier needed for a __sync operation. */
14764 if (is_sync)
14765 aarch64_emit_post_barrier (model);
14768 static void
14769 aarch64_init_libfuncs (void)
14771 /* Half-precision float operations. The compiler handles all operations
14772 with NULL libfuncs by converting to SFmode. */
14774 /* Conversions. */
14775 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14776 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14778 /* Arithmetic. */
14779 set_optab_libfunc (add_optab, HFmode, NULL);
14780 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14781 set_optab_libfunc (smul_optab, HFmode, NULL);
14782 set_optab_libfunc (neg_optab, HFmode, NULL);
14783 set_optab_libfunc (sub_optab, HFmode, NULL);
14785 /* Comparisons. */
14786 set_optab_libfunc (eq_optab, HFmode, NULL);
14787 set_optab_libfunc (ne_optab, HFmode, NULL);
14788 set_optab_libfunc (lt_optab, HFmode, NULL);
14789 set_optab_libfunc (le_optab, HFmode, NULL);
14790 set_optab_libfunc (ge_optab, HFmode, NULL);
14791 set_optab_libfunc (gt_optab, HFmode, NULL);
14792 set_optab_libfunc (unord_optab, HFmode, NULL);
14795 /* Target hook for c_mode_for_suffix. */
14796 static machine_mode
14797 aarch64_c_mode_for_suffix (char suffix)
14799 if (suffix == 'q')
14800 return TFmode;
14802 return VOIDmode;
14805 /* We can only represent floating point constants which will fit in
14806 "quarter-precision" values. These values are characterised by
14807    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given a format looking like:
14810 (-1)^s * (n/16) * 2^r
14812 Where:
14813 's' is the sign bit.
14814 'n' is an integer in the range 16 <= n <= 31.
14815 'r' is an integer in the range -3 <= r <= 4. */
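/* For example, 0.5 is representable as (16/16) * 2^-1 and 1.25 as
   (20/16) * 2^0, whereas 0.1 has no such encoding.  The representable
   magnitudes therefore run from (16/16) * 2^-3 == 0.125 up to
   (31/16) * 2^4 == 31.0, i.e. the familiar FMOV-immediate range.  */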
14817 /* Return true iff X can be represented as a quarter-precision
14818    floating point immediate operand. Note, we cannot represent 0.0. */
14819 bool
14820 aarch64_float_const_representable_p (rtx x)
14822 /* This represents our current view of how many bits
14823 make up the mantissa. */
14824 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14825 int exponent;
14826 unsigned HOST_WIDE_INT mantissa, mask;
14827 REAL_VALUE_TYPE r, m;
14828 bool fail;
14830 if (!CONST_DOUBLE_P (x))
14831 return false;
14833 /* We don't support HFmode constants yet. */
14834 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14835 return false;
14837 r = *CONST_DOUBLE_REAL_VALUE (x);
14839 /* We cannot represent infinities, NaNs or +/-zero. We won't
14840 know if we have +zero until we analyse the mantissa, but we
14841 can reject the other invalid values. */
14842 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14843 || REAL_VALUE_MINUS_ZERO (r))
14844 return false;
14846 /* Extract exponent. */
14847 r = real_value_abs (&r);
14848 exponent = REAL_EXP (&r);
14850 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14851 highest (sign) bit, with a fixed binary point at bit point_pos.
14852 m1 holds the low part of the mantissa, m2 the high part.
14853 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14854 bits for the mantissa, this can fail (low bits will be lost). */
14855 real_ldexp (&m, &r, point_pos - exponent);
14856 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14858 /* If the low part of the mantissa has bits set we cannot represent
14859 the value. */
14860 if (w.ulow () != 0)
14861 return false;
14862 /* We have rejected the lower HOST_WIDE_INT, so update our
14863 understanding of how many bits lie in the mantissa and
14864 look only at the high HOST_WIDE_INT. */
14865 mantissa = w.elt (1);
14866 point_pos -= HOST_BITS_PER_WIDE_INT;
14868 /* We can only represent values with a mantissa of the form 1.xxxx. */
14869 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14870 if ((mantissa & mask) != 0)
14871 return false;
14873 /* Having filtered unrepresentable values, we may now remove all
14874 but the highest 5 bits. */
14875 mantissa >>= point_pos - 5;
14877 /* We cannot represent the value 0.0, so reject it. This is handled
14878 elsewhere. */
14879 if (mantissa == 0)
14880 return false;
14882 /* Then, as bit 4 is always set, we can mask it off, leaving
14883 the mantissa in the range [0, 15]. */
14884 mantissa &= ~(1 << 4);
14885 gcc_assert (mantissa <= 15);
14887 /* GCC internally does not use IEEE754-like encoding (where normalized
14888 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14889 Our mantissa values are shifted 4 places to the left relative to
14890 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14891 by 5 places to correct for GCC's representation. */
14892 exponent = 5 - exponent;
14894 return (exponent >= 0 && exponent <= 7);
14897 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14898 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14899 output MOVI/MVNI, ORR or BIC immediate. */
14900 char*
14901 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14902 enum simd_immediate_check which)
14904 bool is_valid;
14905 static char templ[40];
14906 const char *mnemonic;
14907 const char *shift_op;
14908 unsigned int lane_count = 0;
14909 char element_char;
14911 struct simd_immediate_info info;
14913 /* This will return true to show const_vector is legal for use as either
14914    an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14915 It will also update INFO to show how the immediate should be generated.
14916 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14917 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14918 gcc_assert (is_valid);
14920 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14921 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14923 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14925 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14926 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14927 move immediate path. */
14928 if (aarch64_float_const_zero_rtx_p (info.value))
14929 info.value = GEN_INT (0);
14930 else
14932 const unsigned int buf_size = 20;
14933 char float_buf[buf_size] = {'\0'};
14934 real_to_decimal_for_mode (float_buf,
14935 CONST_DOUBLE_REAL_VALUE (info.value),
14936 buf_size, buf_size, 1, info.elt_mode);
14938 if (lane_count == 1)
14939 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14940 else
14941 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14942 lane_count, element_char, float_buf);
14943 return templ;
14947 gcc_assert (CONST_INT_P (info.value));
14949 if (which == AARCH64_CHECK_MOV)
14951 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14952 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14953 if (lane_count == 1)
14954 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14955 mnemonic, UINTVAL (info.value));
14956 else if (info.shift)
14957 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14958 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14959 element_char, UINTVAL (info.value), shift_op, info.shift);
14960 else
14961 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14962 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14963 element_char, UINTVAL (info.value));
14965 else
14967 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14968 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14969 if (info.shift)
14970 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14971 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14972 element_char, UINTVAL (info.value), "lsl", info.shift);
14973 else
14974 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14975 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14976 element_char, UINTVAL (info.value));
14978 return templ;
14981 char*
14982 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14985 /* If a floating point number was passed and we desire to use it in an
14986    integer mode, do the conversion to integer.
14987 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14989 unsigned HOST_WIDE_INT ival;
14990 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14991 gcc_unreachable ();
14992 immediate = gen_int_mode (ival, mode);
14995 machine_mode vmode;
14996 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
14997    a 128-bit vector mode. */
14998 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15000 vmode = aarch64_simd_container_mode (mode, width);
15001 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15002 return aarch64_output_simd_mov_immediate (v_op, width);
15005 /* Return the output string to use for moving immediate CONST_VECTOR
15006 into an SVE register. */
15008 char *
15009 aarch64_output_sve_mov_immediate (rtx const_vector)
15011 static char templ[40];
15012 struct simd_immediate_info info;
15013 char element_char;
15015 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15016 gcc_assert (is_valid);
15018 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15020 if (info.step)
15022 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15023 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15024 element_char, INTVAL (info.value), INTVAL (info.step));
15025 return templ;
15028 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15030 if (aarch64_float_const_zero_rtx_p (info.value))
15031 info.value = GEN_INT (0);
15032 else
15034 const int buf_size = 20;
15035 char float_buf[buf_size] = {};
15036 real_to_decimal_for_mode (float_buf,
15037 CONST_DOUBLE_REAL_VALUE (info.value),
15038 buf_size, buf_size, 1, info.elt_mode);
15040 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15041 element_char, float_buf);
15042 return templ;
15046 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15047 element_char, INTVAL (info.value));
15048 return templ;
15051 /* Return the asm format for a PTRUE instruction whose destination has
15052 mode MODE. SUFFIX is the element size suffix. */
15054 char *
15055 aarch64_output_ptrue (machine_mode mode, char suffix)
15057 unsigned int nunits;
15058 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15059 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15060 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15061 else
15062 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15063 return buf;
15066 /* Split operands into moves from op[1] + op[2] into op[0]. */
15068 void
15069 aarch64_split_combinev16qi (rtx operands[3])
15071 unsigned int dest = REGNO (operands[0]);
15072 unsigned int src1 = REGNO (operands[1]);
15073 unsigned int src2 = REGNO (operands[2]);
15074 machine_mode halfmode = GET_MODE (operands[1]);
15075 unsigned int halfregs = REG_NREGS (operands[1]);
15076 rtx destlo, desthi;
15078 gcc_assert (halfmode == V16QImode);
15080 if (src1 == dest && src2 == dest + halfregs)
15082 /* No-op move. Can't split to nothing; emit something. */
15083 emit_note (NOTE_INSN_DELETED);
15084 return;
15087 /* Preserve register attributes for variable tracking. */
15088 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15089 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15090 GET_MODE_SIZE (halfmode));
15092 /* Special case of reversed high/low parts. */
15093 if (reg_overlap_mentioned_p (operands[2], destlo)
15094 && reg_overlap_mentioned_p (operands[1], desthi))
15096 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15097 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15098 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15100 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15102 /* Try to avoid unnecessary moves if part of the result
15103 is in the right place already. */
15104 if (src1 != dest)
15105 emit_move_insn (destlo, operands[1]);
15106 if (src2 != dest + halfregs)
15107 emit_move_insn (desthi, operands[2]);
15109 else
15111 if (src2 != dest + halfregs)
15112 emit_move_insn (desthi, operands[2]);
15113 if (src1 != dest)
15114 emit_move_insn (destlo, operands[1]);
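/* The three XORs above are the classic scratch-free swap: after
   a ^= b; b ^= a; a ^= b; the two V16QI halves have exchanged their
   contents without needing a spare vector register.  */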
15118 /* vec_perm support. */
15120 struct expand_vec_perm_d
15122 rtx target, op0, op1;
15123 vec_perm_indices perm;
15124 machine_mode vmode;
15125 unsigned int vec_flags;
15126 bool one_vector_p;
15127 bool testing_p;
15130 /* Generate a variable permutation. */
15132 static void
15133 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15135 machine_mode vmode = GET_MODE (target);
15136 bool one_vector_p = rtx_equal_p (op0, op1);
15138 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15139 gcc_checking_assert (GET_MODE (op0) == vmode);
15140 gcc_checking_assert (GET_MODE (op1) == vmode);
15141 gcc_checking_assert (GET_MODE (sel) == vmode);
15142 gcc_checking_assert (TARGET_SIMD);
15144 if (one_vector_p)
15146 if (vmode == V8QImode)
15148 /* Expand the argument to a V16QI mode by duplicating it. */
15149 rtx pair = gen_reg_rtx (V16QImode);
15150 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15151 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15153 else
15155 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15158 else
15160 rtx pair;
15162 if (vmode == V8QImode)
15164 pair = gen_reg_rtx (V16QImode);
15165 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15166 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15168 else
15170 pair = gen_reg_rtx (OImode);
15171 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15172 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15177 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15178 NELT is the number of elements in the vector. */
15180 void
15181 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15182 unsigned int nelt)
15184 machine_mode vmode = GET_MODE (target);
15185 bool one_vector_p = rtx_equal_p (op0, op1);
15186 rtx mask;
15188 /* The TBL instruction does not use a modulo index, so we must take care
15189 of that ourselves. */
15190 mask = aarch64_simd_gen_const_vector_dup (vmode,
15191 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15192 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15194 /* For big-endian, we also need to reverse the index within the vector
15195 (but not which vector). */
15196 if (BYTES_BIG_ENDIAN)
15198 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15199 if (!one_vector_p)
15200 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15201 sel = expand_simple_binop (vmode, XOR, sel, mask,
15202 NULL, 0, OPTAB_LIB_WIDEN);
15204 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
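/* A worked example of the masking above: for a two-vector V8QI permute
   each selector byte is ANDed with 15, so TBL's behaviour of returning
   zero for out-of-range indices is never triggered; on big-endian the
   byte is additionally XORed with 7, which reverses the lane numbering
   within each input vector without changing which vector is selected.  */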
15207 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15209 static void
15210 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15212 emit_insn (gen_rtx_SET (target,
15213 gen_rtx_UNSPEC (GET_MODE (target),
15214 gen_rtvec (2, op0, op1), code)));
15217 /* Expand an SVE vec_perm with the given operands. */
15219 void
15220 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15222 machine_mode data_mode = GET_MODE (target);
15223 machine_mode sel_mode = GET_MODE (sel);
15224 /* Enforced by the pattern condition. */
15225 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15227 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15228 size of the two value vectors, i.e. the upper bits of the indices
15229 are effectively ignored. SVE TBL instead produces 0 for any
15230 out-of-range indices, so we need to modulo all the vec_perm indices
15231 to ensure they are all in range. */
15232 rtx sel_reg = force_reg (sel_mode, sel);
15234 /* Check if the sel only references the first values vector. */
15235 if (GET_CODE (sel) == CONST_VECTOR
15236 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15238 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15239 return;
15242 /* Check if the two values vectors are the same. */
15243 if (rtx_equal_p (op0, op1))
15245 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15246 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15247 NULL, 0, OPTAB_DIRECT);
15248 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15249 return;
15252 /* Run TBL on each value vector and combine the results. */
15254 rtx res0 = gen_reg_rtx (data_mode);
15255 rtx res1 = gen_reg_rtx (data_mode);
15256 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15257 if (GET_CODE (sel) != CONST_VECTOR
15258 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15260 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15261 2 * nunits - 1);
15262 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15263 NULL, 0, OPTAB_DIRECT);
15265 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15266 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15267 NULL, 0, OPTAB_DIRECT);
15268 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15269 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15270 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15271 else
15272 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
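/* As a sketch of the general case above: with N elements per vector the
   selector is ANDed with 2N - 1 unless it is already known to be in
   range, one TBL picks the lanes of OP0 that are selected by indices
   0..N-1, a second TBL uses (selector - N) to pick the lanes belonging
   to OP1 (out-of-range indices yield zero), and the two partial results
   are then ORed together.  */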
15275 /* Recognize patterns suitable for the TRN instructions. */
15276 static bool
15277 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15279 HOST_WIDE_INT odd;
15280 poly_uint64 nelt = d->perm.length ();
15281 rtx out, in0, in1, x;
15282 machine_mode vmode = d->vmode;
15284 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15285 return false;
15287 /* Note that these are little-endian tests.
15288 We correct for big-endian later. */
15289 if (!d->perm[0].is_constant (&odd)
15290 || (odd != 0 && odd != 1)
15291 || !d->perm.series_p (0, 2, odd, 2)
15292 || !d->perm.series_p (1, 2, nelt + odd, 2))
15293 return false;
15295 /* Success! */
15296 if (d->testing_p)
15297 return true;
15299 in0 = d->op0;
15300 in1 = d->op1;
15301 /* We don't need a big-endian lane correction for SVE; see the comment
15302 at the head of aarch64-sve.md for details. */
15303 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15305 x = in0, in0 = in1, in1 = x;
15306 odd = !odd;
15308 out = d->target;
15310 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15311 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15312 return true;
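/* For example, on V4SImode inputs {a0 a1 a2 a3} and {b0 b1 b2 b3}
   (little-endian numbering), the selector {0, 4, 2, 6} is recognized
   here as TRN1, giving {a0 b0 a2 b2}, and {1, 5, 3, 7} as TRN2, giving
   {a1 b1 a3 b3}.  */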
15315 /* Recognize patterns suitable for the UZP instructions. */
15316 static bool
15317 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15319 HOST_WIDE_INT odd;
15320 rtx out, in0, in1, x;
15321 machine_mode vmode = d->vmode;
15323 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15324 return false;
15326 /* Note that these are little-endian tests.
15327 We correct for big-endian later. */
15328 if (!d->perm[0].is_constant (&odd)
15329 || (odd != 0 && odd != 1)
15330 || !d->perm.series_p (0, 1, odd, 2))
15331 return false;
15333 /* Success! */
15334 if (d->testing_p)
15335 return true;
15337 in0 = d->op0;
15338 in1 = d->op1;
15339 /* We don't need a big-endian lane correction for SVE; see the comment
15340 at the head of aarch64-sve.md for details. */
15341 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15343 x = in0, in0 = in1, in1 = x;
15344 odd = !odd;
15346 out = d->target;
15348 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15349 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15350 return true;
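/* For example, on V4SImode inputs {a0 a1 a2 a3} and {b0 b1 b2 b3}, the
   selector {0, 2, 4, 6} is recognized here as UZP1, giving
   {a0 a2 b0 b2}, and {1, 3, 5, 7} as UZP2, giving {a1 a3 b1 b3}.  */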
15353 /* Recognize patterns suitable for the ZIP instructions. */
15354 static bool
15355 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15357 unsigned int high;
15358 poly_uint64 nelt = d->perm.length ();
15359 rtx out, in0, in1, x;
15360 machine_mode vmode = d->vmode;
15362 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15363 return false;
15365 /* Note that these are little-endian tests.
15366 We correct for big-endian later. */
15367 poly_uint64 first = d->perm[0];
15368 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15369 || !d->perm.series_p (0, 2, first, 1)
15370 || !d->perm.series_p (1, 2, first + nelt, 1))
15371 return false;
15372 high = maybe_ne (first, 0U);
15374 /* Success! */
15375 if (d->testing_p)
15376 return true;
15378 in0 = d->op0;
15379 in1 = d->op1;
15380 /* We don't need a big-endian lane correction for SVE; see the comment
15381 at the head of aarch64-sve.md for details. */
15382 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15384 x = in0, in0 = in1, in1 = x;
15385 high = !high;
15387 out = d->target;
15389 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15390 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15391 return true;
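/* An illustrative example (little-endian lane numbering assumed): for
   V4SImode, first must be 0 or nelt/2 == 2, so { 0, 4, 1, 5 } interleaves
   the low halves and becomes ZIP1, while { 2, 6, 3, 7 } becomes ZIP2:

     zip1  v0.4s, v1.4s, v2.4s   // v0 = { v1[0], v2[0], v1[1], v2[1] }
     zip2  v0.4s, v1.4s, v2.4s   // v0 = { v1[2], v2[2], v1[3], v2[3] }  */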
15394 /* Recognize patterns for the EXT insn. */
15396 static bool
15397 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15399 HOST_WIDE_INT location;
15400 rtx offset;
15402 /* The first element always refers to the first vector.
15403 Check if the extracted indices are increasing by one. */
15404 if (d->vec_flags == VEC_SVE_PRED
15405 || !d->perm[0].is_constant (&location)
15406 || !d->perm.series_p (0, 1, location, 1))
15407 return false;
15409 /* Success! */
15410 if (d->testing_p)
15411 return true;
15413 /* The case where (location == 0) is a no-op for both big- and little-endian,
15414 and is removed by the mid-end at optimization levels -O1 and higher.
15416 We don't need a big-endian lane correction for SVE; see the comment
15417 at the head of aarch64-sve.md for details. */
15418 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15420 /* After setup, we want the high elements of the first vector (stored
15421 at the LSB end of the register), and the low elements of the second
15422 vector (stored at the MSB end of the register). So swap. */
15423 std::swap (d->op0, d->op1);
15424 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15425 to_constant () is safe since this is restricted to Advanced SIMD
15426 vectors. */
15427 location = d->perm.length ().to_constant () - location;
15430 offset = GEN_INT (location);
15431 emit_set_insn (d->target,
15432 gen_rtx_UNSPEC (d->vmode,
15433 gen_rtvec (3, d->op0, d->op1, offset),
15434 UNSPEC_EXT));
15435 return true;
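/* A worked example (little-endian, illustrative only): for V4SImode the
   selector { 1, 2, 3, 4 } has location == 1 and selects
   { op0[1], op0[2], op0[3], op1[0] }, i.e. an EXT starting one element
   (4 bytes) into the op1:op0 concatenation:

     ext  v0.16b, v1.16b, v2.16b, #4  */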
15438 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15439 within each 64-bit, 32-bit or 16-bit granule. */
15441 static bool
15442 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15444 HOST_WIDE_INT diff;
15445 unsigned int i, size, unspec;
15446 machine_mode pred_mode;
15448 if (d->vec_flags == VEC_SVE_PRED
15449 || !d->one_vector_p
15450 || !d->perm[0].is_constant (&diff))
15451 return false;
15453 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15454 if (size == 8)
15456 unspec = UNSPEC_REV64;
15457 pred_mode = VNx2BImode;
15459 else if (size == 4)
15461 unspec = UNSPEC_REV32;
15462 pred_mode = VNx4BImode;
15464 else if (size == 2)
15466 unspec = UNSPEC_REV16;
15467 pred_mode = VNx8BImode;
15469 else
15470 return false;
15472 unsigned int step = diff + 1;
15473 for (i = 0; i < step; ++i)
15474 if (!d->perm.series_p (i, step, diff - i, step))
15475 return false;
15477 /* Success! */
15478 if (d->testing_p)
15479 return true;
15481 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15482 if (d->vec_flags == VEC_SVE_DATA)
15484 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15485 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15486 UNSPEC_MERGE_PTRUE);
15488 emit_set_insn (d->target, src);
15489 return true;
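/* A worked example (illustrative only): for V8HImode the selector
   { 3, 2, 1, 0, 7, 6, 5, 4 } gives diff == 3, so size == (3 + 1) * 2 == 8
   and the permute reverses the halfwords within each 64-bit granule:

     rev64  v0.8h, v1.8h  */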
15492 /* Recognize patterns for the REV insn, which reverses elements within
15493 a full vector. */
15495 static bool
15496 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15498 poly_uint64 nelt = d->perm.length ();
15500 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15501 return false;
15503 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15504 return false;
15506 /* Success! */
15507 if (d->testing_p)
15508 return true;
15510 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15511 emit_set_insn (d->target, src);
15512 return true;
15515 static bool
15516 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15518 rtx out = d->target;
15519 rtx in0;
15520 HOST_WIDE_INT elt;
15521 machine_mode vmode = d->vmode;
15522 rtx lane;
15524 if (d->vec_flags == VEC_SVE_PRED
15525 || d->perm.encoding ().encoded_nelts () != 1
15526 || !d->perm[0].is_constant (&elt))
15527 return false;
15529 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 / GET_MODE_UNIT_SIZE (vmode))
15530 return false;
15532 /* Success! */
15533 if (d->testing_p)
15534 return true;
15536 /* The generic preparation in aarch64_expand_vec_perm_const_1
15537 swaps the operand order and the permute indices if it finds
15538 d->perm[0] to be in the second operand. Thus, we can always
15539 use d->op0 and need not do any extra arithmetic to get the
15540 correct lane number. */
15541 in0 = d->op0;
15542 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15544 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15545 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15546 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15547 return true;
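/* A small illustrative example: for V4SImode a broadcast of lane 1 is
   encoded as a single repeated selector element { 1, 1, 1, 1 }, so
   encoded_nelts () == 1 and elt == 1, giving:

     dup  v0.4s, v1.s[1]  */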
15550 static bool
15551 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15553 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15554 machine_mode vmode = d->vmode;
15556 /* Make sure that the indices are constant. */
15557 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15558 for (unsigned int i = 0; i < encoded_nelts; ++i)
15559 if (!d->perm[i].is_constant ())
15560 return false;
15562 if (d->testing_p)
15563 return true;
15565 /* Generic code will try constant permutation twice. Once with the
15566 original mode and again with the elements lowered to QImode.
15567 So wait and don't do the selector expansion ourselves. */
15568 if (vmode != V8QImode && vmode != V16QImode)
15569 return false;
15571 /* to_constant is safe since this routine is specific to Advanced SIMD
15572 vectors. */
15573 unsigned int nelt = d->perm.length ().to_constant ();
15574 for (unsigned int i = 0; i < nelt; ++i)
15575 /* If big-endian and two vectors we end up with a weird mixed-endian
15576 mode on NEON. Reverse the index within each word but not the word
15577 itself. to_constant is safe because we checked is_constant above. */
15578 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15579 ? d->perm[i].to_constant () ^ (nelt - 1)
15580 : d->perm[i].to_constant ());
15582 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15583 sel = force_reg (vmode, sel);
15585 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15586 return true;
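/* An illustrative note on the big-endian index fixup above: for a
   two-vector V16QImode permute, nelt == 16, so a selector value of 3
   becomes 3 ^ 15 == 12.  The XOR only touches the low four bits, so it
   reverses the byte index within its 16-byte input vector while leaving
   the choice of input vector unchanged.  */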
15589 /* Try to implement D using an SVE TBL instruction. */
15591 static bool
15592 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15594 unsigned HOST_WIDE_INT nelt;
15596 /* Permuting two variable-length vectors could overflow the
15597 index range. */
15598 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15599 return false;
15601 if (d->testing_p)
15602 return true;
15604 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15605 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15606 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15607 return true;
15610 static bool
15611 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15613 /* The pattern matching functions above are written to look for a small
15614 number to begin the sequence (0, 1, N/2). If we begin with an index
15615 from the second operand, we can swap the operands. */
15616 poly_int64 nelt = d->perm.length ();
15617 if (known_ge (d->perm[0], nelt))
15619 d->perm.rotate_inputs (1);
15620 std::swap (d->op0, d->op1);
15623 if ((d->vec_flags == VEC_ADVSIMD
15624 || d->vec_flags == VEC_SVE_DATA
15625 || d->vec_flags == VEC_SVE_PRED)
15626 && known_gt (nelt, 1))
15628 if (aarch64_evpc_rev_local (d))
15629 return true;
15630 else if (aarch64_evpc_rev_global (d))
15631 return true;
15632 else if (aarch64_evpc_ext (d))
15633 return true;
15634 else if (aarch64_evpc_dup (d))
15635 return true;
15636 else if (aarch64_evpc_zip (d))
15637 return true;
15638 else if (aarch64_evpc_uzp (d))
15639 return true;
15640 else if (aarch64_evpc_trn (d))
15641 return true;
15642 if (d->vec_flags == VEC_SVE_DATA)
15643 return aarch64_evpc_sve_tbl (d);
15644 else if (d->vec_flags == VEC_ADVSIMD)
15645 return aarch64_evpc_tbl (d);
15647 return false;
15650 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15652 static bool
15653 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15654 rtx op1, const vec_perm_indices &sel)
15656 struct expand_vec_perm_d d;
15658 /* Check whether the mask can be applied to a single vector. */
15659 if (op0 && rtx_equal_p (op0, op1))
15660 d.one_vector_p = true;
15661 else if (sel.all_from_input_p (0))
15663 d.one_vector_p = true;
15664 op1 = op0;
15666 else if (sel.all_from_input_p (1))
15668 d.one_vector_p = true;
15669 op0 = op1;
15671 else
15672 d.one_vector_p = false;
15674 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15675 sel.nelts_per_input ());
15676 d.vmode = vmode;
15677 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15678 d.target = target;
15679 d.op0 = op0;
15680 d.op1 = op1;
15681 d.testing_p = !target;
15683 if (!d.testing_p)
15684 return aarch64_expand_vec_perm_const_1 (&d);
15686 rtx_insn *last = get_last_insn ();
15687 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15688 gcc_assert (last == get_last_insn ());
15690 return ret;
15693 /* Generate a byte permute mask for a register of mode MODE,
15694 which has NUNITS units. */
15697 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15699 /* We have to reverse each vector because we don't have
15700 a permuted load that can reverse-load according to ABI rules. */
15701 rtx mask;
15702 rtvec v = rtvec_alloc (16);
15703 unsigned int i, j;
15704 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15706 gcc_assert (BYTES_BIG_ENDIAN);
15707 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15709 for (i = 0; i < nunits; i++)
15710 for (j = 0; j < usize; j++)
15711 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15712 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15713 return force_reg (V16QImode, mask);
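/* A worked example (illustrative only): for V8HImode, usize == 2 and
   nunits == 8, so the mask built above is the byte sequence
   { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, i.e. the two
   bytes of each halfword are swapped while the halfwords themselves stay
   in place.  */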
15716 /* Return true if X is a valid second operand for the SVE instruction
15717 that implements integer comparison OP_CODE. */
15719 static bool
15720 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15722 if (register_operand (x, VOIDmode))
15723 return true;
15725 switch (op_code)
15727 case LTU:
15728 case LEU:
15729 case GEU:
15730 case GTU:
15731 return aarch64_sve_cmp_immediate_p (x, false);
15732 case LT:
15733 case LE:
15734 case GE:
15735 case GT:
15736 case NE:
15737 case EQ:
15738 return aarch64_sve_cmp_immediate_p (x, true);
15739 default:
15740 gcc_unreachable ();
15744 /* Use predicated SVE instructions to implement the equivalent of:
15746 (set TARGET OP)
15748 given that PTRUE is an all-true predicate of the appropriate mode. */
15750 static void
15751 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15753 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15754 gen_rtvec (2, ptrue, op),
15755 UNSPEC_MERGE_PTRUE);
15756 rtx_insn *insn = emit_set_insn (target, unspec);
15757 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15760 /* Likewise, but also clobber the condition codes. */
15762 static void
15763 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15765 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15766 gen_rtvec (2, ptrue, op),
15767 UNSPEC_MERGE_PTRUE);
15768 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15769 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15772 /* Return the UNSPEC_COND_* code for comparison CODE. */
15774 static unsigned int
15775 aarch64_unspec_cond_code (rtx_code code)
15777 switch (code)
15779 case NE:
15780 return UNSPEC_COND_NE;
15781 case EQ:
15782 return UNSPEC_COND_EQ;
15783 case LT:
15784 return UNSPEC_COND_LT;
15785 case GT:
15786 return UNSPEC_COND_GT;
15787 case LE:
15788 return UNSPEC_COND_LE;
15789 case GE:
15790 return UNSPEC_COND_GE;
15791 default:
15792 gcc_unreachable ();
15796 /* Emit:
15798 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15800 where <X> is the operation associated with comparison CODE. This form
15801 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15802 semantics, such as when PRED might not be all-true and when comparing
15803 inactive lanes could have side effects. */
15805 static void
15806 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15807 rtx pred, rtx op0, rtx op1)
15809 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15810 gen_rtvec (3, pred, op0, op1),
15811 aarch64_unspec_cond_code (code));
15812 emit_set_insn (target, unspec);
15815 /* Expand an SVE integer comparison using the SVE equivalent of:
15817 (set TARGET (CODE OP0 OP1)). */
15819 void
15820 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15822 machine_mode pred_mode = GET_MODE (target);
15823 machine_mode data_mode = GET_MODE (op0);
15825 if (!aarch64_sve_cmp_operand_p (code, op1))
15826 op1 = force_reg (data_mode, op1);
15828 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15829 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15830 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15833 /* Emit the SVE equivalent of:
15835 (set TMP1 (CODE1 OP0 OP1))
15836 (set TMP2 (CODE2 OP0 OP1))
15837 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15839 PTRUE is an all-true predicate with the same mode as TARGET. */
15841 static void
15842 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15843 rtx ptrue, rtx op0, rtx op1)
15845 machine_mode pred_mode = GET_MODE (ptrue);
15846 rtx tmp1 = gen_reg_rtx (pred_mode);
15847 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15848 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15849 rtx tmp2 = gen_reg_rtx (pred_mode);
15850 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15851 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15852 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15855 /* Emit the SVE equivalent of:
15857 (set TMP (CODE OP0 OP1))
15858 (set TARGET (not TMP))
15860 PTRUE is an all-true predicate with the same mode as TARGET. */
15862 static void
15863 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15864 rtx op0, rtx op1)
15866 machine_mode pred_mode = GET_MODE (ptrue);
15867 rtx tmp = gen_reg_rtx (pred_mode);
15868 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15869 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15870 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15873 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15875 (set TARGET (CODE OP0 OP1))
15877 If CAN_INVERT_P is true, the caller can also handle inverted results;
15878 return true if the result is in fact inverted. */
15880 bool
15881 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15882 rtx op0, rtx op1, bool can_invert_p)
15884 machine_mode pred_mode = GET_MODE (target);
15885 machine_mode data_mode = GET_MODE (op0);
15887 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15888 switch (code)
15890 case UNORDERED:
15891 /* UNORDERED has no immediate form. */
15892 op1 = force_reg (data_mode, op1);
15893 /* fall through */
15894 case LT:
15895 case LE:
15896 case GT:
15897 case GE:
15898 case EQ:
15899 case NE:
15901 /* There is native support for the comparison. */
15902 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15903 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15904 return false;
15907 case LTGT:
15908 /* This is a trapping operation (LT or GT). */
15909 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15910 return false;
15912 case UNEQ:
15913 if (!flag_trapping_math)
15915 /* This would trap for signaling NaNs. */
15916 op1 = force_reg (data_mode, op1);
15917 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15918 return false;
15920 /* fall through */
15921 case UNLT:
15922 case UNLE:
15923 case UNGT:
15924 case UNGE:
15925 if (flag_trapping_math)
15927 /* Work out which elements are ordered. */
15928 rtx ordered = gen_reg_rtx (pred_mode);
15929 op1 = force_reg (data_mode, op1);
15930 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15932 /* Test the opposite condition for the ordered elements,
15933 then invert the result. */
15934 if (code == UNEQ)
15935 code = NE;
15936 else
15937 code = reverse_condition_maybe_unordered (code);
15938 if (can_invert_p)
15940 aarch64_emit_sve_predicated_cond (target, code,
15941 ordered, op0, op1);
15942 return true;
15944 rtx tmp = gen_reg_rtx (pred_mode);
15945 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15946 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15947 return false;
15949 break;
15951 case ORDERED:
15952 /* ORDERED has no immediate form. */
15953 op1 = force_reg (data_mode, op1);
15954 break;
15956 default:
15957 gcc_unreachable ();
15960 /* There is native support for the inverse comparison. */
15961 code = reverse_condition_maybe_unordered (code);
15962 if (can_invert_p)
15964 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15965 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15966 return true;
15968 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15969 return false;
15972 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15973 of the data being selected and CMP_MODE is the mode of the values being
15974 compared. */
15976 void
15977 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15978 rtx *ops)
15980 machine_mode pred_mode
15981 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15982 GET_MODE_SIZE (cmp_mode)).require ();
15983 rtx pred = gen_reg_rtx (pred_mode);
15984 if (FLOAT_MODE_P (cmp_mode))
15986 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15987 ops[4], ops[5], true))
15988 std::swap (ops[1], ops[2]);
15990 else
15991 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15993 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15994 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15997 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15998 true. However due to issues with register allocation it is preferable
15999 to avoid tying integer scalar and FP scalar modes. Executing integer
16000 operations in general registers is better than treating them as scalar
16001 vector operations. This reduces latency and avoids redundant int<->FP
16002 moves. So tie modes if they are either the same class, or vector modes
16003 with other vector modes, vector structs or any scalar mode. */
16005 static bool
16006 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16008 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16009 return true;
16011 /* We specifically want to allow elements of "structure" modes to
16012 be tieable to the structure. This more general condition allows
16013 other rarer situations too. The reason we don't extend this to
16014 predicate modes is that there are no predicate structure modes
16015 nor any specific instructions for extracting part of a predicate
16016 register. */
16017 if (aarch64_vector_data_mode_p (mode1)
16018 && aarch64_vector_data_mode_p (mode2))
16019 return true;
16021 /* Also allow any scalar modes with vectors. */
16022 if (aarch64_vector_mode_supported_p (mode1)
16023 || aarch64_vector_mode_supported_p (mode2))
16024 return true;
16026 return false;
16029 /* Return a new RTX holding the result of moving POINTER forward by
16030 AMOUNT bytes. */
16032 static rtx
16033 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16035 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16037 return adjust_automodify_address (pointer, GET_MODE (pointer),
16038 next, amount);
16041 /* Return a new RTX holding the result of moving POINTER forward by the
16042 size of the mode it points to. */
16044 static rtx
16045 aarch64_progress_pointer (rtx pointer)
16047 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16050 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16051 MODE bytes. */
16053 static void
16054 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16055 machine_mode mode)
16057 rtx reg = gen_reg_rtx (mode);
16059 /* "Cast" the pointers to the correct mode. */
16060 *src = adjust_address (*src, mode, 0);
16061 *dst = adjust_address (*dst, mode, 0);
16062 /* Emit the memcpy. */
16063 emit_move_insn (reg, *src);
16064 emit_move_insn (*dst, reg);
16065 /* Move the pointers forward. */
16066 *src = aarch64_progress_pointer (*src);
16067 *dst = aarch64_progress_pointer (*dst);
16070 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16071 we succeed, otherwise return false. */
16073 bool
16074 aarch64_expand_movmem (rtx *operands)
16076 unsigned int n;
16077 rtx dst = operands[0];
16078 rtx src = operands[1];
16079 rtx base;
16080 bool speed_p = !optimize_function_for_size_p (cfun);
16082 /* When optimizing for size, give a better estimate of the length of a
16083 memcpy call, but use the default otherwise. */
16084 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16086 /* We can't do anything smart if the amount to copy is not constant. */
16087 if (!CONST_INT_P (operands[2]))
16088 return false;
16090 n = UINTVAL (operands[2]);
16092 /* Try to keep the number of instructions low. For cases below 16 bytes we
16093 need to make at most two moves. For cases above 16 bytes it will be one
16094 move for each 16 byte chunk, then at most two additional moves. */
16095 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16096 return false;
16098 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16099 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16101 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16102 src = adjust_automodify_address (src, VOIDmode, base, 0);
16104 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16105 1-byte chunk. */
16106 if (n < 4)
16108 if (n >= 2)
16110 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16111 n -= 2;
16114 if (n == 1)
16115 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16117 return true;
16120 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16121 4-byte chunk, partially overlapping with the previously copied chunk. */
16122 if (n < 8)
16124 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16125 n -= 4;
16126 if (n > 0)
16128 int move = n - 4;
16130 src = aarch64_move_pointer (src, move);
16131 dst = aarch64_move_pointer (dst, move);
16132 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16134 return true;
16137 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16138 them, then (if applicable) an 8-byte chunk. */
16139 while (n >= 8)
16141 if (n / 16)
16143 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16144 n -= 16;
16146 else
16148 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16149 n -= 8;
16153 /* Finish the final bytes of the copy. We can always do this in one
16154 instruction. We either copy the exact amount we need, or partially
16155 overlap with the previous chunk we copied and copy 8 bytes. */
16156 if (n == 0)
16157 return true;
16158 else if (n == 1)
16159 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16160 else if (n == 2)
16161 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16162 else if (n == 4)
16163 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16164 else
16166 if (n == 3)
16168 src = aarch64_move_pointer (src, -1);
16169 dst = aarch64_move_pointer (dst, -1);
16170 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16172 else
16174 int move = n - 8;
16176 src = aarch64_move_pointer (src, move);
16177 dst = aarch64_move_pointer (dst, move);
16178 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16182 return true;
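/* A worked example of the logic above (illustrative only): for a 23-byte
   copy when optimizing for speed, max_instructions == 7 and
   23/16 + 2 == 3, so the expansion is accepted.  The loop emits one
   16-byte (TImode) copy leaving n == 7, and the tail then takes the final
   else branch with move == -1, stepping both pointers back by one byte
   and emitting an 8-byte (DImode) copy that overlaps the previous chunk
   by a single byte.  */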
16185 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16186 SImode stores. Handle the case when the constant has identical
16187 bottom and top halves. This is beneficial when the two stores can be
16188 merged into an STP and we avoid synthesising potentially expensive
16189 immediates twice. Return true if such a split is possible. */
16191 bool
16192 aarch64_split_dimode_const_store (rtx dst, rtx src)
16194 rtx lo = gen_lowpart (SImode, src);
16195 rtx hi = gen_highpart_mode (SImode, DImode, src);
16197 bool size_p = optimize_function_for_size_p (cfun);
16199 if (!rtx_equal_p (lo, hi))
16200 return false;
16202 unsigned int orig_cost
16203 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16204 unsigned int lo_cost
16205 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16207 /* We want to transform:
16208 MOV x1, 49370
16209 MOVK x1, 0x140, lsl 16
16210 MOVK x1, 0xc0da, lsl 32
16211 MOVK x1, 0x140, lsl 48
16212 STR x1, [x0]
16213 into:
16214 MOV w1, 49370
16215 MOVK w1, 0x140, lsl 16
16216 STP w1, w1, [x0]
16217 So we want to perform this only when we save two instructions
16218 or more. When optimizing for size, however, accept any code size
16219 savings we can. */
16220 if (size_p && orig_cost <= lo_cost)
16221 return false;
16223 if (!size_p
16224 && (orig_cost <= lo_cost + 1))
16225 return false;
16227 rtx mem_lo = adjust_address (dst, SImode, 0);
16228 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16229 return false;
16231 rtx tmp_reg = gen_reg_rtx (SImode);
16232 aarch64_expand_mov_immediate (tmp_reg, lo);
16233 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16234 /* Don't emit an explicit store pair as this may not be always profitable.
16235 Let the sched-fusion logic decide whether to merge them. */
16236 emit_move_insn (mem_lo, tmp_reg);
16237 emit_move_insn (mem_hi, tmp_reg);
16239 return true;
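/* A worked example of the cost test above (illustrative only): for the
   constant in the comment, the DImode immediate needs 4 MOV/MOVK
   instructions (orig_cost == 4) while the SImode low half needs 2
   (lo_cost == 2), so 4 > 2 + 1 and the split is performed.  By contrast,
   a constant such as 0x0000000100000001 has orig_cost == 2 and
   lo_cost == 1, so 2 <= 1 + 1 rejects the split when optimizing for
   speed.  */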
16242 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16244 static unsigned HOST_WIDE_INT
16245 aarch64_asan_shadow_offset (void)
16247 return (HOST_WIDE_INT_1 << 36);
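/* Background note on the value above (standard ASan behaviour, not
   specific to this file): with the default shadow scale of 3, a shadow
   byte lives at (address >> 3) + (1 << 36), so the shadow region sits at
   a fixed 64GiB offset in the address space.  */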
16250 static rtx
16251 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16252 int code, tree treeop0, tree treeop1)
16254 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16255 rtx op0, op1;
16256 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16257 insn_code icode;
16258 struct expand_operand ops[4];
16260 start_sequence ();
16261 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16263 op_mode = GET_MODE (op0);
16264 if (op_mode == VOIDmode)
16265 op_mode = GET_MODE (op1);
16267 switch (op_mode)
16269 case E_QImode:
16270 case E_HImode:
16271 case E_SImode:
16272 cmp_mode = SImode;
16273 icode = CODE_FOR_cmpsi;
16274 break;
16276 case E_DImode:
16277 cmp_mode = DImode;
16278 icode = CODE_FOR_cmpdi;
16279 break;
16281 case E_SFmode:
16282 cmp_mode = SFmode;
16283 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16284 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16285 break;
16287 case E_DFmode:
16288 cmp_mode = DFmode;
16289 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16290 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16291 break;
16293 default:
16294 end_sequence ();
16295 return NULL_RTX;
16298 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16299 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16300 if (!op0 || !op1)
16302 end_sequence ();
16303 return NULL_RTX;
16305 *prep_seq = get_insns ();
16306 end_sequence ();
16308 create_fixed_operand (&ops[0], op0);
16309 create_fixed_operand (&ops[1], op1);
16311 start_sequence ();
16312 if (!maybe_expand_insn (icode, 2, ops))
16314 end_sequence ();
16315 return NULL_RTX;
16317 *gen_seq = get_insns ();
16318 end_sequence ();
16320 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16321 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16324 static rtx
16325 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16326 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16328 rtx op0, op1, target;
16329 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16330 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16331 insn_code icode;
16332 struct expand_operand ops[6];
16333 int aarch64_cond;
16335 push_to_sequence (*prep_seq);
16336 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16338 op_mode = GET_MODE (op0);
16339 if (op_mode == VOIDmode)
16340 op_mode = GET_MODE (op1);
16342 switch (op_mode)
16344 case E_QImode:
16345 case E_HImode:
16346 case E_SImode:
16347 cmp_mode = SImode;
16348 icode = CODE_FOR_ccmpsi;
16349 break;
16351 case E_DImode:
16352 cmp_mode = DImode;
16353 icode = CODE_FOR_ccmpdi;
16354 break;
16356 case E_SFmode:
16357 cmp_mode = SFmode;
16358 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16359 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16360 break;
16362 case E_DFmode:
16363 cmp_mode = DFmode;
16364 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16365 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16366 break;
16368 default:
16369 end_sequence ();
16370 return NULL_RTX;
16373 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16374 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16375 if (!op0 || !op1)
16377 end_sequence ();
16378 return NULL_RTX;
16380 *prep_seq = get_insns ();
16381 end_sequence ();
16383 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16384 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16386 if (bit_code != AND)
16388 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16389 GET_MODE (XEXP (prev, 0))),
16390 VOIDmode, XEXP (prev, 0), const0_rtx);
16391 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16394 create_fixed_operand (&ops[0], XEXP (prev, 0));
16395 create_fixed_operand (&ops[1], target);
16396 create_fixed_operand (&ops[2], op0);
16397 create_fixed_operand (&ops[3], op1);
16398 create_fixed_operand (&ops[4], prev);
16399 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16401 push_to_sequence (*gen_seq);
16402 if (!maybe_expand_insn (icode, 6, ops))
16404 end_sequence ();
16405 return NULL_RTX;
16408 *gen_seq = get_insns ();
16409 end_sequence ();
16411 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16414 #undef TARGET_GEN_CCMP_FIRST
16415 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16417 #undef TARGET_GEN_CCMP_NEXT
16418 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16420 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16421 instruction fusion of some sort. */
16423 static bool
16424 aarch64_macro_fusion_p (void)
16426 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16430 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16431 should be kept together during scheduling. */
16433 static bool
16434 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16436 rtx set_dest;
16437 rtx prev_set = single_set (prev);
16438 rtx curr_set = single_set (curr);
16439 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16440 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16442 if (!aarch64_macro_fusion_p ())
16443 return false;
16445 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16447 /* We are trying to match:
16448 prev (mov) == (set (reg r0) (const_int imm16))
16449 curr (movk) == (set (zero_extract (reg r0)
16450 (const_int 16)
16451 (const_int 16))
16452 (const_int imm16_1)) */
16454 set_dest = SET_DEST (curr_set);
16456 if (GET_CODE (set_dest) == ZERO_EXTRACT
16457 && CONST_INT_P (SET_SRC (curr_set))
16458 && CONST_INT_P (SET_SRC (prev_set))
16459 && CONST_INT_P (XEXP (set_dest, 2))
16460 && INTVAL (XEXP (set_dest, 2)) == 16
16461 && REG_P (XEXP (set_dest, 0))
16462 && REG_P (SET_DEST (prev_set))
16463 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16465 return true;
16469 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16472 /* We're trying to match:
16473 prev (adrp) == (set (reg r1)
16474 (high (symbol_ref ("SYM"))))
16475 curr (add) == (set (reg r0)
16476 (lo_sum (reg r1)
16477 (symbol_ref ("SYM"))))
16478 Note that r0 need not necessarily be the same as r1, especially
16479 during pre-regalloc scheduling. */
16481 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16482 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16484 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16485 && REG_P (XEXP (SET_SRC (curr_set), 0))
16486 && REGNO (XEXP (SET_SRC (curr_set), 0))
16487 == REGNO (SET_DEST (prev_set))
16488 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16489 XEXP (SET_SRC (curr_set), 1)))
16490 return true;
16494 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16497 /* We're trying to match:
16498 prev (movk) == (set (zero_extract (reg r0)
16499 (const_int 16)
16500 (const_int 32))
16501 (const_int imm16_1))
16502 curr (movk) == (set (zero_extract (reg r0)
16503 (const_int 16)
16504 (const_int 48))
16505 (const_int imm16_2)) */
16507 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16508 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16509 && REG_P (XEXP (SET_DEST (prev_set), 0))
16510 && REG_P (XEXP (SET_DEST (curr_set), 0))
16511 && REGNO (XEXP (SET_DEST (prev_set), 0))
16512 == REGNO (XEXP (SET_DEST (curr_set), 0))
16513 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16514 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16515 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16516 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16517 && CONST_INT_P (SET_SRC (prev_set))
16518 && CONST_INT_P (SET_SRC (curr_set)))
16519 return true;
16522 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16524 /* We're trying to match:
16525 prev (adrp) == (set (reg r0)
16526 (high (symbol_ref ("SYM"))))
16527 curr (ldr) == (set (reg r1)
16528 (mem (lo_sum (reg r0)
16529 (symbol_ref ("SYM")))))
16531 curr (ldr) == (set (reg r1)
16532 (zero_extend (mem
16533 (lo_sum (reg r0)
16534 (symbol_ref ("SYM")))))) */
16535 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16536 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16538 rtx curr_src = SET_SRC (curr_set);
16540 if (GET_CODE (curr_src) == ZERO_EXTEND)
16541 curr_src = XEXP (curr_src, 0);
16543 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16544 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16545 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16546 == REGNO (SET_DEST (prev_set))
16547 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16548 XEXP (SET_SRC (prev_set), 0)))
16549 return true;
16553 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16554 && aarch_crypto_can_dual_issue (prev, curr))
16555 return true;
16557 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16558 && any_condjump_p (curr))
16560 enum attr_type prev_type = get_attr_type (prev);
16562 unsigned int condreg1, condreg2;
16563 rtx cc_reg_1;
16564 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16565 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16567 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16568 && prev
16569 && modified_in_p (cc_reg_1, prev))
16571 /* FIXME: this misses some instructions which are considered simple
16572 arithmetic for ThunderX. Simple shifts are missed here. */
16573 if (prev_type == TYPE_ALUS_SREG
16574 || prev_type == TYPE_ALUS_IMM
16575 || prev_type == TYPE_LOGICS_REG
16576 || prev_type == TYPE_LOGICS_IMM)
16577 return true;
16581 if (prev_set
16582 && curr_set
16583 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16584 && any_condjump_p (curr))
16586 /* We're trying to match:
16587 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16588 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16589 (const_int 0))
16590 (label_ref ("SYM"))
16591 (pc)) */
16592 if (SET_DEST (curr_set) == (pc_rtx)
16593 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16594 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16595 && REG_P (SET_DEST (prev_set))
16596 && REGNO (SET_DEST (prev_set))
16597 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16599 /* Fuse ALU operations followed by conditional branch instruction. */
16600 switch (get_attr_type (prev))
16602 case TYPE_ALU_IMM:
16603 case TYPE_ALU_SREG:
16604 case TYPE_ADC_REG:
16605 case TYPE_ADC_IMM:
16606 case TYPE_ADCS_REG:
16607 case TYPE_ADCS_IMM:
16608 case TYPE_LOGIC_REG:
16609 case TYPE_LOGIC_IMM:
16610 case TYPE_CSEL:
16611 case TYPE_ADR:
16612 case TYPE_MOV_IMM:
16613 case TYPE_SHIFT_REG:
16614 case TYPE_SHIFT_IMM:
16615 case TYPE_BFM:
16616 case TYPE_RBIT:
16617 case TYPE_REV:
16618 case TYPE_EXTEND:
16619 return true;
16621 default:;
16626 return false;
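/* Some illustrative instruction pairs for the fusion cases above (these
   are examples, not an exhaustive list):

     AARCH64_FUSE_MOV_MOVK:    mov  x0, #0x1234
                               movk x0, #0x5678, lsl 16
     AARCH64_FUSE_ADRP_ADD:    adrp x1, sym
                               add  x0, x1, :lo12:sym
     AARCH64_FUSE_ADRP_LDR:    adrp x0, sym
                               ldr  w1, [x0, :lo12:sym]
     AARCH64_FUSE_ALU_BRANCH:  add  w0, w0, #1
                               cbz  w0, label  */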
16629 /* Return true iff the instruction fusion described by OP is enabled. */
16631 bool
16632 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16634 return (aarch64_tune_params.fusible_ops & op) != 0;
16637 /* If MEM is in the form [base+offset], extract the two parts of the
16638 address into BASE and OFFSET; otherwise return false after clearing
16639 BASE and OFFSET. */
16641 bool
16642 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16644 rtx addr;
16646 gcc_assert (MEM_P (mem));
16648 addr = XEXP (mem, 0);
16650 if (REG_P (addr))
16652 *base = addr;
16653 *offset = const0_rtx;
16654 return true;
16657 if (GET_CODE (addr) == PLUS
16658 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16660 *base = XEXP (addr, 0);
16661 *offset = XEXP (addr, 1);
16662 return true;
16665 *base = NULL_RTX;
16666 *offset = NULL_RTX;
16668 return false;
16671 /* Types for scheduling fusion. */
16672 enum sched_fusion_type
16674 SCHED_FUSION_NONE = 0,
16675 SCHED_FUSION_LD_SIGN_EXTEND,
16676 SCHED_FUSION_LD_ZERO_EXTEND,
16677 SCHED_FUSION_LD,
16678 SCHED_FUSION_ST,
16679 SCHED_FUSION_NUM
16682 /* If INSN is a load or store whose address is in the form [base+offset],
16683 extract the two parts into BASE and OFFSET. Return the scheduling
16684 fusion type of INSN. */
16686 static enum sched_fusion_type
16687 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16689 rtx x, dest, src;
16690 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16692 gcc_assert (INSN_P (insn));
16693 x = PATTERN (insn);
16694 if (GET_CODE (x) != SET)
16695 return SCHED_FUSION_NONE;
16697 src = SET_SRC (x);
16698 dest = SET_DEST (x);
16700 machine_mode dest_mode = GET_MODE (dest);
16702 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16703 return SCHED_FUSION_NONE;
16705 if (GET_CODE (src) == SIGN_EXTEND)
16707 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16708 src = XEXP (src, 0);
16709 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16710 return SCHED_FUSION_NONE;
16712 else if (GET_CODE (src) == ZERO_EXTEND)
16714 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16715 src = XEXP (src, 0);
16716 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16717 return SCHED_FUSION_NONE;
16720 if (GET_CODE (src) == MEM && REG_P (dest))
16721 extract_base_offset_in_addr (src, base, offset);
16722 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16724 fusion = SCHED_FUSION_ST;
16725 extract_base_offset_in_addr (dest, base, offset);
16727 else
16728 return SCHED_FUSION_NONE;
16730 if (*base == NULL_RTX || *offset == NULL_RTX)
16731 fusion = SCHED_FUSION_NONE;
16733 return fusion;
16736 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16738 Currently we only support fusing ldr or str instructions, so FUSION_PRI
16739 and PRI are only calculated for these instructions. For other instructions,
16740 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16741 kinds of instruction fusion can be added by returning different priorities.
16743 It's important that irrelevant instructions get the largest FUSION_PRI. */
16745 static void
16746 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16747 int *fusion_pri, int *pri)
16749 int tmp, off_val;
16750 rtx base, offset;
16751 enum sched_fusion_type fusion;
16753 gcc_assert (INSN_P (insn));
16755 tmp = max_pri - 1;
16756 fusion = fusion_load_store (insn, &base, &offset);
16757 if (fusion == SCHED_FUSION_NONE)
16759 *pri = tmp;
16760 *fusion_pri = tmp;
16761 return;
16764 /* Set FUSION_PRI according to fusion type and base register. */
16765 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16767 /* Calculate PRI. */
16768 tmp /= 2;
16770 /* INSN with smaller offset goes first. */
16771 off_val = (int)(INTVAL (offset));
16772 if (off_val >= 0)
16773 tmp -= (off_val & 0xfffff);
16774 else
16775 tmp += ((- off_val) & 0xfffff);
16777 *pri = tmp;
16778 return;
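/* A brief example of the effect (illustrative only): two stores
   str w1, [x2, 4] and str w3, [x2, 8] are both SCHED_FUSION_ST with the
   same base, so they receive the same FUSION_PRI; their PRI values then
   differ only by the offset term, so the store at offset 4 is scheduled
   first and the pair ends up adjacent for the ldp/stp peepholes.  */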
16781 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16782 Adjust priority of sha1h instructions so they are scheduled before
16783 other SHA1 instructions. */
16785 static int
16786 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16788 rtx x = PATTERN (insn);
16790 if (GET_CODE (x) == SET)
16792 x = SET_SRC (x);
16794 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16795 return priority + 10;
16798 return priority;
16801 /* Given OPERANDS of consecutive load/store, check if we can merge
16802 them into ldp/stp. LOAD is true if they are load instructions.
16803 MODE is the mode of memory operands. */
16805 bool
16806 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16807 machine_mode mode)
16809 HOST_WIDE_INT offval_1, offval_2, msize;
16810 enum reg_class rclass_1, rclass_2;
16811 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16813 if (load)
16815 mem_1 = operands[1];
16816 mem_2 = operands[3];
16817 reg_1 = operands[0];
16818 reg_2 = operands[2];
16819 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16820 if (REGNO (reg_1) == REGNO (reg_2))
16821 return false;
16823 else
16825 mem_1 = operands[0];
16826 mem_2 = operands[2];
16827 reg_1 = operands[1];
16828 reg_2 = operands[3];
16831 /* The mems cannot be volatile. */
16832 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16833 return false;
16835 /* If we have SImode and slow unaligned ldp,
16836 check that the alignment is at least 8 bytes. */
16837 if (mode == SImode
16838 && (aarch64_tune_params.extra_tuning_flags
16839 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16840 && !optimize_size
16841 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16842 return false;
16844 /* Check if the addresses are in the form of [base+offset]. */
16845 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16846 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16847 return false;
16848 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16849 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16850 return false;
16852 /* Check if the bases are same. */
16853 if (!rtx_equal_p (base_1, base_2))
16854 return false;
16856 /* The operands must be of the same size. */
16857 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16858 GET_MODE_SIZE (GET_MODE (mem_2))));
16860 offval_1 = INTVAL (offset_1);
16861 offval_2 = INTVAL (offset_2);
16862 /* We should only be trying this for fixed-sized modes. There is no
16863 SVE LDP/STP instruction. */
16864 msize = GET_MODE_SIZE (mode).to_constant ();
16865 /* Check if the offsets are consecutive. */
16866 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16867 return false;
16869 /* Check if the addresses are clobbered by load. */
16870 if (load)
16872 if (reg_mentioned_p (reg_1, mem_1))
16873 return false;
16875 /* In increasing order, the last load can clobber the address. */
16876 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16877 return false;
16880 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16881 rclass_1 = FP_REGS;
16882 else
16883 rclass_1 = GENERAL_REGS;
16885 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16886 rclass_2 = FP_REGS;
16887 else
16888 rclass_2 = GENERAL_REGS;
16890 /* Check if the registers are of same class. */
16891 if (rclass_1 != rclass_2)
16892 return false;
16894 return true;
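/* An illustrative example of a pair accepted by the checks above:

     ldr  x0, [x3, 8]
     ldr  x1, [x3, 16]

   Same non-volatile base, offsets differing by the access size, and both
   destinations in GENERAL_REGS, so the peepholes can rewrite the pair as
   ldp x0, x1, [x3, 8].  A pair such as ldr x0, [x3, 8] / ldr x1, [x3, 24]
   would be rejected because the offsets are not consecutive.  */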
16897 /* Given OPERANDS of consecutive load/store, check if we can merge
16898 them into ldp/stp by adjusting the offset. LOAD is true if they
16899 are load instructions. MODE is the mode of memory operands.
16901 Given below consecutive stores:
16903 str w1, [xb, 0x100]
16904 str w1, [xb, 0x104]
16905 str w1, [xb, 0x108]
16906 str w1, [xb, 0x10c]
16908 Though the offsets are out of the range supported by stp, we can
16909 still pair them after adjusting the offset, like:
16911 add scratch, xb, 0x100
16912 stp w1, w1, [scratch]
16913 stp w1, w1, [scratch, 0x8]
16915 The peephole patterns detecting this opportunity should guarantee
16916 the scratch register is available. */
16918 bool
16919 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16920 scalar_mode mode)
16922 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16923 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16924 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16925 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16927 if (load)
16929 reg_1 = operands[0];
16930 mem_1 = operands[1];
16931 reg_2 = operands[2];
16932 mem_2 = operands[3];
16933 reg_3 = operands[4];
16934 mem_3 = operands[5];
16935 reg_4 = operands[6];
16936 mem_4 = operands[7];
16937 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16938 && REG_P (reg_3) && REG_P (reg_4));
16939 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16940 return false;
16942 else
16944 mem_1 = operands[0];
16945 reg_1 = operands[1];
16946 mem_2 = operands[2];
16947 reg_2 = operands[3];
16948 mem_3 = operands[4];
16949 reg_3 = operands[5];
16950 mem_4 = operands[6];
16951 reg_4 = operands[7];
16953 /* Skip if the memory operand is by itself valid for ldp/stp. */
16954 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16955 return false;
16957 /* The mems cannot be volatile. */
16958 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16959 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
16960 return false;
16962 /* Check if the addresses are in the form of [base+offset]. */
16963 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16964 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16965 return false;
16966 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16967 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16968 return false;
16969 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16970 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16971 return false;
16972 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16973 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16974 return false;
16976 /* Check if the bases are same. */
16977 if (!rtx_equal_p (base_1, base_2)
16978 || !rtx_equal_p (base_2, base_3)
16979 || !rtx_equal_p (base_3, base_4))
16980 return false;
16982 offval_1 = INTVAL (offset_1);
16983 offval_2 = INTVAL (offset_2);
16984 offval_3 = INTVAL (offset_3);
16985 offval_4 = INTVAL (offset_4);
16986 msize = GET_MODE_SIZE (mode);
16987 /* Check if the offsets are consecutive. */
16988 if ((offval_1 != (offval_2 + msize)
16989 || offval_1 != (offval_3 + msize * 2)
16990 || offval_1 != (offval_4 + msize * 3))
16991 && (offval_4 != (offval_3 + msize)
16992 || offval_4 != (offval_2 + msize * 2)
16993 || offval_4 != (offval_1 + msize * 3)))
16994 return false;
16996 /* Check if the addresses are clobbered by load. */
16997 if (load)
16999 if (reg_mentioned_p (reg_1, mem_1)
17000 || reg_mentioned_p (reg_2, mem_2)
17001 || reg_mentioned_p (reg_3, mem_3))
17002 return false;
17004 /* In increasing order, the last load can clobber the address. */
17005 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
17006 return false;
17009 /* If we have SImode and slow unaligned ldp,
17010 check that the alignment is at least 8 bytes. */
17011 if (mode == SImode
17012 && (aarch64_tune_params.extra_tuning_flags
17013 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17014 && !optimize_size
17015 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17016 return false;
17018 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17019 rclass_1 = FP_REGS;
17020 else
17021 rclass_1 = GENERAL_REGS;
17023 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17024 rclass_2 = FP_REGS;
17025 else
17026 rclass_2 = GENERAL_REGS;
17028 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17029 rclass_3 = FP_REGS;
17030 else
17031 rclass_3 = GENERAL_REGS;
17033 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17034 rclass_4 = FP_REGS;
17035 else
17036 rclass_4 = GENERAL_REGS;
17038 /* Check if the registers are of same class. */
17039 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17040 return false;
17042 return true;
17045 /* Given OPERANDS of consecutive load/store, this function pairs them
17046 into ldp/stp after adjusting the offset. It depends on the fact
17047 that addresses of load/store instructions are in increasing order.
17048 MODE is the mode of memory operands. CODE is the rtl operator
17049 which should be applied to all memory operands, it's SIGN_EXTEND,
17050 ZERO_EXTEND or UNKNOWN. */
17052 bool
17053 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17054 scalar_mode mode, RTX_CODE code)
17056 rtx base, offset, t1, t2;
17057 rtx mem_1, mem_2, mem_3, mem_4;
17058 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
17060 if (load)
17062 mem_1 = operands[1];
17063 mem_2 = operands[3];
17064 mem_3 = operands[5];
17065 mem_4 = operands[7];
17067 else
17069 mem_1 = operands[0];
17070 mem_2 = operands[2];
17071 mem_3 = operands[4];
17072 mem_4 = operands[6];
17073 gcc_assert (code == UNKNOWN);
17076 extract_base_offset_in_addr (mem_1, &base, &offset);
17077 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
17079 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
17080 msize = GET_MODE_SIZE (mode);
17081 stp_off_limit = msize * 0x40;
17082 off_val = INTVAL (offset);
17083 abs_off = (off_val < 0) ? -off_val : off_val;
17084 new_off = abs_off % stp_off_limit;
17085 adj_off = abs_off - new_off;
17087 /* Further adjust to make sure all offsets are OK. */
17088 if ((new_off + msize * 2) >= stp_off_limit)
17090 adj_off += stp_off_limit;
17091 new_off -= stp_off_limit;
17094 /* Make sure the adjustment can be done with ADD/SUB instructions. */
17095 if (adj_off >= 0x1000)
17096 return false;
17098 if (off_val < 0)
17100 adj_off = -adj_off;
17101 new_off = -new_off;
17104 /* Create new memory references. */
17105 mem_1 = change_address (mem_1, VOIDmode,
17106 plus_constant (DImode, operands[8], new_off));
17108 /* Check if the adjusted address is OK for ldp/stp. */
17109 if (!aarch64_mem_pair_operand (mem_1, mode))
17110 return false;
17112 msize = GET_MODE_SIZE (mode);
17113 mem_2 = change_address (mem_2, VOIDmode,
17114 plus_constant (DImode,
17115 operands[8],
17116 new_off + msize));
17117 mem_3 = change_address (mem_3, VOIDmode,
17118 plus_constant (DImode,
17119 operands[8],
17120 new_off + msize * 2));
17121 mem_4 = change_address (mem_4, VOIDmode,
17122 plus_constant (DImode,
17123 operands[8],
17124 new_off + msize * 3));
17126 if (code == ZERO_EXTEND)
17128 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17129 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17130 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17131 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17133 else if (code == SIGN_EXTEND)
17135 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17136 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17137 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17138 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17141 if (load)
17143 operands[1] = mem_1;
17144 operands[3] = mem_2;
17145 operands[5] = mem_3;
17146 operands[7] = mem_4;
17148 else
17150 operands[0] = mem_1;
17151 operands[2] = mem_2;
17152 operands[4] = mem_3;
17153 operands[6] = mem_4;
17156 /* Emit adjusting instruction. */
17157 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17158 /* Emit ldp/stp instructions. */
17159 t1 = gen_rtx_SET (operands[0], operands[1]);
17160 t2 = gen_rtx_SET (operands[2], operands[3]);
17161 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17162 t1 = gen_rtx_SET (operands[4], operands[5]);
17163 t2 = gen_rtx_SET (operands[6], operands[7]);
17164 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17165 return true;
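/* A worked example of the offset arithmetic above, using the SImode
   stores at 0x100..0x10c from the earlier comment (illustrative only):
   msize == 4, so stp_off_limit == 0x100; off_val == 0x100 gives
   new_off == 0 and adj_off == 0x100, which fits an ADD immediate, so we
   emit:

     add scratch, xb, 0x100
     stp w1, w1, [scratch]
     stp w1, w1, [scratch, 8]  */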
17168 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17169 it isn't worth branching around empty masked ops (including masked
17170 stores). */
17172 static bool
17173 aarch64_empty_mask_is_expensive (unsigned)
17175 return false;
17178 /* Return true if a pseudo register should be created and used to hold
17179 the GOT address for PIC code. */
17181 bool
17182 aarch64_use_pseudo_pic_reg (void)
17184 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17187 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17189 static int
17190 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17192 switch (XINT (x, 1))
17194 case UNSPEC_GOTSMALLPIC:
17195 case UNSPEC_GOTSMALLPIC28K:
17196 case UNSPEC_GOTTINYPIC:
17197 return 0;
17198 default:
17199 break;
17202 return default_unspec_may_trap_p (x, flags);
17206 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17207 return the log2 of that value. Otherwise return -1. */
17210 aarch64_fpconst_pow_of_2 (rtx x)
17212 const REAL_VALUE_TYPE *r;
17214 if (!CONST_DOUBLE_P (x))
17215 return -1;
17217 r = CONST_DOUBLE_REAL_VALUE (x);
17219 if (REAL_VALUE_NEGATIVE (*r)
17220 || REAL_VALUE_ISNAN (*r)
17221 || REAL_VALUE_ISINF (*r)
17222 || !real_isinteger (r, DFmode))
17223 return -1;
17225 return exact_log2 (real_to_integer (r));
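/* A few illustrative values: 4.0 -> 2, 1.0 -> 0, 0.5 -> -1 (not an
   integer), 3.0 -> -1 (not a power of 2) and -2.0 -> -1 (negative).  */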
17228 /* If X is a vector of equal CONST_DOUBLE values and that value is
17229 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17232 aarch64_vec_fpconst_pow_of_2 (rtx x)
17234 int nelts;
17235 if (GET_CODE (x) != CONST_VECTOR
17236 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17237 return -1;
17239 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17240 return -1;
17242 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17243 if (firstval <= 0)
17244 return -1;
17246 for (int i = 1; i < nelts; i++)
17247 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17248 return -1;
17250 return firstval;
17253 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17254 to float.
17256 __fp16 always promotes through this hook.
17257 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17258 through the generic excess precision logic rather than here. */
17260 static tree
17261 aarch64_promoted_type (const_tree t)
17263 if (SCALAR_FLOAT_TYPE_P (t)
17264 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17265 return float_type_node;
17267 return NULL_TREE;
17270 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17272 static bool
17273 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17274 optimization_type opt_type)
17276 switch (op)
17278 case rsqrt_optab:
17279 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17281 default:
17282 return true;
17286 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17288 static unsigned int
17289 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17290 int *offset)
17292 /* Polynomial invariant 1 == (VG / 2) - 1. */
17293 gcc_assert (i == 1);
17294 *factor = 2;
17295 *offset = 1;
17296 return AARCH64_DWARF_VG;
17299 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17300 if MODE is HFmode, and punt to the generic implementation otherwise. */
17302 static bool
17303 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17305 return (mode == HFmode
17306 ? true
17307 : default_libgcc_floating_mode_supported_p (mode));
17310 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17311 if MODE is HFmode, and punt to the generic implementation otherwise. */
17313 static bool
17314 aarch64_scalar_mode_supported_p (scalar_mode mode)
17316 return (mode == HFmode
17317 ? true
17318 : default_scalar_mode_supported_p (mode));
17321 /* Set the value of FLT_EVAL_METHOD.
17322 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17324 0: evaluate all operations and constants, whose semantic type has at
17325 most the range and precision of type float, to the range and
17326 precision of float; evaluate all other operations and constants to
17327 the range and precision of the semantic type;
17329 N, where _FloatN is a supported interchange floating type
17330 evaluate all operations and constants, whose semantic type has at
17331 most the range and precision of _FloatN type, to the range and
17332 precision of the _FloatN type; evaluate all other operations and
17333 constants to the range and precision of the semantic type;
17335 If we have the ARMv8.2-A extensions then we support _Float16 in native
17336 precision, so we should set this to 16. Otherwise, we support the type,
17337 but want to evaluate expressions in float precision, so set this to
17338 0. */
17340 static enum flt_eval_method
17341 aarch64_excess_precision (enum excess_precision_type type)
17342 {
17343 switch (type)
17344 {
17345 case EXCESS_PRECISION_TYPE_FAST:
17346 case EXCESS_PRECISION_TYPE_STANDARD:
17347 /* We can calculate either in 16-bit range and precision or
17348 32-bit range and precision. Make that decision based on whether
17349 we have native support for the ARMv8.2-A 16-bit floating-point
17350 instructions or not. */
17351 return (TARGET_FP_F16INST
17352 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17353 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17354 case EXCESS_PRECISION_TYPE_IMPLICIT:
17355 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17356 default:
17357 gcc_unreachable ();
17358 }
17359 return FLT_EVAL_METHOD_UNPREDICTABLE;
17360 }
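/* Illustrative example (not part of the original source): given the
   hypothetical

     _Float16 x, y, z;
     _Float16 fma16 (void) { return x * y + z; }

   the arithmetic stays in _Float16 range and precision
   (FLT_EVAL_METHOD == 16) when the ARMv8.2-A FP16 instructions are
   available, e.g. with -march=armv8.2-a+fp16; otherwise it is evaluated
   in float and rounded back to _Float16 on return.  */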
17362 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17363 scheduled for speculative execution. Reject the long-running division
17364 and square-root instructions. */
17366 static bool
17367 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17368 {
17369 switch (get_attr_type (insn))
17370 {
17371 case TYPE_SDIV:
17372 case TYPE_UDIV:
17373 case TYPE_FDIVS:
17374 case TYPE_FDIVD:
17375 case TYPE_FSQRTS:
17376 case TYPE_FSQRTD:
17377 case TYPE_NEON_FP_SQRT_S:
17378 case TYPE_NEON_FP_SQRT_D:
17379 case TYPE_NEON_FP_SQRT_S_Q:
17380 case TYPE_NEON_FP_SQRT_D_Q:
17381 case TYPE_NEON_FP_DIV_S:
17382 case TYPE_NEON_FP_DIV_D:
17383 case TYPE_NEON_FP_DIV_S_Q:
17384 case TYPE_NEON_FP_DIV_D_Q:
17385 return false;
17386 default:
17387 return true;
17388 }
17389 }
17391 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17393 static int
17394 aarch64_compute_pressure_classes (reg_class *classes)
17395 {
17396 int i = 0;
17397 classes[i++] = GENERAL_REGS;
17398 classes[i++] = FP_REGS;
17399 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17400 registers need to go in PR_LO_REGS at some point during their
17401 lifetime. Splitting it into two halves has the effect of making
17402 all predicates count against PR_LO_REGS, so that we try whenever
17403 possible to restrict the number of live predicates to 8. This
17404 greatly reduces the amount of spilling in certain loops. */
17405 classes[i++] = PR_LO_REGS;
17406 classes[i++] = PR_HI_REGS;
17407 return i;
17408 }
17410 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17412 static bool
17413 aarch64_can_change_mode_class (machine_mode from,
17414 machine_mode to, reg_class_t)
17415 {
17416 if (BYTES_BIG_ENDIAN)
17417 {
17418 bool from_sve_p = aarch64_sve_data_mode_p (from);
17419 bool to_sve_p = aarch64_sve_data_mode_p (to);
17421 /* Don't allow changes between SVE data modes and non-SVE modes.
17422 See the comment at the head of aarch64-sve.md for details. */
17423 if (from_sve_p != to_sve_p)
17424 return false;
17426 /* Don't allow changes in element size: lane 0 of the new vector
17427 would not then be lane 0 of the old vector. See the comment
17428 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17429 description.
17431 In the worst case, this forces a register to be spilled in
17432 one mode and reloaded in the other, which handles the
17433 endianness correctly. */
17434 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17435 return false;
17436 }
17437 return true;
17438 }
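/* Illustrative example (not part of the original source): on a big-endian
   target this rejects, for instance, reinterpreting a VNx4SI value as
   VNx8HI (32-bit vs 16-bit elements), because the register and memory
   layouts of SVE vectors disagree about element order in that case;
   spilling in one mode and reloading in the other performs the reordering
   that a plain subreg cannot express.  */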
17440 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
17442 static void
17443 aarch64_select_early_remat_modes (sbitmap modes)
17444 {
17445 /* SVE values are not normally live across a call, so it should be
17446 worth doing early rematerialization even in VL-specific mode. */
17447 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17448 {
17449 machine_mode mode = (machine_mode) i;
17450 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17451 if (vec_flags & VEC_ANY_SVE)
17452 bitmap_set_bit (modes, i);
17453 }
17454 }
17456 /* Target-specific selftests. */
17458 #if CHECKING_P
17460 namespace selftest {
17462 /* Selftest for the RTL loader.
17463 Verify that the RTL loader copes with a dump from
17464 print_rtx_function. This is essentially just a test that class
17465 function_reader can handle a real dump, but it also verifies
17466 that lookup_reg_by_dump_name correctly handles hard regs.
17467 The presence of hard reg names in the dump means that the test is
17468 target-specific, hence it is in this file. */
17470 static void
17471 aarch64_test_loading_full_dump ()
17472 {
17473 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17475 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17477 rtx_insn *insn_1 = get_insn_by_uid (1);
17478 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17480 rtx_insn *insn_15 = get_insn_by_uid (15);
17481 ASSERT_EQ (INSN, GET_CODE (insn_15));
17482 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17484 /* Verify crtl->return_rtx. */
17485 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17486 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17487 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17488 }
17490 /* Run all target-specific selftests. */
17492 static void
17493 aarch64_run_selftests (void)
17494 {
17495 aarch64_test_loading_full_dump ();
17496 }
17498 } // namespace selftest
17500 #endif /* #if CHECKING_P */
17502 #undef TARGET_ADDRESS_COST
17503 #define TARGET_ADDRESS_COST aarch64_address_cost
17505 /* This hook determines whether unnamed bitfields affect the alignment
17506 of the containing structure. The hook returns true if the structure
17507 should inherit the alignment requirements of an unnamed bitfield's
17508 type. */
17509 #undef TARGET_ALIGN_ANON_BITFIELD
17510 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
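/* Illustrative example (not part of the original source): returning true
   means the declared type of an unnamed bit-field still takes part in the
   struct's alignment, as the AAPCS64 layout rules expect.  For the
   hypothetical

     struct s { char c; long long : 1; };

   the unnamed long long bit-field is expected to raise the alignment of
   struct s to 8 bytes even though no named member needs it.  */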
17512 #undef TARGET_ASM_ALIGNED_DI_OP
17513 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17515 #undef TARGET_ASM_ALIGNED_HI_OP
17516 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17518 #undef TARGET_ASM_ALIGNED_SI_OP
17519 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17521 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17522 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17523 hook_bool_const_tree_hwi_hwi_const_tree_true
17525 #undef TARGET_ASM_FILE_START
17526 #define TARGET_ASM_FILE_START aarch64_start_file
17528 #undef TARGET_ASM_OUTPUT_MI_THUNK
17529 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17531 #undef TARGET_ASM_SELECT_RTX_SECTION
17532 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17534 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17535 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17537 #undef TARGET_BUILD_BUILTIN_VA_LIST
17538 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17540 #undef TARGET_CALLEE_COPIES
17541 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17543 #undef TARGET_CAN_ELIMINATE
17544 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17546 #undef TARGET_CAN_INLINE_P
17547 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17549 #undef TARGET_CANNOT_FORCE_CONST_MEM
17550 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17552 #undef TARGET_CASE_VALUES_THRESHOLD
17553 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17555 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17556 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17558 /* Only the least significant bit is used for initialization guard
17559 variables. */
17560 #undef TARGET_CXX_GUARD_MASK_BIT
17561 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
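/* Illustrative sketch (not part of the original source): with only bit 0
   significant, the fast-path test emitted for a function-local static is
   conceptually

     if ((guard & 1) == 0 && __cxa_guard_acquire (&guard))
       {
         // run the constructor
         __cxa_guard_release (&guard);
       }

   rather than a check of the guard variable's whole first byte.  */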
17563 #undef TARGET_C_MODE_FOR_SUFFIX
17564 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17566 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17567 #undef TARGET_DEFAULT_TARGET_FLAGS
17568 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17569 #endif
17571 #undef TARGET_CLASS_MAX_NREGS
17572 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17574 #undef TARGET_BUILTIN_DECL
17575 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17577 #undef TARGET_BUILTIN_RECIPROCAL
17578 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17580 #undef TARGET_C_EXCESS_PRECISION
17581 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17583 #undef TARGET_EXPAND_BUILTIN
17584 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17586 #undef TARGET_EXPAND_BUILTIN_VA_START
17587 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17589 #undef TARGET_FOLD_BUILTIN
17590 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17592 #undef TARGET_FUNCTION_ARG
17593 #define TARGET_FUNCTION_ARG aarch64_function_arg
17595 #undef TARGET_FUNCTION_ARG_ADVANCE
17596 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17598 #undef TARGET_FUNCTION_ARG_BOUNDARY
17599 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17601 #undef TARGET_FUNCTION_ARG_PADDING
17602 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17604 #undef TARGET_GET_RAW_RESULT_MODE
17605 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17606 #undef TARGET_GET_RAW_ARG_MODE
17607 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17609 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17610 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17612 #undef TARGET_FUNCTION_VALUE
17613 #define TARGET_FUNCTION_VALUE aarch64_function_value
17615 #undef TARGET_FUNCTION_VALUE_REGNO_P
17616 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17618 #undef TARGET_GIMPLE_FOLD_BUILTIN
17619 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17621 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17622 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17624 #undef TARGET_INIT_BUILTINS
17625 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17627 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17628 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17629 aarch64_ira_change_pseudo_allocno_class
17631 #undef TARGET_LEGITIMATE_ADDRESS_P
17632 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17634 #undef TARGET_LEGITIMATE_CONSTANT_P
17635 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17637 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17638 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17639 aarch64_legitimize_address_displacement
17641 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17642 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17644 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17645 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17646 aarch64_libgcc_floating_mode_supported_p
17648 #undef TARGET_MANGLE_TYPE
17649 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17651 #undef TARGET_MEMORY_MOVE_COST
17652 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17654 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17655 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17657 #undef TARGET_MUST_PASS_IN_STACK
17658 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17660 /* This target hook should return true if accesses to volatile bitfields
17661 should use the narrowest mode possible. It should return false if these
17662 accesses should use the bitfield container type. */
17663 #undef TARGET_NARROW_VOLATILE_BITFIELD
17664 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
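/* Illustrative example (not part of the original source): because this
   returns false, a volatile bit-field is accessed through its container
   type.  For a hypothetical device structure

     struct dev { volatile unsigned int ready : 1, error : 1; };

   reading the ready field is done with a 32-bit load of the containing
   unsigned int, not with the narrowest (byte-sized) access that would
   cover the bit-field.  */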
17666 #undef TARGET_OPTION_OVERRIDE
17667 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17669 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17670 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17671 aarch64_override_options_after_change
17673 #undef TARGET_OPTION_SAVE
17674 #define TARGET_OPTION_SAVE aarch64_option_save
17676 #undef TARGET_OPTION_RESTORE
17677 #define TARGET_OPTION_RESTORE aarch64_option_restore
17679 #undef TARGET_OPTION_PRINT
17680 #define TARGET_OPTION_PRINT aarch64_option_print
17682 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17683 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17685 #undef TARGET_SET_CURRENT_FUNCTION
17686 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17688 #undef TARGET_PASS_BY_REFERENCE
17689 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17691 #undef TARGET_PREFERRED_RELOAD_CLASS
17692 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17694 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17695 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17697 #undef TARGET_PROMOTED_TYPE
17698 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17700 #undef TARGET_SECONDARY_RELOAD
17701 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17703 #undef TARGET_SHIFT_TRUNCATION_MASK
17704 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17706 #undef TARGET_SETUP_INCOMING_VARARGS
17707 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17709 #undef TARGET_STRUCT_VALUE_RTX
17710 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17712 #undef TARGET_REGISTER_MOVE_COST
17713 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17715 #undef TARGET_RETURN_IN_MEMORY
17716 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17718 #undef TARGET_RETURN_IN_MSB
17719 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17721 #undef TARGET_RTX_COSTS
17722 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17724 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17725 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17727 #undef TARGET_SCHED_ISSUE_RATE
17728 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17730 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17731 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17732 aarch64_sched_first_cycle_multipass_dfa_lookahead
17734 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17735 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17736 aarch64_first_cycle_multipass_dfa_lookahead_guard
17738 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17739 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17740 aarch64_get_separate_components
17742 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17743 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17744 aarch64_components_for_bb
17746 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17747 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17748 aarch64_disqualify_components
17750 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17751 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17752 aarch64_emit_prologue_components
17754 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17755 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17756 aarch64_emit_epilogue_components
17758 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17759 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17760 aarch64_set_handled_components
17762 #undef TARGET_TRAMPOLINE_INIT
17763 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17765 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17766 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17768 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17769 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17771 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17772 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17773 aarch64_builtin_support_vector_misalignment
17775 #undef TARGET_ARRAY_MODE
17776 #define TARGET_ARRAY_MODE aarch64_array_mode
17778 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17779 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17781 #undef TARGET_VECTORIZE_ADD_STMT_COST
17782 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17784 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17785 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17786 aarch64_builtin_vectorization_cost
17788 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17789 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17791 #undef TARGET_VECTORIZE_BUILTINS
17792 #define TARGET_VECTORIZE_BUILTINS
17794 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17795 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17796 aarch64_builtin_vectorized_function
17798 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17799 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17800 aarch64_autovectorize_vector_sizes
17802 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17803 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17804 aarch64_atomic_assign_expand_fenv
17806 /* Section anchor support. */
17808 #undef TARGET_MIN_ANCHOR_OFFSET
17809 #define TARGET_MIN_ANCHOR_OFFSET -256
17811 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17812 byte offset; we can do much more for larger data types, but have no way
17813 to determine the size of the access. We assume accesses are aligned. */
17814 #undef TARGET_MAX_ANCHOR_OFFSET
17815 #define TARGET_MAX_ANCHOR_OFFSET 4095
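/* Illustrative example (not part of the original source): with these
   limits, -fsection-anchors can address several small statics from one
   anchor, e.g. for the hypothetical

     static int a, b, c;
     int sum (void) { return a + b + c; }

   the anchor address can be formed once (adrp/add) and a, b and c then
   loaded with reg+offset addressing, provided each lies within
   [-256, 4095] bytes of the anchor.  */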
17817 #undef TARGET_VECTOR_ALIGNMENT
17818 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17820 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17821 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17822 aarch64_vectorize_preferred_vector_alignment
17823 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17824 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17825 aarch64_simd_vector_alignment_reachable
17827 /* vec_perm support. */
17829 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17830 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17831 aarch64_vectorize_vec_perm_const
17833 #undef TARGET_VECTORIZE_GET_MASK_MODE
17834 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17835 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17836 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17837 aarch64_empty_mask_is_expensive
17839 #undef TARGET_INIT_LIBFUNCS
17840 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17842 #undef TARGET_FIXED_CONDITION_CODE_REGS
17843 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17845 #undef TARGET_FLAGS_REGNUM
17846 #define TARGET_FLAGS_REGNUM CC_REGNUM
17848 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17849 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17851 #undef TARGET_ASAN_SHADOW_OFFSET
17852 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17854 #undef TARGET_LEGITIMIZE_ADDRESS
17855 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17857 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17858 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17860 #undef TARGET_CAN_USE_DOLOOP_P
17861 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17863 #undef TARGET_SCHED_ADJUST_PRIORITY
17864 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17866 #undef TARGET_SCHED_MACRO_FUSION_P
17867 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17869 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17870 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17872 #undef TARGET_SCHED_FUSION_PRIORITY
17873 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17875 #undef TARGET_UNSPEC_MAY_TRAP_P
17876 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17878 #undef TARGET_USE_PSEUDO_PIC_REG
17879 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17881 #undef TARGET_PRINT_OPERAND
17882 #define TARGET_PRINT_OPERAND aarch64_print_operand
17884 #undef TARGET_PRINT_OPERAND_ADDRESS
17885 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17887 #undef TARGET_OPTAB_SUPPORTED_P
17888 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17890 #undef TARGET_OMIT_STRUCT_RETURN_REG
17891 #define TARGET_OMIT_STRUCT_RETURN_REG true
17893 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17894 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17895 aarch64_dwarf_poly_indeterminate_value
17897 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
17898 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17899 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17901 #undef TARGET_HARD_REGNO_NREGS
17902 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17903 #undef TARGET_HARD_REGNO_MODE_OK
17904 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17906 #undef TARGET_MODES_TIEABLE_P
17907 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17909 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17910 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17911 aarch64_hard_regno_call_part_clobbered
17913 #undef TARGET_CONSTANT_ALIGNMENT
17914 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17916 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17917 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17919 #undef TARGET_CAN_CHANGE_MODE_CLASS
17920 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17922 #undef TARGET_SELECT_EARLY_REMAT_MODES
17923 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17925 #if CHECKING_P
17926 #undef TARGET_RUN_TARGET_SELFTESTS
17927 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17928 #endif /* #if CHECKING_P */
17930 struct gcc_target targetm = TARGET_INITIALIZER;
17932 #include "gt-aarch64.h"