[AArch64] Prefer LD1RQ for big-endian SVE
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 6296ffe959f62ac5515a3d32e617a909f829f090
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
227 const char* name;
228 unsigned int flag;
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
251 /* Tuning parameters. */
253 static const struct cpu_addrcost_table generic_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
317 static const struct cpu_regmove_cost generic_regmove_cost =
319 1, /* GP2GP */
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
322 5, /* GP2FP */
323 5, /* FP2GP */
324 2 /* FP2FP */
327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
332 5, /* GP2FP */
333 5, /* FP2GP */
334 2 /* FP2FP */
337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (actual costs are 4 and 9). */
352 9, /* GP2FP */
353 9, /* FP2GP */
354 1 /* FP2FP */
357 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 2, /* GP2GP */
360 2, /* GP2FP */
361 6, /* FP2GP */
362 4 /* FP2FP */
365 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 8, /* GP2FP */
371 8, /* FP2GP */
372 2 /* FP2FP */
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 2, /* GP2GP */
378 /* Avoid the use of int<->fp moves for spilling. */
379 6, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of int<->fp moves for spilling. */
388 8, /* GP2FP */
389 8, /* FP2GP */
390 4 /* FP2FP */
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost =
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
433 /* Generic costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
453 static const struct cpu_vector_cost exynosm1_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* Generic costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost =
515 1, /* Predictable. */
516 3 /* Unpredictable. */
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes =
522 AARCH64_APPROX_NONE, /* division */
523 AARCH64_APPROX_NONE, /* sqrt */
524 AARCH64_APPROX_NONE /* recip_sqrt */
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes =
530 AARCH64_APPROX_NONE, /* division */
531 AARCH64_APPROX_ALL, /* sqrt */
532 AARCH64_APPROX_ALL /* recip_sqrt */
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes =
538 AARCH64_APPROX_NONE, /* division */
539 AARCH64_APPROX_NONE, /* sqrt */
540 AARCH64_APPROX_ALL /* recip_sqrt */
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune =
546 0, /* num_slots */
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
564 4, /* num_slots */
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 1024, /* l2_cache_size */
568 -1 /* default_opt_level */
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
573 8, /* num_slots */
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
580 static const cpu_prefetch_tune thunderx_prefetch_tune =
582 8, /* num_slots */
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
591 8, /* num_slots */
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
598 static const struct tune_params generic_tunings =
600 &cortexa57_extra_costs,
601 &generic_addrcost_table,
602 &generic_regmove_cost,
603 &generic_vector_cost,
604 &generic_branch_cost,
605 &generic_approx_modes,
606 4, /* memmov_cost */
607 2, /* issue_rate */
608 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
609 8, /* function_align. */
610 4, /* jump_align. */
611 8, /* loop_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
620 &generic_prefetch_tune
623 static const struct tune_params cortexa35_tunings =
625 &cortexa53_extra_costs,
626 &generic_addrcost_table,
627 &cortexa53_regmove_cost,
628 &generic_vector_cost,
629 &generic_branch_cost,
630 &generic_approx_modes,
631 4, /* memmov_cost */
632 1, /* issue_rate */
633 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
635 16, /* function_align. */
636 4, /* jump_align. */
637 8, /* loop_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
646 &generic_prefetch_tune
649 static const struct tune_params cortexa53_tunings =
651 &cortexa53_extra_costs,
652 &generic_addrcost_table,
653 &cortexa53_regmove_cost,
654 &generic_vector_cost,
655 &generic_branch_cost,
656 &generic_approx_modes,
657 4, /* memmov_cost */
658 2, /* issue_rate */
659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
661 16, /* function_align. */
662 4, /* jump_align. */
663 8, /* loop_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
672 &generic_prefetch_tune
675 static const struct tune_params cortexa57_tunings =
677 &cortexa57_extra_costs,
678 &generic_addrcost_table,
679 &cortexa57_regmove_cost,
680 &cortexa57_vector_cost,
681 &generic_branch_cost,
682 &generic_approx_modes,
683 4, /* memmov_cost */
684 3, /* issue_rate */
685 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
687 16, /* function_align. */
688 4, /* jump_align. */
689 8, /* loop_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
698 &generic_prefetch_tune
701 static const struct tune_params cortexa72_tunings =
703 &cortexa57_extra_costs,
704 &generic_addrcost_table,
705 &cortexa57_regmove_cost,
706 &cortexa57_vector_cost,
707 &generic_branch_cost,
708 &generic_approx_modes,
709 4, /* memmov_cost */
710 3, /* issue_rate */
711 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
713 16, /* function_align. */
714 4, /* jump_align. */
715 8, /* loop_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
724 &generic_prefetch_tune
727 static const struct tune_params cortexa73_tunings =
729 &cortexa57_extra_costs,
730 &generic_addrcost_table,
731 &cortexa57_regmove_cost,
732 &cortexa57_vector_cost,
733 &generic_branch_cost,
734 &generic_approx_modes,
735 4, /* memmov_cost. */
736 2, /* issue_rate. */
737 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
739 16, /* function_align. */
740 4, /* jump_align. */
741 8, /* loop_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
750 &generic_prefetch_tune
755 static const struct tune_params exynosm1_tunings =
757 &exynosm1_extra_costs,
758 &exynosm1_addrcost_table,
759 &exynosm1_regmove_cost,
760 &exynosm1_vector_cost,
761 &generic_branch_cost,
762 &exynosm1_approx_modes,
763 4, /* memmov_cost */
764 3, /* issue_rate */
765 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
766 4, /* function_align. */
767 4, /* jump_align. */
768 4, /* loop_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
777 &exynosm1_prefetch_tune
780 static const struct tune_params thunderxt88_tunings =
782 &thunderx_extra_costs,
783 &generic_addrcost_table,
784 &thunderx_regmove_cost,
785 &thunderx_vector_cost,
786 &generic_branch_cost,
787 &generic_approx_modes,
788 6, /* memmov_cost */
789 2, /* issue_rate */
790 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
791 8, /* function_align. */
792 8, /* jump_align. */
793 8, /* loop_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
802 &thunderxt88_prefetch_tune
805 static const struct tune_params thunderx_tunings =
807 &thunderx_extra_costs,
808 &generic_addrcost_table,
809 &thunderx_regmove_cost,
810 &thunderx_vector_cost,
811 &generic_branch_cost,
812 &generic_approx_modes,
813 6, /* memmov_cost */
814 2, /* issue_rate */
815 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
816 8, /* function_align. */
817 8, /* jump_align. */
818 8, /* loop_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
828 &thunderx_prefetch_tune
831 static const struct tune_params xgene1_tunings =
833 &xgene1_extra_costs,
834 &xgene1_addrcost_table,
835 &xgene1_regmove_cost,
836 &xgene1_vector_cost,
837 &generic_branch_cost,
838 &xgene1_approx_modes,
839 6, /* memmov_cost */
840 4, /* issue_rate */
841 AARCH64_FUSE_NOTHING, /* fusible_ops */
842 16, /* function_align. */
843 8, /* jump_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
853 &generic_prefetch_tune
856 static const struct tune_params qdf24xx_tunings =
858 &qdf24xx_extra_costs,
859 &generic_addrcost_table,
860 &qdf24xx_regmove_cost,
861 &generic_vector_cost,
862 &generic_branch_cost,
863 &generic_approx_modes,
864 4, /* memmov_cost */
865 4, /* issue_rate */
866 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
867 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
868 16, /* function_align. */
869 8, /* jump_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
879 &qdf24xx_prefetch_tune
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
883 for now. */
884 static const struct tune_params saphira_tunings =
886 &generic_extra_costs,
887 &generic_addrcost_table,
888 &generic_regmove_cost,
889 &generic_vector_cost,
890 &generic_branch_cost,
891 &generic_approx_modes,
892 4, /* memmov_cost */
893 4, /* issue_rate */
894 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
895 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
896 16, /* function_align. */
897 8, /* jump_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
907 &generic_prefetch_tune
910 static const struct tune_params thunderx2t99_tunings =
912 &thunderx2t99_extra_costs,
913 &thunderx2t99_addrcost_table,
914 &thunderx2t99_regmove_cost,
915 &thunderx2t99_vector_cost,
916 &generic_branch_cost,
917 &generic_approx_modes,
918 4, /* memmov_cost. */
919 4, /* issue_rate. */
920 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
922 16, /* function_align. */
923 8, /* jump_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
939 const char* name;
940 void (*parse_override)(const char*, struct tune_params*);
943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions[] =
949 { "fuse", aarch64_parse_fuse_string },
950 { "tune", aarch64_parse_tune_string },
951 { NULL, NULL }
954 /* A processor implementing AArch64. */
955 struct processor
957 const char *const name;
958 enum aarch64_processor ident;
959 enum aarch64_processor sched_core;
960 enum aarch64_arch arch;
961 unsigned architecture_version;
962 const unsigned long flags;
963 const struct tune_params *const tune;
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures[] =
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores[] =
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
984 AARCH64_FL_FOR_ARCH8, &generic_tunings},
985 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor *selected_arch;
992 static const struct processor *selected_cpu;
993 static const struct processor *selected_tune;
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params = generic_tunings;
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1003 const char *const name;
1004 const unsigned long flags_on;
1005 const unsigned long flags_off;
1008 typedef enum aarch64_cond_code
1010 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1014 aarch64_cc;
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes[] =
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028 const char * branch_format)
1030 rtx_code_label * tmp_label = gen_label_rtx ();
1031 char label_buf[256];
1032 char buffer[128];
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034 CODE_LABEL_NUMBER (tmp_label));
1035 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036 rtx dest_label = operands[pos_label];
1037 operands[pos_label] = tmp_label;
1039 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040 output_asm_insn (buffer, operands);
1042 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043 operands[pos_label] = dest_label;
1044 output_asm_insn (buffer, operands);
1045 return "";
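/* A rough illustration (register and label names invented for this sketch):
   for a conditional branch whose target is out of range, the caller is
   expected to pass the inverted condition as BRANCH_FORMAT, so the code
   printed above looks something like

       cbz     x0, .Lcb1       // inverted test skips the far branch
       b       .L2             // unconditional branch, +/-128 MiB range
   .Lcb1:

   which gives the conditional branch the reach of an unconditional B.  */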
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1051 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054 else
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespective of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075 reg_class_t best_class)
1077 machine_mode mode;
1079 if (allocno_class != ALL_REGS)
1080 return allocno_class;
1082 if (best_class != ALL_REGS)
1083 return best_class;
1085 mode = PSEUDO_REGNO_MODE (regno);
1086 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1092 if (GET_MODE_UNIT_SIZE (mode) == 4)
1093 return aarch64_tune_params.min_div_recip_mul_sf;
1094 return aarch64_tune_params.min_div_recip_mul_df;
1097 static int
1098 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1099 machine_mode mode)
1101 if (VECTOR_MODE_P (mode))
1102 return aarch64_tune_params.vec_reassoc_width;
1103 if (INTEGRAL_MODE_P (mode))
1104 return aarch64_tune_params.int_reassoc_width;
1105 if (FLOAT_MODE_P (mode))
1106 return aarch64_tune_params.fp_reassoc_width;
1107 return 1;
1110 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1111 unsigned
1112 aarch64_dbx_register_number (unsigned regno)
1114 if (GP_REGNUM_P (regno))
1115 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1116 else if (regno == SP_REGNUM)
1117 return AARCH64_DWARF_SP;
1118 else if (FP_REGNUM_P (regno))
1119 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1120 else if (PR_REGNUM_P (regno))
1121 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1122 else if (regno == VG_REGNUM)
1123 return AARCH64_DWARF_VG;
1125 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1126 equivalent DWARF register. */
1127 return DWARF_FRAME_REGISTERS;
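/* Illustrative mapping, assuming the AARCH64_DWARF_* values defined in
   aarch64.h (R0 == 0, SP == 31, VG == 46, P0 == 48, V0 == 64): x0-x30 map
   to DWARF registers 0-30, sp to 31, v0-v31 to 64-95, the SVE predicate
   registers p0-p15 to 48-63 and the vector-granule register VG to 46.  */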
1130 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1131 static bool
1132 aarch64_advsimd_struct_mode_p (machine_mode mode)
1134 return (TARGET_SIMD
1135 && (mode == OImode || mode == CImode || mode == XImode));
1138 /* Return true if MODE is an SVE predicate mode. */
1139 static bool
1140 aarch64_sve_pred_mode_p (machine_mode mode)
1142 return (TARGET_SVE
1143 && (mode == VNx16BImode
1144 || mode == VNx8BImode
1145 || mode == VNx4BImode
1146 || mode == VNx2BImode));
1149 /* Three mutually-exclusive flags describing a vector or predicate type. */
1150 const unsigned int VEC_ADVSIMD = 1;
1151 const unsigned int VEC_SVE_DATA = 2;
1152 const unsigned int VEC_SVE_PRED = 4;
1153 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1154 a structure of 2, 3 or 4 vectors. */
1155 const unsigned int VEC_STRUCT = 8;
1156 /* Useful combinations of the above. */
1157 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1158 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1160 /* Return a set of flags describing the vector properties of mode MODE.
1161 Ignore modes that are not supported by the current target. */
1162 static unsigned int
1163 aarch64_classify_vector_mode (machine_mode mode)
1165 if (aarch64_advsimd_struct_mode_p (mode))
1166 return VEC_ADVSIMD | VEC_STRUCT;
1168 if (aarch64_sve_pred_mode_p (mode))
1169 return VEC_SVE_PRED;
1171 scalar_mode inner = GET_MODE_INNER (mode);
1172 if (VECTOR_MODE_P (mode)
1173 && (inner == QImode
1174 || inner == HImode
1175 || inner == HFmode
1176 || inner == SImode
1177 || inner == SFmode
1178 || inner == DImode
1179 || inner == DFmode))
1181 if (TARGET_SVE)
1183 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1184 return VEC_SVE_DATA;
1185 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1186 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1187 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1188 return VEC_SVE_DATA | VEC_STRUCT;
1191 /* This includes V1DF but not V1DI (which doesn't exist). */
1192 if (TARGET_SIMD
1193 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1194 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1195 return VEC_ADVSIMD;
1198 return 0;
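/* A sketch of the classification above, assuming the relevant target
   features are enabled (illustrative, not exhaustive):

     V4SImode   -> VEC_ADVSIMD                (128-bit Advanced SIMD vector)
     OImode     -> VEC_ADVSIMD | VEC_STRUCT   (pair of Advanced SIMD vectors)
     VNx4SImode -> VEC_SVE_DATA               (single SVE data vector)
     VNx8SImode -> VEC_SVE_DATA | VEC_STRUCT  (tuple of two SVE vectors)
     VNx4BImode -> VEC_SVE_PRED               (SVE predicate)
     SImode     -> 0                          (not a vector mode)  */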
1201 /* Return true if MODE is any of the data vector modes, including
1202 structure modes. */
1203 static bool
1204 aarch64_vector_data_mode_p (machine_mode mode)
1206 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1209 /* Return true if MODE is an SVE data vector mode; either a single vector
1210 or a structure of vectors. */
1211 static bool
1212 aarch64_sve_data_mode_p (machine_mode mode)
1214 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1217 /* Implement target hook TARGET_ARRAY_MODE. */
1218 static opt_machine_mode
1219 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1221 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1222 && IN_RANGE (nelems, 2, 4))
1223 return mode_for_vector (GET_MODE_INNER (mode),
1224 GET_MODE_NUNITS (mode) * nelems);
1226 return opt_machine_mode ();
1229 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1230 static bool
1231 aarch64_array_mode_supported_p (machine_mode mode,
1232 unsigned HOST_WIDE_INT nelems)
1234 if (TARGET_SIMD
1235 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1236 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1237 && (nelems >= 2 && nelems <= 4))
1238 return true;
1240 return false;
1243 /* Return the SVE predicate mode to use for elements that have
1244 ELEM_NBYTES bytes, if such a mode exists. */
1246 opt_machine_mode
1247 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1249 if (TARGET_SVE)
1251 if (elem_nbytes == 1)
1252 return VNx16BImode;
1253 if (elem_nbytes == 2)
1254 return VNx8BImode;
1255 if (elem_nbytes == 4)
1256 return VNx4BImode;
1257 if (elem_nbytes == 8)
1258 return VNx2BImode;
1260 return opt_machine_mode ();
1263 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1265 static opt_machine_mode
1266 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1268 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1270 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1271 machine_mode pred_mode;
1272 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1273 return pred_mode;
1276 return default_get_mask_mode (nunits, nbytes);
1279 /* Implement TARGET_HARD_REGNO_NREGS. */
1281 static unsigned int
1282 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1284 /* ??? Logically we should only need to provide a value when
1285 HARD_REGNO_MODE_OK says that the combination is valid,
1286 but at the moment we need to handle all modes. Just ignore
1287 any runtime parts for registers that can't store them. */
1288 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1289 switch (aarch64_regno_regclass (regno))
1291 case FP_REGS:
1292 case FP_LO_REGS:
1293 if (aarch64_sve_data_mode_p (mode))
1294 return exact_div (GET_MODE_SIZE (mode),
1295 BYTES_PER_SVE_VECTOR).to_constant ();
1296 return CEIL (lowest_size, UNITS_PER_VREG);
1297 case PR_REGS:
1298 case PR_LO_REGS:
1299 case PR_HI_REGS:
1300 return 1;
1301 default:
1302 return CEIL (lowest_size, UNITS_PER_WORD);
1304 gcc_unreachable ();
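/* For example: TImode needs CEIL (16, 8) == 2 general registers, V4SImode
   fits in a single vector register, a single SVE data vector such as
   VNx4SImode counts as one register regardless of the runtime vector
   length, and a two-vector SVE tuple such as VNx8SImode counts as two.  */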
1307 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1309 static bool
1310 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1312 if (GET_MODE_CLASS (mode) == MODE_CC)
1313 return regno == CC_REGNUM;
1315 if (regno == VG_REGNUM)
1316 /* This must have the same size as _Unwind_Word. */
1317 return mode == DImode;
1319 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1320 if (vec_flags & VEC_SVE_PRED)
1321 return PR_REGNUM_P (regno);
1323 if (PR_REGNUM_P (regno))
1324 return 0;
1326 if (regno == SP_REGNUM)
1327 /* The purpose of comparing with ptr_mode is to support the
1328 global register variable associated with the stack pointer
1329 register via the syntax of asm ("wsp") in ILP32. */
1330 return mode == Pmode || mode == ptr_mode;
1332 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1333 return mode == Pmode;
1335 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1336 return true;
1338 if (FP_REGNUM_P (regno))
1340 if (vec_flags & VEC_STRUCT)
1341 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1342 else
1343 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1346 return false;
1349 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1350 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1351 clobbers the top 64 bits when restoring the bottom 64 bits. */
1353 static bool
1354 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1356 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1359 /* Implement REGMODE_NATURAL_SIZE. */
1360 poly_uint64
1361 aarch64_regmode_natural_size (machine_mode mode)
1363 /* The natural size for SVE data modes is one SVE data vector,
1364 and similarly for predicates. We can't independently modify
1365 anything smaller than that. */
1366 /* ??? For now, only do this for variable-width SVE registers.
1367 Doing it for constant-sized registers breaks lower-subreg.c. */
1368 /* ??? And once that's fixed, we should probably have similar
1369 code for Advanced SIMD. */
1370 if (!aarch64_sve_vg.is_constant ())
1372 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1373 if (vec_flags & VEC_SVE_PRED)
1374 return BYTES_PER_SVE_PRED;
1375 if (vec_flags & VEC_SVE_DATA)
1376 return BYTES_PER_SVE_VECTOR;
1378 return UNITS_PER_WORD;
1381 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1382 machine_mode
1383 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1384 machine_mode mode)
1386 /* The predicate mode determines which bits are significant and
1387 which are "don't care". Decreasing the number of lanes would
1388 lose data while increasing the number of lanes would make bits
1389 unnecessarily significant. */
1390 if (PR_REGNUM_P (regno))
1391 return mode;
1392 if (known_ge (GET_MODE_SIZE (mode), 4))
1393 return mode;
1394 else
1395 return SImode;
1398 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1399 that strcpy from constants will be faster. */
1401 static HOST_WIDE_INT
1402 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1404 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1405 return MAX (align, BITS_PER_WORD);
1406 return align;
1409 /* Return true if calls to DECL should be treated as
1410 long-calls (i.e. called via a register). */
1411 static bool
1412 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1414 return false;
1417 /* Return true if calls to symbol-ref SYM should be treated as
1418 long-calls (i.e. called via a register). */
1419 bool
1420 aarch64_is_long_call_p (rtx sym)
1422 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1425 /* Return true if calls to symbol-ref SYM should not go through
1426 plt stubs. */
1428 bool
1429 aarch64_is_noplt_call_p (rtx sym)
1431 const_tree decl = SYMBOL_REF_DECL (sym);
1433 if (flag_pic
1434 && decl
1435 && (!flag_plt
1436 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1437 && !targetm.binds_local_p (decl))
1438 return true;
1440 return false;
1443 /* Return true if the offsets to a zero/sign-extract operation
1444 represent an expression that matches an extend operation. The
1445 operands represent the parameters from
1447 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1448 bool
1449 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1450 rtx extract_imm)
1452 HOST_WIDE_INT mult_val, extract_val;
1454 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1455 return false;
1457 mult_val = INTVAL (mult_imm);
1458 extract_val = INTVAL (extract_imm);
1460 if (extract_val > 8
1461 && extract_val < GET_MODE_BITSIZE (mode)
1462 && exact_log2 (extract_val & ~7) > 0
1463 && (extract_val & 7) <= 4
1464 && mult_val == (1 << (extract_val & 7)))
1465 return true;
1467 return false;
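/* Worked example (illustrative): in DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34 pass all of the checks above: 34 > 8, 34 < 64,
   exact_log2 (34 & ~7) == exact_log2 (32) == 5, (34 & 7) == 2 <= 4 and
   4 == 1 << 2.  Roughly speaking, this matches a 32-bit value that has
   been extended and then shifted left by 2, i.e. an extended-register
   operand with LSL #2.  */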
1470 /* Emit an insn that's a simple single-set. Both the operands must be
1471 known to be valid. */
1472 inline static rtx_insn *
1473 emit_set_insn (rtx x, rtx y)
1475 return emit_insn (gen_rtx_SET (x, y));
1478 /* X and Y are two things to compare using CODE. Emit the compare insn and
1479 return the rtx for register 0 in the proper mode. */
1480 rtx
1481 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1483 machine_mode mode = SELECT_CC_MODE (code, x, y);
1484 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1486 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1487 return cc_reg;
1490 /* Build the SYMBOL_REF for __tls_get_addr. */
1492 static GTY(()) rtx tls_get_addr_libfunc;
1494 rtx
1495 aarch64_tls_get_addr (void)
1497 if (!tls_get_addr_libfunc)
1498 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1499 return tls_get_addr_libfunc;
1502 /* Return the TLS model to use for ADDR. */
1504 static enum tls_model
1505 tls_symbolic_operand_type (rtx addr)
1507 enum tls_model tls_kind = TLS_MODEL_NONE;
1508 if (GET_CODE (addr) == CONST)
1510 poly_int64 addend;
1511 rtx sym = strip_offset (addr, &addend);
1512 if (GET_CODE (sym) == SYMBOL_REF)
1513 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1515 else if (GET_CODE (addr) == SYMBOL_REF)
1516 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1518 return tls_kind;
1521 /* We allow LO_SUMs among our legitimate addresses so that combine can
1522 take care of combining addresses where necessary, but for code
1523 generation purposes we generate the address as:
1525 RTL Absolute
1526 tmp = hi (symbol_ref); adrp x1, foo
1527 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1530 PIC TLS
1531 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1532 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1533 bl __tls_get_addr
1536 Load TLS symbol, depending on TLS mechanism and TLS access model.
1538 Global Dynamic - Traditional TLS:
1539 adrp tmp, :tlsgd:imm
1540 add dest, tmp, #:tlsgd_lo12:imm
1541 bl __tls_get_addr
1543 Global Dynamic - TLS Descriptors:
1544 adrp dest, :tlsdesc:imm
1545 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1546 add dest, dest, #:tlsdesc_lo12:imm
1547 blr tmp
1548 mrs tp, tpidr_el0
1549 add dest, dest, tp
1551 Initial Exec:
1552 mrs tp, tpidr_el0
1553 adrp tmp, :gottprel:imm
1554 ldr dest, [tmp, #:gottprel_lo12:imm]
1555 add dest, dest, tp
1557 Local Exec:
1558 mrs tp, tpidr_el0
1559 add t0, tp, #:tprel_hi12:imm, lsl #12
1560 add t0, t0, #:tprel_lo12_nc:imm
1563 static void
1564 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1565 enum aarch64_symbol_type type)
1567 switch (type)
1569 case SYMBOL_SMALL_ABSOLUTE:
1571 /* In ILP32, the mode of dest can be either SImode or DImode. */
1572 rtx tmp_reg = dest;
1573 machine_mode mode = GET_MODE (dest);
1575 gcc_assert (mode == Pmode || mode == ptr_mode);
1577 if (can_create_pseudo_p ())
1578 tmp_reg = gen_reg_rtx (mode);
1580 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1581 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1582 return;
1585 case SYMBOL_TINY_ABSOLUTE:
1586 emit_insn (gen_rtx_SET (dest, imm));
1587 return;
1589 case SYMBOL_SMALL_GOT_28K:
1591 machine_mode mode = GET_MODE (dest);
1592 rtx gp_rtx = pic_offset_table_rtx;
1593 rtx insn;
1594 rtx mem;
1596 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1597 here before RTL expansion. Tree IVOPTs generates RTL patterns to
1598 estimate rtx costs, in which case pic_offset_table_rtx is not
1599 initialized. In that case there is no need to generate the first
1600 adrp instruction, as the final cost for global variable access is
1601 one instruction. */
1602 if (gp_rtx != NULL)
1604 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1605 use the page base as the GOT base, the first page may be wasted; in
1606 the worst case there is only 28K of space for the GOT).
1608 The generated instruction sequence for accessing a global variable is:
1611 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1613 Only one instruction is needed. But we must initialize
1614 pic_offset_table_rtx properly. We generate an initialization insn for
1615 every global access, and let CSE remove all the redundant copies.
1617 The final instruction sequence will look like the following
1618 for multiple global variable accesses.
1620 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1622 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1623 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1624 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1625 ... */
1627 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1628 crtl->uses_pic_offset_table = 1;
1629 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1631 if (mode != GET_MODE (gp_rtx))
1632 gp_rtx = gen_lowpart (mode, gp_rtx);
1636 if (mode == ptr_mode)
1638 if (mode == DImode)
1639 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1640 else
1641 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1643 mem = XVECEXP (SET_SRC (insn), 0, 0);
1645 else
1647 gcc_assert (mode == Pmode);
1649 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1650 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1653 /* The operand is expected to be MEM. Whenever the related insn
1654 pattern changed, above code which calculate mem should be
1655 updated. */
1656 gcc_assert (GET_CODE (mem) == MEM);
1657 MEM_READONLY_P (mem) = 1;
1658 MEM_NOTRAP_P (mem) = 1;
1659 emit_insn (insn);
1660 return;
1663 case SYMBOL_SMALL_GOT_4G:
1665 /* In ILP32, the mode of dest can be either SImode or DImode,
1666 while the got entry is always of SImode size. The mode of
1667 dest depends on how dest is used: if dest is assigned to a
1668 pointer (e.g. in the memory), it has SImode; it may have
1669 DImode if dest is dereferenced to access the memory.
1670 This is why we have to handle three different ldr_got_small
1671 patterns here (two patterns for ILP32). */
1673 rtx insn;
1674 rtx mem;
1675 rtx tmp_reg = dest;
1676 machine_mode mode = GET_MODE (dest);
1678 if (can_create_pseudo_p ())
1679 tmp_reg = gen_reg_rtx (mode);
1681 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1682 if (mode == ptr_mode)
1684 if (mode == DImode)
1685 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1686 else
1687 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1689 mem = XVECEXP (SET_SRC (insn), 0, 0);
1691 else
1693 gcc_assert (mode == Pmode);
1695 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1696 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1699 gcc_assert (GET_CODE (mem) == MEM);
1700 MEM_READONLY_P (mem) = 1;
1701 MEM_NOTRAP_P (mem) = 1;
1702 emit_insn (insn);
1703 return;
1706 case SYMBOL_SMALL_TLSGD:
1708 rtx_insn *insns;
1709 machine_mode mode = GET_MODE (dest);
1710 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1712 start_sequence ();
1713 if (TARGET_ILP32)
1714 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1715 else
1716 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1717 insns = get_insns ();
1718 end_sequence ();
1720 RTL_CONST_CALL_P (insns) = 1;
1721 emit_libcall_block (insns, dest, result, imm);
1722 return;
1725 case SYMBOL_SMALL_TLSDESC:
1727 machine_mode mode = GET_MODE (dest);
1728 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1729 rtx tp;
1731 gcc_assert (mode == Pmode || mode == ptr_mode);
1733 /* In ILP32, the got entry is always of SImode size. Unlike
1734 small GOT, the dest is fixed at reg 0. */
1735 if (TARGET_ILP32)
1736 emit_insn (gen_tlsdesc_small_si (imm));
1737 else
1738 emit_insn (gen_tlsdesc_small_di (imm));
1739 tp = aarch64_load_tp (NULL);
1741 if (mode != Pmode)
1742 tp = gen_lowpart (mode, tp);
1744 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1745 if (REG_P (dest))
1746 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1747 return;
1750 case SYMBOL_SMALL_TLSIE:
1752 /* In ILP32, the mode of dest can be either SImode or DImode,
1753 while the got entry is always of SImode size. The mode of
1754 dest depends on how dest is used: if dest is assigned to a
1755 pointer (e.g. in the memory), it has SImode; it may have
1756 DImode if dest is dereferenced to access the memory.
1757 This is why we have to handle three different tlsie_small
1758 patterns here (two patterns for ILP32). */
1759 machine_mode mode = GET_MODE (dest);
1760 rtx tmp_reg = gen_reg_rtx (mode);
1761 rtx tp = aarch64_load_tp (NULL);
1763 if (mode == ptr_mode)
1765 if (mode == DImode)
1766 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1767 else
1769 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1770 tp = gen_lowpart (mode, tp);
1773 else
1775 gcc_assert (mode == Pmode);
1776 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1779 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1780 if (REG_P (dest))
1781 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1782 return;
1785 case SYMBOL_TLSLE12:
1786 case SYMBOL_TLSLE24:
1787 case SYMBOL_TLSLE32:
1788 case SYMBOL_TLSLE48:
1790 machine_mode mode = GET_MODE (dest);
1791 rtx tp = aarch64_load_tp (NULL);
1793 if (mode != Pmode)
1794 tp = gen_lowpart (mode, tp);
1796 switch (type)
1798 case SYMBOL_TLSLE12:
1799 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1800 (dest, tp, imm));
1801 break;
1802 case SYMBOL_TLSLE24:
1803 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1804 (dest, tp, imm));
1805 break;
1806 case SYMBOL_TLSLE32:
1807 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1808 (dest, imm));
1809 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1810 (dest, dest, tp));
1811 break;
1812 case SYMBOL_TLSLE48:
1813 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1814 (dest, imm));
1815 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1816 (dest, dest, tp));
1817 break;
1818 default:
1819 gcc_unreachable ();
1822 if (REG_P (dest))
1823 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1824 return;
1827 case SYMBOL_TINY_GOT:
1828 emit_insn (gen_ldr_got_tiny (dest, imm));
1829 return;
1831 case SYMBOL_TINY_TLSIE:
1833 machine_mode mode = GET_MODE (dest);
1834 rtx tp = aarch64_load_tp (NULL);
1836 if (mode == ptr_mode)
1838 if (mode == DImode)
1839 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1840 else
1842 tp = gen_lowpart (mode, tp);
1843 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1846 else
1848 gcc_assert (mode == Pmode);
1849 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1852 if (REG_P (dest))
1853 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1854 return;
1857 default:
1858 gcc_unreachable ();
1862 /* Emit a move from SRC to DEST. Assume that the move expanders can
1863 handle all moves if !can_create_pseudo_p (). The distinction is
1864 important because, unlike emit_move_insn, the move expanders know
1865 how to force Pmode objects into the constant pool even when the
1866 constant pool address is not itself legitimate. */
1867 static rtx
1868 aarch64_emit_move (rtx dest, rtx src)
1870 return (can_create_pseudo_p ()
1871 ? emit_move_insn (dest, src)
1872 : emit_move_insn_1 (dest, src));
1875 /* Split a 128-bit move operation into two 64-bit move operations,
1876 taking care to handle partial overlap of register to register
1877 copies. Special cases are needed when moving between GP regs and
1878 FP regs. SRC can be a register, constant or memory; DST a register
1879 or memory. If either operand is memory it must not have any side
1880 effects. */
1881 void
1882 aarch64_split_128bit_move (rtx dst, rtx src)
1884 rtx dst_lo, dst_hi;
1885 rtx src_lo, src_hi;
1887 machine_mode mode = GET_MODE (dst);
1889 gcc_assert (mode == TImode || mode == TFmode);
1890 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1891 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1893 if (REG_P (dst) && REG_P (src))
1895 int src_regno = REGNO (src);
1896 int dst_regno = REGNO (dst);
1898 /* Handle FP <-> GP regs. */
1899 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1901 src_lo = gen_lowpart (word_mode, src);
1902 src_hi = gen_highpart (word_mode, src);
1904 if (mode == TImode)
1906 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1907 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1909 else
1911 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1912 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1914 return;
1916 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1918 dst_lo = gen_lowpart (word_mode, dst);
1919 dst_hi = gen_highpart (word_mode, dst);
1921 if (mode == TImode)
1923 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1924 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1926 else
1928 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1929 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1931 return;
1935 dst_lo = gen_lowpart (word_mode, dst);
1936 dst_hi = gen_highpart (word_mode, dst);
1937 src_lo = gen_lowpart (word_mode, src);
1938 src_hi = gen_highpart_mode (word_mode, mode, src);
1940 /* At most one pairing may overlap. */
1941 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1943 aarch64_emit_move (dst_hi, src_hi);
1944 aarch64_emit_move (dst_lo, src_lo);
1946 else
1948 aarch64_emit_move (dst_lo, src_lo);
1949 aarch64_emit_move (dst_hi, src_hi);
1953 bool
1954 aarch64_split_128bit_move_p (rtx dst, rtx src)
1956 return (! REG_P (src)
1957 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1960 /* Split a complex SIMD combine. */
1962 void
1963 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1965 machine_mode src_mode = GET_MODE (src1);
1966 machine_mode dst_mode = GET_MODE (dst);
1968 gcc_assert (VECTOR_MODE_P (dst_mode));
1969 gcc_assert (register_operand (dst, dst_mode)
1970 && register_operand (src1, src_mode)
1971 && register_operand (src2, src_mode));
1973 rtx (*gen) (rtx, rtx, rtx);
1975 switch (src_mode)
1977 case E_V8QImode:
1978 gen = gen_aarch64_simd_combinev8qi;
1979 break;
1980 case E_V4HImode:
1981 gen = gen_aarch64_simd_combinev4hi;
1982 break;
1983 case E_V2SImode:
1984 gen = gen_aarch64_simd_combinev2si;
1985 break;
1986 case E_V4HFmode:
1987 gen = gen_aarch64_simd_combinev4hf;
1988 break;
1989 case E_V2SFmode:
1990 gen = gen_aarch64_simd_combinev2sf;
1991 break;
1992 case E_DImode:
1993 gen = gen_aarch64_simd_combinedi;
1994 break;
1995 case E_DFmode:
1996 gen = gen_aarch64_simd_combinedf;
1997 break;
1998 default:
1999 gcc_unreachable ();
2002 emit_insn (gen (dst, src1, src2));
2003 return;
2006 /* Split a complex SIMD move. */
2008 void
2009 aarch64_split_simd_move (rtx dst, rtx src)
2011 machine_mode src_mode = GET_MODE (src);
2012 machine_mode dst_mode = GET_MODE (dst);
2014 gcc_assert (VECTOR_MODE_P (dst_mode));
2016 if (REG_P (dst) && REG_P (src))
2018 rtx (*gen) (rtx, rtx);
2020 gcc_assert (VECTOR_MODE_P (src_mode));
2022 switch (src_mode)
2024 case E_V16QImode:
2025 gen = gen_aarch64_split_simd_movv16qi;
2026 break;
2027 case E_V8HImode:
2028 gen = gen_aarch64_split_simd_movv8hi;
2029 break;
2030 case E_V4SImode:
2031 gen = gen_aarch64_split_simd_movv4si;
2032 break;
2033 case E_V2DImode:
2034 gen = gen_aarch64_split_simd_movv2di;
2035 break;
2036 case E_V8HFmode:
2037 gen = gen_aarch64_split_simd_movv8hf;
2038 break;
2039 case E_V4SFmode:
2040 gen = gen_aarch64_split_simd_movv4sf;
2041 break;
2042 case E_V2DFmode:
2043 gen = gen_aarch64_split_simd_movv2df;
2044 break;
2045 default:
2046 gcc_unreachable ();
2049 emit_insn (gen (dst, src));
2050 return;
2054 bool
2055 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2056 machine_mode ymode, rtx y)
2058 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2059 gcc_assert (r != NULL);
2060 return rtx_equal_p (x, r);
2064 static rtx
2065 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2067 if (can_create_pseudo_p ())
2068 return force_reg (mode, value);
2069 else
2071 gcc_assert (x);
2072 aarch64_emit_move (x, value);
2073 return x;
2077 /* Return true if we can move VALUE into a register using a single
2078 CNT[BHWD] instruction. */
2080 static bool
2081 aarch64_sve_cnt_immediate_p (poly_int64 value)
2083 HOST_WIDE_INT factor = value.coeffs[0];
2084 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2085 return (value.coeffs[1] == factor
2086 && IN_RANGE (factor, 2, 16 * 16)
2087 && (factor & 1) == 0
2088 && factor <= 16 * (factor & -factor));
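/* Illustrative examples of the test above: poly_int64 (2, 2) -- the number
   of .D elements in a vector -- is loadable with a single CNTD, and
   (48, 48) with a single "cntb ..., all, mul #3", whereas an odd value
   such as (3, 3), or anything above 16 * 16, is rejected.  */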
2091 /* Likewise for rtx X. */
2093 bool
2094 aarch64_sve_cnt_immediate_p (rtx x)
2096 poly_int64 value;
2097 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2100 /* Return the asm string for an instruction with a CNT-like vector size
2101 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2102 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2103 first part of the operands template (the part that comes before the
2104 vector size itself). FACTOR is the number of quadwords.
2105 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2106 If it is zero, we can use any element size. */
2108 static char *
2109 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2110 unsigned int factor,
2111 unsigned int nelts_per_vq)
2113 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2115 if (nelts_per_vq == 0)
2116 /* There is some overlap in the ranges of the four CNT instructions.
2117 Here we always use the smallest possible element size, so that the
2118 multiplier is 1 wherever possible. */
2119 nelts_per_vq = factor & -factor;
2120 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2121 gcc_assert (IN_RANGE (shift, 1, 4));
2122 char suffix = "dwhb"[shift - 1];
2124 factor >>= shift;
2125 unsigned int written;
2126 if (factor == 1)
2127 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2128 prefix, suffix, operands);
2129 else
2130 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2131 prefix, suffix, operands, factor);
2132 gcc_assert (written < sizeof (buffer));
2133 return buffer;
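/* Illustrative output of the routine above: a factor of 2 with no fixed
   element size prints as "cntd\t<operands>", while a factor of 48 prints
   as "cntb\t<operands>, all, mul #3" -- the smallest element size is
   chosen so that the multiplier stays as small as possible.  */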
2136 /* Return the asm string for an instruction with a CNT-like vector size
2137 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2138 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2139 first part of the operands template (the part that comes before the
2140 vector size itself). X is the value of the vector size operand,
2141 as a polynomial integer rtx. */
2143 char *
2144 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2145 rtx x)
2147 poly_int64 value = rtx_to_poly_int64 (x);
2148 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2149 return aarch64_output_sve_cnt_immediate (prefix, operands,
2150 value.coeffs[1], 0);
2153 /* Return true if we can add VALUE to a register using a single ADDVL
2154 or ADDPL instruction. */
2156 static bool
2157 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2159 HOST_WIDE_INT factor = value.coeffs[0];
2160 if (factor == 0 || value.coeffs[1] != factor)
2161 return false;
2162 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2163 and a value of 16 is one vector width. */
2164 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2165 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
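/* Illustrative: a factor of 16 corresponds to ADDVL #1 and a factor of 2
   to ADDPL #1, so the ranges above allow ADDVL #-32..#31 (multiples of 16)
   and ADDPL #-32..#31 (multiples of 2).  */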
2168 /* Likewise for rtx X. */
2170 bool
2171 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2173 poly_int64 value;
2174 return (poly_int_rtx_p (x, &value)
2175 && aarch64_sve_addvl_addpl_immediate_p (value));
2178 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2179 and storing the result in operand 0. */
2181 char *
2182 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2184 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2185 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2186 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2188 /* Use INC or DEC if possible. */
2189 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2191 if (aarch64_sve_cnt_immediate_p (offset_value))
2192 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2193 offset_value.coeffs[1], 0);
2194 if (aarch64_sve_cnt_immediate_p (-offset_value))
2195 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2196 -offset_value.coeffs[1], 0);
2199 int factor = offset_value.coeffs[1];
2200 if ((factor & 15) == 0)
2201 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2202 else
2203 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2204 return buffer;
2207 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2208 instruction. If it is, store the number of elements in each vector
2209 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2210 factor in *FACTOR_OUT (if nonnull). */
2212 bool
2213 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2214 unsigned int *nelts_per_vq_out)
2216 rtx elt;
2217 poly_int64 value;
2219 if (!const_vec_duplicate_p (x, &elt)
2220 || !poly_int_rtx_p (elt, &value))
2221 return false;
2223 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2224 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2225 /* There's no vector INCB. */
2226 return false;
2228 HOST_WIDE_INT factor = value.coeffs[0];
2229 if (value.coeffs[1] != factor)
2230 return false;
2232 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2233 if ((factor % nelts_per_vq) != 0
2234 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2235 return false;
2237 if (factor_out)
2238 *factor_out = factor;
2239 if (nelts_per_vq_out)
2240 *nelts_per_vq_out = nelts_per_vq;
2241 return true;
2244 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2245 instruction. */
2247 bool
2248 aarch64_sve_inc_dec_immediate_p (rtx x)
2250 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2253 /* Return the asm template for an SVE vector INC or DEC instruction.
2254 OPERANDS gives the operands before the vector count and X is the
2255 value of the vector count operand itself. */
2257 char *
2258 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2260 int factor;
2261 unsigned int nelts_per_vq;
2262 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2263 gcc_unreachable ();
2264 if (factor < 0)
2265 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2266 nelts_per_vq);
2267 else
2268 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2269 nelts_per_vq);
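/* For example, a VNx4SI duplicate of poly_int64 (4, 4) -- four .S elements
   per 128 bits -- prints as "incw\t<operands>", while the negated value
   prints as "decw\t<operands>".  */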
2272 static int
2273 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2274 scalar_int_mode mode)
2276 int i;
2277 unsigned HOST_WIDE_INT val, val2, mask;
2278 int one_match, zero_match;
2279 int num_insns;
2281 val = INTVAL (imm);
2283 if (aarch64_move_imm (val, mode))
2285 if (generate)
2286 emit_insn (gen_rtx_SET (dest, imm));
2287 return 1;
2290 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2291 (with XXXX non-zero). In that case check to see if the move can be done in
2292 a smaller mode. */
2293 val2 = val & 0xffffffff;
2294 if (mode == DImode
2295 && aarch64_move_imm (val2, SImode)
2296 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2298 if (generate)
2299 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2301 /* Check if we have to emit a second instruction by checking to see
2302 if any of the upper 32 bits of the original DI mode value is set. */
2303 if (val == val2)
2304 return 1;
2306 i = (val >> 48) ? 48 : 32;
2308 if (generate)
2309 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2310 GEN_INT ((val >> i) & 0xffff)));
2312 return 2;
2315 if ((val >> 32) == 0 || mode == SImode)
2317 if (generate)
2319 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2320 if (mode == SImode)
2321 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2322 GEN_INT ((val >> 16) & 0xffff)));
2323 else
2324 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2325 GEN_INT ((val >> 16) & 0xffff)));
2327 return 2;
2330 /* Remaining cases are all for DImode. */
2332 mask = 0xffff;
2333 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2334 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2335 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2336 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2338 if (zero_match != 2 && one_match != 2)
2340 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2341 For a 64-bit bitmask try whether changing 16 bits to all ones or
2342 zeroes creates a valid bitmask. To check any repeated bitmask,
2343 try using 16 bits from the other 32-bit half of val. */
2345 for (i = 0; i < 64; i += 16, mask <<= 16)
2347 val2 = val & ~mask;
2348 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2349 break;
2350 val2 = val | mask;
2351 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2352 break;
2353 val2 = val2 & ~mask;
2354 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2355 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2356 break;
2358 if (i != 64)
2360 if (generate)
2362 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2363 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2364 GEN_INT ((val >> i) & 0xffff)));
2366 return 2;
2370 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2371 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2372 otherwise skip zero bits. */
2374 num_insns = 1;
2375 mask = 0xffff;
2376 val2 = one_match > zero_match ? ~val : val;
2377 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2379 if (generate)
2380 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2381 ? (val | ~(mask << i))
2382 : (val & (mask << i)))));
2383 for (i += 16; i < 64; i += 16)
2385 if ((val2 & (mask << i)) == 0)
2386 continue;
2387 if (generate)
2388 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2389 GEN_INT ((val >> i) & 0xffff)));
2390 num_insns ++;
2393 return num_insns;
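/* A worked example of the code above: 0x123456789abcdef0 has no all-zero
   or all-ones 16-bit chunk and no bitmask variant, so it is built with
   four instructions, roughly:
       mov  x0, #0xdef0
       movk x0, #0x9abc, lsl #16
       movk x0, #0x5678, lsl #32
       movk x0, #0x1234, lsl #48
   (the register x0 is purely for illustration).  */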
2396 /* Return whether imm is a 128-bit immediate which is simple enough to
2397 expand inline. */
2398 bool
2399 aarch64_mov128_immediate (rtx imm)
2401 if (GET_CODE (imm) == CONST_INT)
2402 return true;
2404 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2406 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2407 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2409 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2410 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
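/* For example, a TImode constant whose two 64-bit halves each need a
   single MOV is considered simple (2 <= 4 instructions), whereas one that
   would need four MOV/MOVKs per half is not, so the caller can fall back
   to loading it from memory instead.  */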
2414 /* Return the number of temporary registers that aarch64_add_offset_1
2415 would need to add OFFSET to a register. */
2417 static unsigned int
2418 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2420 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2423 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2424 a non-polynomial OFFSET. MODE is the mode of the addition.
2425 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2426 be set and CFA adjustments added to the generated instructions.
2428 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2429 temporary if register allocation is already complete. This temporary
2430 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2431 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2432 the immediate again.
2434 Since this function may be used to adjust the stack pointer, we must
2435 ensure that it cannot cause transient stack deallocation (for example
2436 by first incrementing SP and then decrementing when adjusting by a
2437 large immediate). */
2439 static void
2440 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2441 rtx src, HOST_WIDE_INT offset, rtx temp1,
2442 bool frame_related_p, bool emit_move_imm)
2444 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2445 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2447 HOST_WIDE_INT moffset = abs_hwi (offset);
2448 rtx_insn *insn;
2450 if (!moffset)
2452 if (!rtx_equal_p (dest, src))
2454 insn = emit_insn (gen_rtx_SET (dest, src));
2455 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2457 return;
2460 /* Single instruction adjustment. */
2461 if (aarch64_uimm12_shift (moffset))
2463 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2464 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2465 return;
2468 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2469 and either:
2471 a) the offset cannot be loaded by a 16-bit move or
2472 b) there is no spare register into which we can move it. */
2473 if (moffset < 0x1000000
2474 && ((!temp1 && !can_create_pseudo_p ())
2475 || !aarch64_move_imm (moffset, mode)))
2477 HOST_WIDE_INT low_off = moffset & 0xfff;
2479 low_off = offset < 0 ? -low_off : low_off;
2480 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2481 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2482 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2483 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2484 return;
2487 /* Emit a move immediate if required and an addition/subtraction. */
2488 if (emit_move_imm)
2490 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2491 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2493 insn = emit_insn (offset < 0
2494 ? gen_sub3_insn (dest, src, temp1)
2495 : gen_add3_insn (dest, src, temp1));
2496 if (frame_related_p)
2498 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2499 rtx adj = plus_constant (mode, src, offset);
2500 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
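/* Illustrative: an offset of 0x123456 is neither a shifted 12-bit
   immediate nor a valid MOV immediate, so the code above emits two
   additions, roughly "add dest, src, #0x456" followed by
   "add dest, dest, #0x123000".  */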
2504 /* Return the number of temporary registers that aarch64_add_offset
2505 would need to move OFFSET into a register or add OFFSET to a register;
2506 ADD_P is true if we want the latter rather than the former. */
2508 static unsigned int
2509 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2511 /* This follows the same structure as aarch64_add_offset. */
2512 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2513 return 0;
2515 unsigned int count = 0;
2516 HOST_WIDE_INT factor = offset.coeffs[1];
2517 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2518 poly_int64 poly_offset (factor, factor);
2519 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2520 /* Need one register for the ADDVL/ADDPL result. */
2521 count += 1;
2522 else if (factor != 0)
2524 factor = abs (factor);
2525 if (factor > 16 * (factor & -factor))
2526 /* Need one register for the CNT result and one for the multiplication
2527 factor. If necessary, the second temporary can be reused for the
2528 constant part of the offset. */
2529 return 2;
2530 /* Need one register for the CNT result (which might then
2531 be shifted). */
2532 count += 1;
2534 return count + aarch64_add_offset_1_temporaries (constant);
2537 /* If X can be represented as a poly_int64, return the number
2538 of temporaries that are required to add it to a register.
2539 Return -1 otherwise. */
2542 aarch64_add_offset_temporaries (rtx x)
2544 poly_int64 offset;
2545 if (!poly_int_rtx_p (x, &offset))
2546 return -1;
2547 return aarch64_offset_temporaries (true, offset);
2550 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2551 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2552 be set and CFA adjustments added to the generated instructions.
2554 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2555 temporary if register allocation is already complete. This temporary
2556 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2557 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2558 false to avoid emitting the immediate again.
2560 TEMP2, if nonnull, is a second temporary register that doesn't
2561 overlap either DEST or SRC.
2563 Since this function may be used to adjust the stack pointer, we must
2564 ensure that it cannot cause transient stack deallocation (for example
2565 by first incrementing SP and then decrementing when adjusting by a
2566 large immediate). */
2568 static void
2569 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2570 poly_int64 offset, rtx temp1, rtx temp2,
2571 bool frame_related_p, bool emit_move_imm = true)
2573 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2574 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2575 gcc_assert (temp1 == NULL_RTX
2576 || !frame_related_p
2577 || !reg_overlap_mentioned_p (temp1, dest));
2578 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2580 /* Try using ADDVL or ADDPL to add the whole value. */
2581 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2583 rtx offset_rtx = gen_int_mode (offset, mode);
2584 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2585 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2586 return;
2589 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2590 SVE vector register, over and above the minimum size of 128 bits.
2591 This is equivalent to half the value returned by CNTD with a
2592 vector shape of ALL. */
2593 HOST_WIDE_INT factor = offset.coeffs[1];
2594 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2596 /* Try using ADDVL or ADDPL to add the VG-based part. */
2597 poly_int64 poly_offset (factor, factor);
2598 if (src != const0_rtx
2599 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2601 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2602 if (frame_related_p)
2604 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2605 RTX_FRAME_RELATED_P (insn) = true;
2606 src = dest;
2608 else
2610 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2611 src = aarch64_force_temporary (mode, temp1, addr);
2612 temp1 = temp2;
2613 temp2 = NULL_RTX;
2616 /* Otherwise use a CNT-based sequence. */
2617 else if (factor != 0)
2619 /* Use a subtraction if we have a negative factor. */
2620 rtx_code code = PLUS;
2621 if (factor < 0)
2623 factor = -factor;
2624 code = MINUS;
2627 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2628 into the multiplication. */
2629 rtx val;
2630 int shift = 0;
2631 if (factor & 1)
2632 /* Use a right shift by 1. */
2633 shift = -1;
2634 else
2635 factor /= 2;
2636 HOST_WIDE_INT low_bit = factor & -factor;
2637 if (factor <= 16 * low_bit)
2639 if (factor > 16 * 8)
2641 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2642 the value with the minimum multiplier and shift it into
2643 position. */
2644 int extra_shift = exact_log2 (low_bit);
2645 shift += extra_shift;
2646 factor >>= extra_shift;
2648 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2650 else
2652 /* Use CNTD, then multiply it by FACTOR. */
2653 val = gen_int_mode (poly_int64 (2, 2), mode);
2654 val = aarch64_force_temporary (mode, temp1, val);
2656 /* Go back to using a negative multiplication factor if we have
2657 no register from which to subtract. */
2658 if (code == MINUS && src == const0_rtx)
2660 factor = -factor;
2661 code = PLUS;
2663 rtx coeff1 = gen_int_mode (factor, mode);
2664 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2665 val = gen_rtx_MULT (mode, val, coeff1);
2668 if (shift > 0)
2670 /* Multiply by 1 << SHIFT. */
2671 val = aarch64_force_temporary (mode, temp1, val);
2672 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2674 else if (shift == -1)
2676 /* Divide by 2. */
2677 val = aarch64_force_temporary (mode, temp1, val);
2678 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2681 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2682 if (src != const0_rtx)
2684 val = aarch64_force_temporary (mode, temp1, val);
2685 val = gen_rtx_fmt_ee (code, mode, src, val);
2687 else if (code == MINUS)
2689 val = aarch64_force_temporary (mode, temp1, val);
2690 val = gen_rtx_NEG (mode, val);
2693 if (constant == 0 || frame_related_p)
2695 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2696 if (frame_related_p)
2698 RTX_FRAME_RELATED_P (insn) = true;
2699 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2700 gen_rtx_SET (dest, plus_constant (Pmode, src,
2701 poly_offset)));
2703 src = dest;
2704 if (constant == 0)
2705 return;
2707 else
2709 src = aarch64_force_temporary (mode, temp1, val);
2710 temp1 = temp2;
2711 temp2 = NULL_RTX;
2714 emit_move_imm = true;
2717 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2718 frame_related_p, emit_move_imm);
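/* Illustrative: with SRC in a register, an offset of one full SVE vector
   (poly_int64 (16, 16) bytes) is handled by the ADDVL branch above as a
   single "addvl dest, src, #1"; a leftover constant part, e.g. for
   poly_int64 (17, 16), is then added by aarch64_add_offset_1.  */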
2721 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2722 than a poly_int64. */
2724 void
2725 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2726 rtx offset_rtx, rtx temp1, rtx temp2)
2728 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2729 temp1, temp2, false);
2732 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2733 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2734 if TEMP1 already contains abs (DELTA). */
2736 static inline void
2737 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2739 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2740 temp1, temp2, true, emit_move_imm);
2743 /* Subtract DELTA from the stack pointer, marking the instructions
2744 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2745 if nonnull. */
2747 static inline void
2748 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2750 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2751 temp1, temp2, frame_related_p);
2754 /* Set DEST to (vec_series BASE STEP). */
2756 static void
2757 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2759 machine_mode mode = GET_MODE (dest);
2760 scalar_mode inner = GET_MODE_INNER (mode);
2762 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2763 if (!aarch64_sve_index_immediate_p (base))
2764 base = force_reg (inner, base);
2765 if (!aarch64_sve_index_immediate_p (step))
2766 step = force_reg (inner, step);
2768 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2771 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2772 integer of mode INT_MODE. Return true on success. */
2774 static bool
2775 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2776 rtx src)
2778 /* If the constant is smaller than 128 bits, we can do the move
2779 using a vector of SRC_MODEs. */
2780 if (src_mode != TImode)
2782 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2783 GET_MODE_SIZE (src_mode));
2784 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2785 emit_move_insn (gen_lowpart (dup_mode, dest),
2786 gen_const_vec_duplicate (dup_mode, src));
2787 return true;
2790 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2791 src = force_const_mem (src_mode, src);
2792 if (!src)
2793 return false;
2795 /* Make sure that the address is legitimate. */
2796 if (!aarch64_sve_ld1r_operand_p (src))
2798 rtx addr = force_reg (Pmode, XEXP (src, 0));
2799 src = replace_equiv_address (src, addr);
2802 machine_mode mode = GET_MODE (dest);
2803 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2804 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2805 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2806 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2807 emit_insn (gen_rtx_SET (dest, src));
2808 return true;
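/* For example, a VNx8HI constant that repeats two halfword values can be
   re-expressed as a VNx4SI duplicate of the combined 32-bit value and
   moved directly, whereas a full 128-bit repeating pattern is spilled to
   the constant pool and broadcast with LD1RQ under an all-true
   predicate.  */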
2811 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2812 isn't a simple duplicate or series. */
2814 static void
2815 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2817 machine_mode mode = GET_MODE (src);
2818 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2819 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2820 gcc_assert (npatterns > 1);
2822 if (nelts_per_pattern == 1)
2824 /* The constant is a repeating sequence of at least two elements,
2825 where the repeating elements occupy no more than 128 bits.
2826 Get an integer representation of the replicated value. */
2827 scalar_int_mode int_mode;
2828 if (BYTES_BIG_ENDIAN)
2829 /* For now, always use LD1RQ to load the value on big-endian
2830 targets, since the handling of smaller integers includes a
2831 subreg that is semantically an element reverse. */
2832 int_mode = TImode;
2833 else
2835 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2836 gcc_assert (int_bits <= 128);
2837 int_mode = int_mode_for_size (int_bits, 0).require ();
2839 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2840 if (int_value
2841 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2842 return;
2845 /* Expand each pattern individually. */
2846 rtx_vector_builder builder;
2847 auto_vec<rtx, 16> vectors (npatterns);
2848 for (unsigned int i = 0; i < npatterns; ++i)
2850 builder.new_vector (mode, 1, nelts_per_pattern);
2851 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2852 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2853 vectors.quick_push (force_reg (mode, builder.build ()));
2856 /* Use permutes to interleave the separate vectors. */
2857 while (npatterns > 1)
2859 npatterns /= 2;
2860 for (unsigned int i = 0; i < npatterns; ++i)
2862 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2863 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2864 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2865 vectors[i] = tmp;
2868 gcc_assert (vectors[0] == dest);
2871 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2872 is a pattern that can be used to set DEST to a replicated scalar
2873 element. */
2875 void
2876 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2877 rtx (*gen_vec_duplicate) (rtx, rtx))
2879 machine_mode mode = GET_MODE (dest);
2881 /* Check on what type of symbol it is. */
2882 scalar_int_mode int_mode;
2883 if ((GET_CODE (imm) == SYMBOL_REF
2884 || GET_CODE (imm) == LABEL_REF
2885 || GET_CODE (imm) == CONST
2886 || GET_CODE (imm) == CONST_POLY_INT)
2887 && is_a <scalar_int_mode> (mode, &int_mode))
2889 rtx mem;
2890 poly_int64 offset;
2891 HOST_WIDE_INT const_offset;
2892 enum aarch64_symbol_type sty;
2894 /* If we have (const (plus symbol offset)), separate out the offset
2895 before we start classifying the symbol. */
2896 rtx base = strip_offset (imm, &offset);
2898 /* We must always add an offset involving VL separately, rather than
2899 folding it into the relocation. */
2900 if (!offset.is_constant (&const_offset))
2902 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2903 emit_insn (gen_rtx_SET (dest, imm));
2904 else
2906 /* Do arithmetic on 32-bit values if the result is smaller
2907 than that. */
2908 if (partial_subreg_p (int_mode, SImode))
2910 /* It is invalid to do symbol calculations in modes
2911 narrower than SImode. */
2912 gcc_assert (base == const0_rtx);
2913 dest = gen_lowpart (SImode, dest);
2914 int_mode = SImode;
2916 if (base != const0_rtx)
2918 base = aarch64_force_temporary (int_mode, dest, base);
2919 aarch64_add_offset (int_mode, dest, base, offset,
2920 NULL_RTX, NULL_RTX, false);
2922 else
2923 aarch64_add_offset (int_mode, dest, base, offset,
2924 dest, NULL_RTX, false);
2926 return;
2929 sty = aarch64_classify_symbol (base, const_offset);
2930 switch (sty)
2932 case SYMBOL_FORCE_TO_MEM:
2933 if (const_offset != 0
2934 && targetm.cannot_force_const_mem (int_mode, imm))
2936 gcc_assert (can_create_pseudo_p ());
2937 base = aarch64_force_temporary (int_mode, dest, base);
2938 aarch64_add_offset (int_mode, dest, base, const_offset,
2939 NULL_RTX, NULL_RTX, false);
2940 return;
2943 mem = force_const_mem (ptr_mode, imm);
2944 gcc_assert (mem);
2946 /* If we aren't generating PC relative literals, then
2947 we need to expand the literal pool access carefully.
2948 This is something that needs to be done in a number
2949 of places, so could well live as a separate function. */
2950 if (!aarch64_pcrelative_literal_loads)
2952 gcc_assert (can_create_pseudo_p ());
2953 base = gen_reg_rtx (ptr_mode);
2954 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2955 if (ptr_mode != Pmode)
2956 base = convert_memory_address (Pmode, base);
2957 mem = gen_rtx_MEM (ptr_mode, base);
2960 if (int_mode != ptr_mode)
2961 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2963 emit_insn (gen_rtx_SET (dest, mem));
2965 return;
2967 case SYMBOL_SMALL_TLSGD:
2968 case SYMBOL_SMALL_TLSDESC:
2969 case SYMBOL_SMALL_TLSIE:
2970 case SYMBOL_SMALL_GOT_28K:
2971 case SYMBOL_SMALL_GOT_4G:
2972 case SYMBOL_TINY_GOT:
2973 case SYMBOL_TINY_TLSIE:
2974 if (const_offset != 0)
2976 gcc_assert (can_create_pseudo_p ());
2977 base = aarch64_force_temporary (int_mode, dest, base);
2978 aarch64_add_offset (int_mode, dest, base, const_offset,
2979 NULL_RTX, NULL_RTX, false);
2980 return;
2982 /* FALLTHRU */
2984 case SYMBOL_SMALL_ABSOLUTE:
2985 case SYMBOL_TINY_ABSOLUTE:
2986 case SYMBOL_TLSLE12:
2987 case SYMBOL_TLSLE24:
2988 case SYMBOL_TLSLE32:
2989 case SYMBOL_TLSLE48:
2990 aarch64_load_symref_appropriately (dest, imm, sty);
2991 return;
2993 default:
2994 gcc_unreachable ();
2998 if (!CONST_INT_P (imm))
3000 rtx base, step, value;
3001 if (GET_CODE (imm) == HIGH
3002 || aarch64_simd_valid_immediate (imm, NULL))
3003 emit_insn (gen_rtx_SET (dest, imm));
3004 else if (const_vec_series_p (imm, &base, &step))
3005 aarch64_expand_vec_series (dest, base, step);
3006 else if (const_vec_duplicate_p (imm, &value))
3008 /* If the constant is out of range of an SVE vector move,
3009 load it from memory if we can, otherwise move it into
3010 a register and use a DUP. */
3011 scalar_mode inner_mode = GET_MODE_INNER (mode);
3012 rtx op = force_const_mem (inner_mode, value);
3013 if (!op)
3014 op = force_reg (inner_mode, value);
3015 else if (!aarch64_sve_ld1r_operand_p (op))
3017 rtx addr = force_reg (Pmode, XEXP (op, 0));
3018 op = replace_equiv_address (op, addr);
3020 emit_insn (gen_vec_duplicate (dest, op));
3022 else if (GET_CODE (imm) == CONST_VECTOR
3023 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3024 aarch64_expand_sve_const_vector (dest, imm);
3025 else
3027 rtx mem = force_const_mem (mode, imm);
3028 gcc_assert (mem);
3029 emit_move_insn (dest, mem);
3032 return;
3035 aarch64_internal_mov_immediate (dest, imm, true,
3036 as_a <scalar_int_mode> (mode));
3039 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3040 that is known to contain PTRUE. */
3042 void
3043 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3045 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3046 gen_rtvec (2, pred, src),
3047 UNSPEC_MERGE_PTRUE)));
3050 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3051 operand is in memory. In this case we need to use the predicated LD1
3052 and ST1 instead of LDR and STR, both for correctness on big-endian
3053 targets and because LD1 and ST1 support a wider range of addressing modes.
3054 PRED_MODE is the mode of the predicate.
3056 See the comment at the head of aarch64-sve.md for details about the
3057 big-endian handling. */
3059 void
3060 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3062 machine_mode mode = GET_MODE (dest);
3063 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3064 if (!register_operand (src, mode)
3065 && !register_operand (dest, mode))
3067 rtx tmp = gen_reg_rtx (mode);
3068 if (MEM_P (src))
3069 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3070 else
3071 emit_move_insn (tmp, src);
3072 src = tmp;
3074 aarch64_emit_sve_pred_move (dest, ptrue, src);
3077 static bool
3078 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3079 tree exp ATTRIBUTE_UNUSED)
3081 /* Currently, always true. */
3082 return true;
3085 /* Implement TARGET_PASS_BY_REFERENCE. */
3087 static bool
3088 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3089 machine_mode mode,
3090 const_tree type,
3091 bool named ATTRIBUTE_UNUSED)
3093 HOST_WIDE_INT size;
3094 machine_mode dummymode;
3095 int nregs;
3097 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3098 if (mode == BLKmode && type)
3099 size = int_size_in_bytes (type);
3100 else
3101 /* No frontends can create types with variable-sized modes, so we
3102 shouldn't be asked to pass or return them. */
3103 size = GET_MODE_SIZE (mode).to_constant ();
3105 /* Aggregates are passed by reference based on their size. */
3106 if (type && AGGREGATE_TYPE_P (type))
3108 size = int_size_in_bytes (type);
3111 /* Variable sized arguments are always returned by reference. */
3112 if (size < 0)
3113 return true;
3115 /* Can this be a candidate to be passed in fp/simd register(s)? */
3116 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3117 &dummymode, &nregs,
3118 NULL))
3119 return false;
3121 /* Arguments which are variable sized or larger than 2 registers are
3122 passed by reference unless they are a homogeneous floating-point
3123 aggregate. */
3124 return size > 2 * UNITS_PER_WORD;
3127 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3128 static bool
3129 aarch64_return_in_msb (const_tree valtype)
3131 machine_mode dummy_mode;
3132 int dummy_int;
3134 /* Never happens in little-endian mode. */
3135 if (!BYTES_BIG_ENDIAN)
3136 return false;
3138 /* Only composite types smaller than or equal to 16 bytes can
3139 potentially be returned in registers. */
3140 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3141 || int_size_in_bytes (valtype) <= 0
3142 || int_size_in_bytes (valtype) > 16)
3143 return false;
3145 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3146 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3147 is always passed/returned in the least significant bits of fp/simd
3148 register(s). */
3149 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3150 &dummy_mode, &dummy_int, NULL))
3151 return false;
3153 return true;
3156 /* Implement TARGET_FUNCTION_VALUE.
3157 Define how to find the value returned by a function. */
3159 static rtx
3160 aarch64_function_value (const_tree type, const_tree func,
3161 bool outgoing ATTRIBUTE_UNUSED)
3163 machine_mode mode;
3164 int unsignedp;
3165 int count;
3166 machine_mode ag_mode;
3168 mode = TYPE_MODE (type);
3169 if (INTEGRAL_TYPE_P (type))
3170 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3172 if (aarch64_return_in_msb (type))
3174 HOST_WIDE_INT size = int_size_in_bytes (type);
3176 if (size % UNITS_PER_WORD != 0)
3178 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3179 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3183 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3184 &ag_mode, &count, NULL))
3186 if (!aarch64_composite_type_p (type, mode))
3188 gcc_assert (count == 1 && mode == ag_mode);
3189 return gen_rtx_REG (mode, V0_REGNUM);
3191 else
3193 int i;
3194 rtx par;
3196 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3197 for (i = 0; i < count; i++)
3199 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3200 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3201 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3202 XVECEXP (par, 0, i) = tmp;
3204 return par;
3207 else
3208 return gen_rtx_REG (mode, R0_REGNUM);
3211 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3212 Return true if REGNO is the number of a hard register in which the values
3213 of called function may come back. */
3215 static bool
3216 aarch64_function_value_regno_p (const unsigned int regno)
3218 /* A maximum of 16 bytes can be returned in the general registers. Examples
3219 of 16-byte return values are: 128-bit integers and 16-byte small
3220 structures (excluding homogeneous floating-point aggregates). */
3221 if (regno == R0_REGNUM || regno == R1_REGNUM)
3222 return true;
3224 /* Up to four fp/simd registers can return a function value, e.g. a
3225 homogeneous floating-point aggregate having four members. */
3226 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3227 return TARGET_FLOAT;
3229 return false;
3232 /* Implement TARGET_RETURN_IN_MEMORY.
3234 If the type T of the result of a function is such that
3235 void func (T arg)
3236 would require that arg be passed as a value in a register (or set of
3237 registers) according to the parameter passing rules, then the result
3238 is returned in the same registers as would be used for such an
3239 argument. */
3241 static bool
3242 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3244 HOST_WIDE_INT size;
3245 machine_mode ag_mode;
3246 int count;
3248 if (!AGGREGATE_TYPE_P (type)
3249 && TREE_CODE (type) != COMPLEX_TYPE
3250 && TREE_CODE (type) != VECTOR_TYPE)
3251 /* Simple scalar types are always returned in registers. */
3252 return false;
3254 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3255 type,
3256 &ag_mode,
3257 &count,
3258 NULL))
3259 return false;
3261 /* Types larger than 2 registers are returned in memory. */
3262 size = int_size_in_bytes (type);
3263 return (size < 0 || size > 2 * UNITS_PER_WORD);
3266 static bool
3267 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3268 const_tree type, int *nregs)
3270 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3271 return aarch64_vfp_is_call_or_return_candidate (mode,
3272 type,
3273 &pcum->aapcs_vfp_rmode,
3274 nregs,
3275 NULL);
3278 /* Given MODE and TYPE of a function argument, return the alignment in
3279 bits. The idea is to suppress any stronger alignment requested by
3280 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3281 This is a helper function for local use only. */
3283 static unsigned int
3284 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3286 if (!type)
3287 return GET_MODE_ALIGNMENT (mode);
3289 if (integer_zerop (TYPE_SIZE (type)))
3290 return 0;
3292 gcc_assert (TYPE_MODE (type) == mode);
3294 if (!AGGREGATE_TYPE_P (type))
3295 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3297 if (TREE_CODE (type) == ARRAY_TYPE)
3298 return TYPE_ALIGN (TREE_TYPE (type));
3300 unsigned int alignment = 0;
3301 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3302 if (TREE_CODE (field) == FIELD_DECL)
3303 alignment = std::max (alignment, DECL_ALIGN (field));
3305 return alignment;
3308 /* Layout a function argument according to the AAPCS64 rules. The rule
3309 numbers refer to the rule numbers in the AAPCS64. */
3311 static void
3312 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3313 const_tree type,
3314 bool named ATTRIBUTE_UNUSED)
3316 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3317 int ncrn, nvrn, nregs;
3318 bool allocate_ncrn, allocate_nvrn;
3319 HOST_WIDE_INT size;
3321 /* We need to do this once per argument. */
3322 if (pcum->aapcs_arg_processed)
3323 return;
3325 pcum->aapcs_arg_processed = true;
3327 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3328 if (type)
3329 size = int_size_in_bytes (type);
3330 else
3331 /* No frontends can create types with variable-sized modes, so we
3332 shouldn't be asked to pass or return them. */
3333 size = GET_MODE_SIZE (mode).to_constant ();
3334 size = ROUND_UP (size, UNITS_PER_WORD);
3336 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3337 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3338 mode,
3339 type,
3340 &nregs);
3342 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3343 The following code thus handles passing by SIMD/FP registers first. */
3345 nvrn = pcum->aapcs_nvrn;
3347 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3348 and homogeneous short-vector aggregates (HVA). */
3349 if (allocate_nvrn)
3351 if (!TARGET_FLOAT)
3352 aarch64_err_no_fpadvsimd (mode, "argument");
3354 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3356 pcum->aapcs_nextnvrn = nvrn + nregs;
3357 if (!aarch64_composite_type_p (type, mode))
3359 gcc_assert (nregs == 1);
3360 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3362 else
3364 rtx par;
3365 int i;
3366 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3367 for (i = 0; i < nregs; i++)
3369 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3370 V0_REGNUM + nvrn + i);
3371 rtx offset = gen_int_mode
3372 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3373 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3374 XVECEXP (par, 0, i) = tmp;
3376 pcum->aapcs_reg = par;
3378 return;
3380 else
3382 /* C.3 NSRN is set to 8. */
3383 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3384 goto on_stack;
3388 ncrn = pcum->aapcs_ncrn;
3389 nregs = size / UNITS_PER_WORD;
3391 /* C6 - C9, though the sign and zero extension semantics are
3392 handled elsewhere. This is the case where the argument fits
3393 entirely in general registers. */
3394 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3397 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3399 /* C.8 if the argument has an alignment of 16 then the NGRN is
3400 rounded up to the next even number. */
3401 if (nregs == 2
3402 && ncrn % 2
3403 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3404 comparison is there because for > 16 * BITS_PER_UNIT
3405 alignment nregs should be > 2 and therefore it should be
3406 passed by reference rather than value. */
3407 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3409 ++ncrn;
3410 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3413 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3414 A reg is still generated for it, but the caller should be smart
3415 enough not to use it. */
3416 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3417 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3418 else
3420 rtx par;
3421 int i;
3423 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3424 for (i = 0; i < nregs; i++)
3426 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3427 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3428 GEN_INT (i * UNITS_PER_WORD));
3429 XVECEXP (par, 0, i) = tmp;
3431 pcum->aapcs_reg = par;
3434 pcum->aapcs_nextncrn = ncrn + nregs;
3435 return;
3438 /* C.11 */
3439 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3441 /* The argument is passed on stack; record the needed number of words for
3442 this argument and align the total size if necessary. */
3443 on_stack:
3444 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3446 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3447 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3448 16 / UNITS_PER_WORD);
3449 return;
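/* Illustrative application of rule C.8 above: a 16-byte-aligned
   two-register argument such as an __int128 that would otherwise start in
   an odd-numbered core register is moved up to the next even register
   pair before being assigned.  */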
3452 /* Implement TARGET_FUNCTION_ARG. */
3454 static rtx
3455 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3456 const_tree type, bool named)
3458 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3459 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3461 if (mode == VOIDmode)
3462 return NULL_RTX;
3464 aarch64_layout_arg (pcum_v, mode, type, named);
3465 return pcum->aapcs_reg;
3468 void
3469 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3470 const_tree fntype ATTRIBUTE_UNUSED,
3471 rtx libname ATTRIBUTE_UNUSED,
3472 const_tree fndecl ATTRIBUTE_UNUSED,
3473 unsigned n_named ATTRIBUTE_UNUSED)
3475 pcum->aapcs_ncrn = 0;
3476 pcum->aapcs_nvrn = 0;
3477 pcum->aapcs_nextncrn = 0;
3478 pcum->aapcs_nextnvrn = 0;
3479 pcum->pcs_variant = ARM_PCS_AAPCS64;
3480 pcum->aapcs_reg = NULL_RTX;
3481 pcum->aapcs_arg_processed = false;
3482 pcum->aapcs_stack_words = 0;
3483 pcum->aapcs_stack_size = 0;
3485 if (!TARGET_FLOAT
3486 && fndecl && TREE_PUBLIC (fndecl)
3487 && fntype && fntype != error_mark_node)
3489 const_tree type = TREE_TYPE (fntype);
3490 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3491 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3492 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3493 &mode, &nregs, NULL))
3494 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3496 return;
3499 static void
3500 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3501 machine_mode mode,
3502 const_tree type,
3503 bool named)
3505 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3506 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3508 aarch64_layout_arg (pcum_v, mode, type, named);
3509 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3510 != (pcum->aapcs_stack_words != 0));
3511 pcum->aapcs_arg_processed = false;
3512 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3513 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3514 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3515 pcum->aapcs_stack_words = 0;
3516 pcum->aapcs_reg = NULL_RTX;
3520 bool
3521 aarch64_function_arg_regno_p (unsigned regno)
3523 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3524 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3527 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3528 PARM_BOUNDARY bits of alignment, but will be given anything up
3529 to STACK_BOUNDARY bits if the type requires it. This makes sure
3530 that both before and after the layout of each argument, the Next
3531 Stacked Argument Address (NSAA) will have a minimum alignment of
3532 8 bytes. */
3534 static unsigned int
3535 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3537 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3538 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
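/* On this target PARM_BOUNDARY is 64 and STACK_BOUNDARY is 128 bits, so
   for example a plain char argument still gets a 64-bit slot boundary,
   while an overaligned type is capped at 128 bits.  */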
3541 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3543 static fixed_size_mode
3544 aarch64_get_reg_raw_mode (int regno)
3546 if (TARGET_SVE && FP_REGNUM_P (regno))
3547 /* Don't use the SVE part of the register for __builtin_apply and
3548 __builtin_return. The SVE registers aren't used by the normal PCS,
3549 so using them there would be a waste of time. The PCS extensions
3550 for SVE types are fundamentally incompatible with the
3551 __builtin_return/__builtin_apply interface. */
3552 return as_a <fixed_size_mode> (V16QImode);
3553 return default_get_reg_raw_mode (regno);
3556 /* Implement TARGET_FUNCTION_ARG_PADDING.
3558 Small aggregate types are placed in the lowest memory address.
3560 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3562 static pad_direction
3563 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3565 /* On little-endian targets, the least significant byte of every stack
3566 argument is passed at the lowest byte address of the stack slot. */
3567 if (!BYTES_BIG_ENDIAN)
3568 return PAD_UPWARD;
3570 /* Otherwise, integral, floating-point and pointer types are padded downward:
3571 the least significant byte of a stack argument is passed at the highest
3572 byte address of the stack slot. */
3573 if (type
3574 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3575 || POINTER_TYPE_P (type))
3576 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3577 return PAD_DOWNWARD;
3579 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3580 return PAD_UPWARD;
3583 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3585 It specifies padding for the last (may also be the only)
3586 element of a block move between registers and memory. Assuming
3587 the block is in memory, padding upward means that
3588 the last element is padded after its most significant byte,
3589 while in downward padding, the last element is padded at
3590 its least significant byte side.
3592 Small aggregates and small complex types are always padded
3593 upwards.
3595 We don't need to worry about homogeneous floating-point or
3596 short-vector aggregates; their move is not affected by the
3597 padding direction determined here. Regardless of endianness,
3598 each element of such an aggregate is put in the least
3599 significant bits of a fp/simd register.
3601 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3602 register has useful data, and return the opposite if the most
3603 significant byte does. */
3605 bool
3606 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3607 bool first ATTRIBUTE_UNUSED)
3610 /* Small composite types are always padded upward. */
3611 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3613 HOST_WIDE_INT size;
3614 if (type)
3615 size = int_size_in_bytes (type);
3616 else
3617 /* No frontends can create types with variable-sized modes, so we
3618 shouldn't be asked to pass or return them. */
3619 size = GET_MODE_SIZE (mode).to_constant ();
3620 if (size < 2 * UNITS_PER_WORD)
3621 return true;
3624 /* Otherwise, use the default padding. */
3625 return !BYTES_BIG_ENDIAN;
3628 static scalar_int_mode
3629 aarch64_libgcc_cmp_return_mode (void)
3631 return SImode;
3634 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3636 /* We use the 12-bit shifted immediate arithmetic instructions so values
3637 must be a multiple of (1 << 12), i.e. 4096. */
3638 #define ARITH_FACTOR 4096
3640 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3641 #error Cannot use simple address calculation for stack probing
3642 #endif
3644 /* The pair of scratch registers used for stack probing. */
3645 #define PROBE_STACK_FIRST_REG 9
3646 #define PROBE_STACK_SECOND_REG 10
3648 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3649 inclusive. These are offsets from the current stack pointer. */
3651 static void
3652 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3654 HOST_WIDE_INT size;
3655 if (!poly_size.is_constant (&size))
3657 sorry ("stack probes for SVE frames");
3658 return;
3661 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3663 /* See the same assertion on PROBE_INTERVAL above. */
3664 gcc_assert ((first % ARITH_FACTOR) == 0);
3666 /* See if we have a constant small number of probes to generate. If so,
3667 that's the easy case. */
3668 if (size <= PROBE_INTERVAL)
3670 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3672 emit_set_insn (reg1,
3673 plus_constant (Pmode,
3674 stack_pointer_rtx, -(first + base)));
3675 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3678 /* The run-time loop is made up of 8 insns in the generic case while the
3679 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3680 else if (size <= 4 * PROBE_INTERVAL)
3682 HOST_WIDE_INT i, rem;
3684 emit_set_insn (reg1,
3685 plus_constant (Pmode,
3686 stack_pointer_rtx,
3687 -(first + PROBE_INTERVAL)));
3688 emit_stack_probe (reg1);
3690 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3691 it exceeds SIZE. If only two probes are needed, this will not
3692 generate any code. Then probe at FIRST + SIZE. */
3693 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3695 emit_set_insn (reg1,
3696 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3697 emit_stack_probe (reg1);
3700 rem = size - (i - PROBE_INTERVAL);
3701 if (rem > 256)
3703 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3705 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3706 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3708 else
3709 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3712 /* Otherwise, do the same as above, but in a loop. Note that we must be
3713 extra careful with variables wrapping around because we might be at
3714 the very top (or the very bottom) of the address space and we have
3715 to be able to handle this case properly; in particular, we use an
3716 equality test for the loop condition. */
3717 else
3719 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3721 /* Step 1: round SIZE to the previous multiple of the interval. */
3723 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3726 /* Step 2: compute initial and final value of the loop counter. */
3728 /* TEST_ADDR = SP + FIRST. */
3729 emit_set_insn (reg1,
3730 plus_constant (Pmode, stack_pointer_rtx, -first));
3732 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3733 HOST_WIDE_INT adjustment = - (first + rounded_size);
3734 if (! aarch64_uimm12_shift (adjustment))
3736 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3737 true, Pmode);
3738 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3740 else
3742 emit_set_insn (reg2,
3743 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3746 /* Step 3: the loop
3750 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3751 probe at TEST_ADDR
3753 while (TEST_ADDR != LAST_ADDR)
3755 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3756 until it is equal to ROUNDED_SIZE. */
3758 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3761 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3762 that SIZE is equal to ROUNDED_SIZE. */
3764 if (size != rounded_size)
3766 HOST_WIDE_INT rem = size - rounded_size;
3768 if (rem > 256)
3770 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3772 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3773 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3775 else
3776 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3780 /* Make sure nothing is scheduled before we are done. */
3781 emit_insn (gen_blockage ());
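/* Illustrative: with the default 4 KiB PROBE_INTERVAL, an 8 KiB
   constant-size region takes the unrolled path above (probes at offsets
   FIRST + 4K and FIRST + 8K), while larger regions use the
   probe_stack_range loop, with one extra probe when SIZE is not a
   multiple of the interval.  */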
3784 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3785 absolute addresses. */
3787 const char *
3788 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3790 static int labelno = 0;
3791 char loop_lab[32];
3792 rtx xops[2];
3794 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3796 /* Loop. */
3797 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3799 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3800 xops[0] = reg1;
3801 xops[1] = GEN_INT (PROBE_INTERVAL);
3802 output_asm_insn ("sub\t%0, %0, %1", xops);
3804 /* Probe at TEST_ADDR. */
3805 output_asm_insn ("str\txzr, [%0]", xops);
3807 /* Test if TEST_ADDR == LAST_ADDR. */
3808 xops[1] = reg2;
3809 output_asm_insn ("cmp\t%0, %1", xops);
3811 /* Branch. */
3812 fputs ("\tb.ne\t", asm_out_file);
3813 assemble_name_raw (asm_out_file, loop_lab);
3814 fputc ('\n', asm_out_file);
3816 return "";
3819 /* Mark the registers that need to be saved by the callee and calculate
3820 the size of the callee-saved registers area and frame record (both FP
3821 and LR may be omitted). */
3822 static void
3823 aarch64_layout_frame (void)
3825 HOST_WIDE_INT offset = 0;
3826 int regno, last_fp_reg = INVALID_REGNUM;
3828 if (reload_completed && cfun->machine->frame.laid_out)
3829 return;
3831 /* Force a frame chain for EH returns so the return address is at FP+8. */
3832 cfun->machine->frame.emit_frame_chain
3833 = frame_pointer_needed || crtl->calls_eh_return;
3835 /* Emit a frame chain if the frame pointer is enabled.
3836 If -momit-leaf-frame-pointer is used, do not use a frame chain
3837 in leaf functions which do not use LR. */
3838 if (flag_omit_frame_pointer == 2
3839 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3840 && !df_regs_ever_live_p (LR_REGNUM)))
3841 cfun->machine->frame.emit_frame_chain = true;
3843 #define SLOT_NOT_REQUIRED (-2)
3844 #define SLOT_REQUIRED (-1)
3846 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3847 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3849 /* First mark all the registers that really need to be saved... */
3850 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3851 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3853 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3854 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3856 /* ... that includes the eh data registers (if needed)... */
3857 if (crtl->calls_eh_return)
3858 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3859 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3860 = SLOT_REQUIRED;
3862 /* ... and any callee saved register that dataflow says is live. */
3863 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3864 if (df_regs_ever_live_p (regno)
3865 && (regno == R30_REGNUM
3866 || !call_used_regs[regno]))
3867 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3869 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3870 if (df_regs_ever_live_p (regno)
3871 && !call_used_regs[regno])
3873 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3874 last_fp_reg = regno;
3877 if (cfun->machine->frame.emit_frame_chain)
3879 /* FP and LR are placed in the linkage record. */
3880 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
3881 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
3882 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
3883 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
3884 offset = 2 * UNITS_PER_WORD;
3887 /* Now assign stack slots for them. */
3888 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3889 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
3891 cfun->machine->frame.reg_offset[regno] = offset;
3892 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
3893 cfun->machine->frame.wb_candidate1 = regno;
3894 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
3895 cfun->machine->frame.wb_candidate2 = regno;
3896 offset += UNITS_PER_WORD;
3899 HOST_WIDE_INT max_int_offset = offset;
3900 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3901 bool has_align_gap = offset != max_int_offset;
3903 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3904 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
3906 /* If there is an alignment gap between integer and fp callee-saves,
3907 allocate the last fp register to it if possible. */
3908 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
3910 cfun->machine->frame.reg_offset[regno] = max_int_offset;
3911 break;
3914 cfun->machine->frame.reg_offset[regno] = offset;
3915 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
3916 cfun->machine->frame.wb_candidate1 = regno;
3917 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
3918 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
3919 cfun->machine->frame.wb_candidate2 = regno;
3920 offset += UNITS_PER_WORD;
3923 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3925 cfun->machine->frame.saved_regs_size = offset;
3927 HOST_WIDE_INT varargs_and_saved_regs_size
3928 = offset + cfun->machine->frame.saved_varargs_size;
3930 cfun->machine->frame.hard_fp_offset
3931 = aligned_upper_bound (varargs_and_saved_regs_size
3932 + get_frame_size (),
3933 STACK_BOUNDARY / BITS_PER_UNIT);
3935 /* Both these values are already aligned. */
3936 gcc_assert (multiple_p (crtl->outgoing_args_size,
3937 STACK_BOUNDARY / BITS_PER_UNIT));
3938 cfun->machine->frame.frame_size
3939 = (cfun->machine->frame.hard_fp_offset
3940 + crtl->outgoing_args_size);
3942 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
3944 cfun->machine->frame.initial_adjust = 0;
3945 cfun->machine->frame.final_adjust = 0;
3946 cfun->machine->frame.callee_adjust = 0;
3947 cfun->machine->frame.callee_offset = 0;
3949 HOST_WIDE_INT max_push_offset = 0;
3950 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
3951 max_push_offset = 512;
3952 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3953 max_push_offset = 256;
3955 HOST_WIDE_INT const_size, const_fp_offset;
3956 if (cfun->machine->frame.frame_size.is_constant (&const_size)
3957 && const_size < max_push_offset
3958 && known_eq (crtl->outgoing_args_size, 0))
3960 /* Simple, small frame with no outgoing arguments:
3961 stp reg1, reg2, [sp, -frame_size]!
3962 stp reg3, reg4, [sp, 16] */
3963 cfun->machine->frame.callee_adjust = const_size;
3965 else if (known_lt (crtl->outgoing_args_size
3966 + cfun->machine->frame.saved_regs_size, 512)
3967 && !(cfun->calls_alloca
3968 && known_lt (cfun->machine->frame.hard_fp_offset,
3969 max_push_offset)))
3971 /* Frame with small outgoing arguments:
3972 sub sp, sp, frame_size
3973 stp reg1, reg2, [sp, outgoing_args_size]
3974 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3975 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3976 cfun->machine->frame.callee_offset
3977 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3979 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
3980 && const_fp_offset < max_push_offset)
3982 /* Frame with large outgoing arguments but a small local area:
3983 stp reg1, reg2, [sp, -hard_fp_offset]!
3984 stp reg3, reg4, [sp, 16]
3985 sub sp, sp, outgoing_args_size */
3986 cfun->machine->frame.callee_adjust = const_fp_offset;
3987 cfun->machine->frame.final_adjust
3988 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3990 else
3992 /* Frame with large local area and outgoing arguments using frame pointer:
3993 sub sp, sp, hard_fp_offset
3994 stp x29, x30, [sp, 0]
3995 add x29, sp, 0
3996 stp reg3, reg4, [sp, 16]
3997 sub sp, sp, outgoing_args_size */
3998 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3999 cfun->machine->frame.final_adjust
4000 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4003 cfun->machine->frame.laid_out = true;
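/* Worked example (illustrative): a function that emits a frame chain, also
   saves x19/x20, and has 16 bytes of locals with no outgoing arguments gets
   reg_offset x29 = 0, x30 = 8, x19 = 16, x20 = 24, saved_regs_size = 32 and
   hard_fp_offset = frame_size = 48.  Since 48 is below max_push_offset and
   there are no outgoing arguments, the first case above applies:
   callee_adjust = 48 while initial_adjust and final_adjust stay zero.  */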
4006 /* Return true if the register REGNO is saved on entry to
4007 the current function. */
4009 static bool
4010 aarch64_register_saved_on_entry (int regno)
4012 return cfun->machine->frame.reg_offset[regno] >= 0;
4015 /* Return the next register at or above REGNO, up to LIMIT, that the callee
4016 needs to save. */
4018 static unsigned
4019 aarch64_next_callee_save (unsigned regno, unsigned limit)
4021 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4022 regno ++;
4023 return regno;
4026 /* Push the register number REGNO of mode MODE to the stack with write-back
4027 adjusting the stack by ADJUSTMENT. */
4029 static void
4030 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4031 HOST_WIDE_INT adjustment)
4033 rtx base_rtx = stack_pointer_rtx;
4034 rtx insn, reg, mem;
4036 reg = gen_rtx_REG (mode, regno);
4037 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4038 plus_constant (Pmode, base_rtx, -adjustment));
4039 mem = gen_frame_mem (mode, mem);
4041 insn = emit_move_insn (mem, reg);
4042 RTX_FRAME_RELATED_P (insn) = 1;
4045 /* Generate and return an instruction to store the pair of registers
4046 REG and REG2 of mode MODE to location BASE with write-back adjusting
4047 the stack location BASE by ADJUSTMENT. */
4049 static rtx
4050 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4051 HOST_WIDE_INT adjustment)
4053 switch (mode)
4055 case E_DImode:
4056 return gen_storewb_pairdi_di (base, base, reg, reg2,
4057 GEN_INT (-adjustment),
4058 GEN_INT (UNITS_PER_WORD - adjustment));
4059 case E_DFmode:
4060 return gen_storewb_pairdf_di (base, base, reg, reg2,
4061 GEN_INT (-adjustment),
4062 GEN_INT (UNITS_PER_WORD - adjustment));
4063 default:
4064 gcc_unreachable ();
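/* For example (illustrative): with E_DImode and ADJUSTMENT == 32 the insn
   returned above assembles to a pre-indexed store pair such as

	stp	x19, x20, [sp, -32]!

   and the E_DFmode case does the same with the d registers.  */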
4068 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4069 stack pointer by ADJUSTMENT. */
4071 static void
4072 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4074 rtx_insn *insn;
4075 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4077 if (regno2 == INVALID_REGNUM)
4078 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4080 rtx reg1 = gen_rtx_REG (mode, regno1);
4081 rtx reg2 = gen_rtx_REG (mode, regno2);
4083 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4084 reg2, adjustment));
4085 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4086 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4087 RTX_FRAME_RELATED_P (insn) = 1;
4090 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
4091 adjusting it by ADJUSTMENT afterwards. */
4093 static rtx
4094 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4095 HOST_WIDE_INT adjustment)
4097 switch (mode)
4099 case E_DImode:
4100 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4101 GEN_INT (UNITS_PER_WORD));
4102 case E_DFmode:
4103 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4104 GEN_INT (UNITS_PER_WORD));
4105 default:
4106 gcc_unreachable ();
4110 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4111 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4112 into CFI_OPS. */
4114 static void
4115 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4116 rtx *cfi_ops)
4118 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4119 rtx reg1 = gen_rtx_REG (mode, regno1);
4121 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4123 if (regno2 == INVALID_REGNUM)
4125 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4126 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4127 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4129 else
4131 rtx reg2 = gen_rtx_REG (mode, regno2);
4132 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4133 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4134 reg2, adjustment));
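/* Illustrative output for aarch64_pop_regs: the pair case with
   ADJUSTMENT == 32 assembles to a post-indexed load pair such as

	ldp	x19, x20, [sp], 32

   while the single-register case becomes "ldr reg, [sp], 32".  */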
4138 /* Generate and return a store pair instruction of mode MODE to store
4139 register REG1 to MEM1 and register REG2 to MEM2. */
4141 static rtx
4142 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4143 rtx reg2)
4145 switch (mode)
4147 case E_DImode:
4148 return gen_store_pairdi (mem1, reg1, mem2, reg2);
4150 case E_DFmode:
4151 return gen_store_pairdf (mem1, reg1, mem2, reg2);
4153 default:
4154 gcc_unreachable ();
4158 /* Generate and return a load pair instruction of mode MODE to load register
4159 REG1 from MEM1 and register REG2 from MEM2. */
4161 static rtx
4162 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4163 rtx mem2)
4165 switch (mode)
4167 case E_DImode:
4168 return gen_load_pairdi (reg1, mem1, reg2, mem2);
4170 case E_DFmode:
4171 return gen_load_pairdf (reg1, mem1, reg2, mem2);
4173 default:
4174 gcc_unreachable ();
4178 /* Return TRUE if return address signing should be enabled for the current
4179 function, otherwise return FALSE. */
4181 bool
4182 aarch64_return_address_signing_enabled (void)
4184 /* This function should only be called after the frame has been laid out. */
4185 gcc_assert (cfun->machine->frame.laid_out);
4187 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4188 function if its LR is pushed onto the stack. */
4189 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4190 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4191 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4194 /* Emit code to save the callee-saved registers from register number START
4195 to LIMIT to the stack at the location starting at offset START_OFFSET,
4196 skipping any write-back candidates if SKIP_WB is true. */
4198 static void
4199 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4200 unsigned start, unsigned limit, bool skip_wb)
4202 rtx_insn *insn;
4203 unsigned regno;
4204 unsigned regno2;
4206 for (regno = aarch64_next_callee_save (start, limit);
4207 regno <= limit;
4208 regno = aarch64_next_callee_save (regno + 1, limit))
4210 rtx reg, mem;
4211 poly_int64 offset;
4213 if (skip_wb
4214 && (regno == cfun->machine->frame.wb_candidate1
4215 || regno == cfun->machine->frame.wb_candidate2))
4216 continue;
4218 if (cfun->machine->reg_is_wrapped_separately[regno])
4219 continue;
4221 reg = gen_rtx_REG (mode, regno);
4222 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4223 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4224 offset));
4226 regno2 = aarch64_next_callee_save (regno + 1, limit);
4228 if (regno2 <= limit
4229 && !cfun->machine->reg_is_wrapped_separately[regno2]
4230 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4231 == cfun->machine->frame.reg_offset[regno2]))
4234 rtx reg2 = gen_rtx_REG (mode, regno2);
4235 rtx mem2;
4237 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4238 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4239 offset));
4240 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4241 reg2));
4243 /* The first part of a frame-related parallel insn is
4244 always assumed to be relevant to the frame
4245 calculations; subsequent parts are only
4246 frame-related if explicitly marked. */
4247 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4248 regno = regno2;
4250 else
4251 insn = emit_move_insn (mem, reg);
4253 RTX_FRAME_RELATED_P (insn) = 1;
4257 /* Emit code to restore the callee registers of mode MODE from register
4258 number START up to and including LIMIT. Restore from the stack offset
4259 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4260 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4262 static void
4263 aarch64_restore_callee_saves (machine_mode mode,
4264 poly_int64 start_offset, unsigned start,
4265 unsigned limit, bool skip_wb, rtx *cfi_ops)
4267 rtx base_rtx = stack_pointer_rtx;
4268 unsigned regno;
4269 unsigned regno2;
4270 poly_int64 offset;
4272 for (regno = aarch64_next_callee_save (start, limit);
4273 regno <= limit;
4274 regno = aarch64_next_callee_save (regno + 1, limit))
4276 if (cfun->machine->reg_is_wrapped_separately[regno])
4277 continue;
4279 rtx reg, mem;
4281 if (skip_wb
4282 && (regno == cfun->machine->frame.wb_candidate1
4283 || regno == cfun->machine->frame.wb_candidate2))
4284 continue;
4286 reg = gen_rtx_REG (mode, regno);
4287 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4288 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4290 regno2 = aarch64_next_callee_save (regno + 1, limit);
4292 if (regno2 <= limit
4293 && !cfun->machine->reg_is_wrapped_separately[regno2]
4294 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4295 == cfun->machine->frame.reg_offset[regno2]))
4297 rtx reg2 = gen_rtx_REG (mode, regno2);
4298 rtx mem2;
4300 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4301 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4302 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4304 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4305 regno = regno2;
4307 else
4308 emit_move_insn (reg, mem);
4309 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4313 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4314 of MODE. */
4316 static inline bool
4317 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4319 HOST_WIDE_INT multiple;
4320 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4321 && IN_RANGE (multiple, -8, 7));
4324 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4325 of MODE. */
4327 static inline bool
4328 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4330 HOST_WIDE_INT multiple;
4331 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4332 && IN_RANGE (multiple, 0, 63));
4335 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4336 of MODE. */
4338 bool
4339 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4341 HOST_WIDE_INT multiple;
4342 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4343 && IN_RANGE (multiple, -64, 63));
4346 /* Return true if OFFSET is a signed 9-bit value. */
4348 static inline bool
4349 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4350 poly_int64 offset)
4352 HOST_WIDE_INT const_offset;
4353 return (offset.is_constant (&const_offset)
4354 && IN_RANGE (const_offset, -256, 255));
4357 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4358 of MODE. */
4360 static inline bool
4361 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4363 HOST_WIDE_INT multiple;
4364 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4365 && IN_RANGE (multiple, -256, 255));
4368 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4369 of MODE. */
4371 static inline bool
4372 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4374 HOST_WIDE_INT multiple;
4375 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4376 && IN_RANGE (multiple, 0, 4095));
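/* Worked example of the offset predicates above (illustrative), taking
   DImode with its 8-byte unit size:
   - offset_4bit_signed_scaled_p accepts multiples of 8 in [-64, 56]
   - offset_6bit_unsigned_scaled_p accepts multiples of 8 in [0, 504]
   - aarch64_offset_7bit_signed_scaled_p accepts multiples of 8 in [-512, 504]
   - offset_9bit_signed_unscaled_p accepts any byte offset in [-256, 255]
   - offset_12bit_unsigned_scaled_p accepts multiples of 8 in [0, 32760].  */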
4379 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4381 static sbitmap
4382 aarch64_get_separate_components (void)
4384 aarch64_layout_frame ();
4386 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4387 bitmap_clear (components);
4389 /* The registers we need saved to the frame. */
4390 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4391 if (aarch64_register_saved_on_entry (regno))
4393 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4394 if (!frame_pointer_needed)
4395 offset += cfun->machine->frame.frame_size
4396 - cfun->machine->frame.hard_fp_offset;
4397 /* Check that we can access the stack slot of the register with one
4398 direct load with no adjustments needed. */
4399 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4400 bitmap_set_bit (components, regno);
4403 /* Don't mess with the hard frame pointer. */
4404 if (frame_pointer_needed)
4405 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4407 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4408 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4409 /* If aarch64_layout_frame has chosen registers to store/restore with
4410 writeback don't interfere with them to avoid having to output explicit
4411 stack adjustment instructions. */
4412 if (reg2 != INVALID_REGNUM)
4413 bitmap_clear_bit (components, reg2);
4414 if (reg1 != INVALID_REGNUM)
4415 bitmap_clear_bit (components, reg1);
4417 bitmap_clear_bit (components, LR_REGNUM);
4418 bitmap_clear_bit (components, SP_REGNUM);
4420 return components;
4423 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4425 static sbitmap
4426 aarch64_components_for_bb (basic_block bb)
4428 bitmap in = DF_LIVE_IN (bb);
4429 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4430 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4432 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4433 bitmap_clear (components);
4435 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4436 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4437 if ((!call_used_regs[regno])
4438 && (bitmap_bit_p (in, regno)
4439 || bitmap_bit_p (gen, regno)
4440 || bitmap_bit_p (kill, regno)))
4441 bitmap_set_bit (components, regno);
4443 return components;
4446 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4447 Nothing to do for aarch64. */
4449 static void
4450 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4454 /* Return the next set bit in BMP from START onwards. Return the total number
4455 of bits in BMP if no set bit is found at or after START. */
4457 static unsigned int
4458 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4460 unsigned int nbits = SBITMAP_SIZE (bmp);
4461 if (start == nbits)
4462 return start;
4464 gcc_assert (start < nbits);
4465 for (unsigned int i = start; i < nbits; i++)
4466 if (bitmap_bit_p (bmp, i))
4467 return i;
4469 return nbits;
4472 /* Do the work for aarch64_emit_prologue_components and
4473 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4474 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4475 for these components or the epilogue sequence. That is, it determines
4476 whether we should emit stores or loads and what kind of CFA notes to attach
4477 to the insns. Otherwise the logic for the two sequences is very
4478 similar. */
4480 static void
4481 aarch64_process_components (sbitmap components, bool prologue_p)
4483 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4484 ? HARD_FRAME_POINTER_REGNUM
4485 : STACK_POINTER_REGNUM);
4487 unsigned last_regno = SBITMAP_SIZE (components);
4488 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4489 rtx_insn *insn = NULL;
4491 while (regno != last_regno)
4493 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4494 so DFmode for the vector registers is enough. */
4495 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4496 rtx reg = gen_rtx_REG (mode, regno);
4497 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4498 if (!frame_pointer_needed)
4499 offset += cfun->machine->frame.frame_size
4500 - cfun->machine->frame.hard_fp_offset;
4501 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4502 rtx mem = gen_frame_mem (mode, addr);
4504 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4505 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4506 /* No more registers to handle after REGNO.
4507 Emit a single save/restore and exit. */
4508 if (regno2 == last_regno)
4510 insn = emit_insn (set);
4511 RTX_FRAME_RELATED_P (insn) = 1;
4512 if (prologue_p)
4513 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4514 else
4515 add_reg_note (insn, REG_CFA_RESTORE, reg);
4516 break;
4519 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4520 /* The next register is not of the same class or its offset is not
4521 mergeable with the current one into a pair. */
4522 if (!satisfies_constraint_Ump (mem)
4523 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4524 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4525 GET_MODE_SIZE (mode)))
4527 insn = emit_insn (set);
4528 RTX_FRAME_RELATED_P (insn) = 1;
4529 if (prologue_p)
4530 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4531 else
4532 add_reg_note (insn, REG_CFA_RESTORE, reg);
4534 regno = regno2;
4535 continue;
4538 /* REGNO2 can be saved/restored in a pair with REGNO. */
4539 rtx reg2 = gen_rtx_REG (mode, regno2);
4540 if (!frame_pointer_needed)
4541 offset2 += cfun->machine->frame.frame_size
4542 - cfun->machine->frame.hard_fp_offset;
4543 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4544 rtx mem2 = gen_frame_mem (mode, addr2);
4545 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4546 : gen_rtx_SET (reg2, mem2);
4548 if (prologue_p)
4549 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4550 else
4551 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4553 RTX_FRAME_RELATED_P (insn) = 1;
4554 if (prologue_p)
4556 add_reg_note (insn, REG_CFA_OFFSET, set);
4557 add_reg_note (insn, REG_CFA_OFFSET, set2);
4559 else
4561 add_reg_note (insn, REG_CFA_RESTORE, reg);
4562 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4565 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4569 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4571 static void
4572 aarch64_emit_prologue_components (sbitmap components)
4574 aarch64_process_components (components, true);
4577 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4579 static void
4580 aarch64_emit_epilogue_components (sbitmap components)
4582 aarch64_process_components (components, false);
4585 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4587 static void
4588 aarch64_set_handled_components (sbitmap components)
4590 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4591 if (bitmap_bit_p (components, regno))
4592 cfun->machine->reg_is_wrapped_separately[regno] = true;
4595 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4596 is saved at BASE + OFFSET. */
4598 static void
4599 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4600 rtx base, poly_int64 offset)
4602 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4603 add_reg_note (insn, REG_CFA_EXPRESSION,
4604 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4607 /* AArch64 stack frames generated by this compiler look like:
4609 +-------------------------------+
4611 | incoming stack arguments |
4613 +-------------------------------+
4614 | | <-- incoming stack pointer (aligned)
4615 | callee-allocated save area |
4616 | for register varargs |
4618 +-------------------------------+
4619 | local variables | <-- frame_pointer_rtx
4621 +-------------------------------+
4622 | padding0 | \
4623 +-------------------------------+ |
4624 | callee-saved registers | | frame.saved_regs_size
4625 +-------------------------------+ |
4626 | LR' | |
4627 +-------------------------------+ |
4628 | FP' | / <- hard_frame_pointer_rtx (aligned)
4629 +-------------------------------+
4630 | dynamic allocation |
4631 +-------------------------------+
4632 | padding |
4633 +-------------------------------+
4634 | outgoing stack arguments | <-- arg_pointer
4636 +-------------------------------+
4637 | | <-- stack_pointer_rtx (aligned)
4639 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4640 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4641 unchanged. */
4643 /* Generate the prologue instructions for entry into a function.
4644 Establish the stack frame by decreasing the stack pointer with a
4645 properly calculated size and, if necessary, create a frame record
4646 filled with the values of LR and previous frame pointer. The
4647 current FP is also set up if it is in use. */
4649 void
4650 aarch64_expand_prologue (void)
4652 aarch64_layout_frame ();
4654 poly_int64 frame_size = cfun->machine->frame.frame_size;
4655 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4656 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4657 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4658 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4659 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4660 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4661 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4662 rtx_insn *insn;
4664 /* Sign return address for functions. */
4665 if (aarch64_return_address_signing_enabled ())
4667 insn = emit_insn (gen_pacisp ());
4668 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4669 RTX_FRAME_RELATED_P (insn) = 1;
4672 if (flag_stack_usage_info)
4673 current_function_static_stack_size = constant_lower_bound (frame_size);
4675 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4677 if (crtl->is_leaf && !cfun->calls_alloca)
4679 if (maybe_gt (frame_size, PROBE_INTERVAL)
4680 && maybe_gt (frame_size, get_stack_check_protect ()))
4681 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4682 (frame_size
4683 - get_stack_check_protect ()));
4685 else if (maybe_gt (frame_size, 0))
4686 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4689 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4690 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4692 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4694 if (callee_adjust != 0)
4695 aarch64_push_regs (reg1, reg2, callee_adjust);
4697 if (emit_frame_chain)
4699 poly_int64 reg_offset = callee_adjust;
4700 if (callee_adjust == 0)
4702 reg1 = R29_REGNUM;
4703 reg2 = R30_REGNUM;
4704 reg_offset = callee_offset;
4705 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4707 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4708 stack_pointer_rtx, callee_offset,
4709 ip1_rtx, ip0_rtx, frame_pointer_needed);
4710 if (frame_pointer_needed && !frame_size.is_constant ())
4712 /* Variable-sized frames need to describe the save slot
4713 address using DW_CFA_expression rather than DW_CFA_offset.
4714 This means that, without taking further action, the
4715 locations of the registers that we've already saved would
4716 remain based on the stack pointer even after we redefine
4717 the CFA based on the frame pointer. We therefore need new
4718 DW_CFA_expressions to re-express the save slots with addresses
4719 based on the frame pointer. */
4720 rtx_insn *insn = get_last_insn ();
4721 gcc_assert (RTX_FRAME_RELATED_P (insn));
4723 /* Add an explicit CFA definition if this was previously
4724 implicit. */
4725 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4727 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4728 callee_offset);
4729 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4730 gen_rtx_SET (hard_frame_pointer_rtx, src));
4733 /* Change the save slot expressions for the registers that
4734 we've already saved. */
4735 reg_offset -= callee_offset;
4736 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4737 reg_offset + UNITS_PER_WORD);
4738 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4739 reg_offset);
4741 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4744 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4745 callee_adjust != 0 || emit_frame_chain);
4746 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4747 callee_adjust != 0 || emit_frame_chain);
4748 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
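/* Putting the pieces together (illustrative): for the small constant frame
   used in the layout example above (callee_adjust == 48 with a frame chain
   and x19/x20 also saved), the emitted prologue is roughly

	stp	x29, x30, [sp, -48]!
	add	x29, sp, 0
	stp	x19, x20, [sp, 16]

   with no separate sub-sp instructions, since initial_adjust and
   final_adjust are both zero.  */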
4751 /* Return TRUE if we can use a simple_return insn.
4753 This function checks whether the callee-saved register area is empty, which
4754 means no restore actions are needed.  The pro_and_epilogue pass uses
4755 this to check whether the shrink-wrapping optimization is feasible. */
4757 bool
4758 aarch64_use_return_insn_p (void)
4760 if (!reload_completed)
4761 return false;
4763 if (crtl->profile)
4764 return false;
4766 aarch64_layout_frame ();
4768 return known_eq (cfun->machine->frame.frame_size, 0);
4771 /* Generate the epilogue instructions for returning from a function.
4772 This is almost exactly the reverse of the prologue sequence, except
4773 that we need to insert barriers to avoid scheduling loads that read
4774 from a deallocated stack, and we optimize the unwind records by
4775 emitting them all together if possible. */
4776 void
4777 aarch64_expand_epilogue (bool for_sibcall)
4779 aarch64_layout_frame ();
4781 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4782 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4783 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4784 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4785 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4786 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4787 rtx cfi_ops = NULL;
4788 rtx_insn *insn;
4789 /* A stack clash protection prologue may not have left IP0_REGNUM or
4790 IP1_REGNUM in a usable state. The same is true for allocations
4791 with an SVE component, since we then need both temporary registers
4792 for each allocation. */
4793 bool can_inherit_p = (initial_adjust.is_constant ()
4794 && final_adjust.is_constant ()
4795 && !flag_stack_clash_protection);
4797 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4798 bool need_barrier_p
4799 = maybe_ne (get_frame_size ()
4800 + cfun->machine->frame.saved_varargs_size, 0);
4802 /* Emit a barrier to prevent loads from a deallocated stack. */
4803 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4804 || cfun->calls_alloca
4805 || crtl->calls_eh_return)
4807 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4808 need_barrier_p = false;
4811 /* Restore the stack pointer from the frame pointer if it may not
4812 be the same as the stack pointer. */
4813 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4814 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4815 if (frame_pointer_needed
4816 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4817 /* If writeback is used when restoring callee-saves, the CFA
4818 is restored on the instruction doing the writeback. */
4819 aarch64_add_offset (Pmode, stack_pointer_rtx,
4820 hard_frame_pointer_rtx, -callee_offset,
4821 ip1_rtx, ip0_rtx, callee_adjust == 0);
4822 else
4823 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4824 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4826 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4827 callee_adjust != 0, &cfi_ops);
4828 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4829 callee_adjust != 0, &cfi_ops);
4831 if (need_barrier_p)
4832 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4834 if (callee_adjust != 0)
4835 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4837 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4839 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4840 insn = get_last_insn ();
4841 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4842 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4843 RTX_FRAME_RELATED_P (insn) = 1;
4844 cfi_ops = NULL;
4847 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4848 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4850 if (cfi_ops)
4852 /* Emit delayed restores and reset the CFA to be SP. */
4853 insn = get_last_insn ();
4854 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4855 REG_NOTES (insn) = cfi_ops;
4856 RTX_FRAME_RELATED_P (insn) = 1;
4859 /* We prefer to emit the combined return/authenticate instruction RETAA,
4860 however there are three cases in which we must instead emit an explicit
4861 authentication instruction.
4863 1) Sibcalls don't return in a normal way, so if we're about to call one
4864 we must authenticate.
4866 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
4867 generating code for !TARGET_ARMV8_3 we can't use it and must
4868 explicitly authenticate.
4870 3) On an eh_return path we make extra stack adjustments to update the
4871 canonical frame address to be the exception handler's CFA. We want
4872 to authenticate using the CFA of the function which calls eh_return.
4874 if (aarch64_return_address_signing_enabled ()
4875 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
4877 insn = emit_insn (gen_autisp ());
4878 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4879 RTX_FRAME_RELATED_P (insn) = 1;
4882 /* Stack adjustment for exception handler. */
4883 if (crtl->calls_eh_return)
4885 /* We need to unwind the stack by the offset computed by
4886 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
4887 to be SP; letting the CFA move during this adjustment
4888 is just as correct as retaining the CFA from the body
4889 of the function. Therefore, do nothing special. */
4890 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
4893 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
4894 if (!for_sibcall)
4895 emit_jump_insn (ret_rtx);
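/* Illustrative epilogue for the same small frame: the callee-saves are
   reloaded and the write-back pop restores SP in one step, roughly

	ldp	x19, x20, [sp, 16]
	ldp	x29, x30, [sp], 48
	ret

   with RETAA (or an explicit AUTIASP before the RET) used instead when
   return address signing is enabled.  */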
4898 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
4899 normally or return to a previous frame after unwinding.
4901 An EH return uses a single shared return sequence. The epilogue is
4902 exactly like a normal epilogue except that it has an extra input
4903 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
4904 that must be applied after the frame has been destroyed. An extra label
4905 is inserted before the epilogue which initializes this register to zero,
4906 and this is the entry point for a normal return.
4908 An actual EH return updates the return address, initializes the stack
4909 adjustment and jumps directly into the epilogue (bypassing the zeroing
4910 of the adjustment). Since the return address is typically saved on the
4911 stack when a function makes a call, the saved LR must be updated outside
4912 the epilogue.
4914 This poses problems as the store is generated well before the epilogue,
4915 so the offset of LR is not known yet. Also optimizations will remove the
4916 store as it appears dead, even after the epilogue is generated (as the
4917 base or offset for loading LR is different in many cases).
4919 To avoid these problems this implementation forces the frame pointer
4920 in eh_return functions so that the location of LR is fixed and known early.
4921 It also marks the store volatile, so no optimization is permitted to
4922 remove the store. */
4924 aarch64_eh_return_handler_rtx (void)
4926 rtx tmp = gen_frame_mem (Pmode,
4927 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
4929 /* Mark the store volatile, so no optimization is permitted to remove it. */
4930 MEM_VOLATILE_P (tmp) = true;
4931 return tmp;
4934 /* Output code to add DELTA to the first argument, and then jump
4935 to FUNCTION. Used for C++ multiple inheritance. */
4936 static void
4937 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
4938 HOST_WIDE_INT delta,
4939 HOST_WIDE_INT vcall_offset,
4940 tree function)
4942 /* The this pointer is always in x0. Note that this differs from
4943 Arm, where the this pointer may be bumped to r1 if r0 is required
4944 to return a pointer to an aggregate. On AArch64 a result value
4945 pointer will be in x8. */
4946 int this_regno = R0_REGNUM;
4947 rtx this_rtx, temp0, temp1, addr, funexp;
4948 rtx_insn *insn;
4950 reload_completed = 1;
4951 emit_note (NOTE_INSN_PROLOGUE_END);
4953 this_rtx = gen_rtx_REG (Pmode, this_regno);
4954 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
4955 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
4957 if (vcall_offset == 0)
4958 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
4959 else
4961 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
4963 addr = this_rtx;
4964 if (delta != 0)
4966 if (delta >= -256 && delta < 256)
4967 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
4968 plus_constant (Pmode, this_rtx, delta));
4969 else
4970 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
4971 temp1, temp0, false);
4974 if (Pmode == ptr_mode)
4975 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
4976 else
4977 aarch64_emit_move (temp0,
4978 gen_rtx_ZERO_EXTEND (Pmode,
4979 gen_rtx_MEM (ptr_mode, addr)));
4981 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
4982 addr = plus_constant (Pmode, temp0, vcall_offset);
4983 else
4985 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
4986 Pmode);
4987 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
4990 if (Pmode == ptr_mode)
4991 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
4992 else
4993 aarch64_emit_move (temp1,
4994 gen_rtx_SIGN_EXTEND (Pmode,
4995 gen_rtx_MEM (ptr_mode, addr)));
4997 emit_insn (gen_add2_insn (this_rtx, temp1));
5000 /* Generate a tail call to the target function. */
5001 if (!TREE_USED (function))
5003 assemble_external (function);
5004 TREE_USED (function) = 1;
5006 funexp = XEXP (DECL_RTL (function), 0);
5007 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5008 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5009 SIBLING_CALL_P (insn) = 1;
5011 insn = get_insns ();
5012 shorten_branches (insn);
5013 final_start_function (insn, file, 1);
5014 final (insn, file, 1);
5015 final_end_function ();
5017 /* Stop pretending to be a post-reload pass. */
5018 reload_completed = 0;
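/* Example thunk output (illustrative): for DELTA == 8 and VCALL_OFFSET == 0
   the whole thunk reduces to

	add	x0, x0, 8
	b	<function>

   while a non-zero VCALL_OFFSET additionally loads the adjustment from the
   vtable through the IP0/IP1 scratch registers before the tail call.  */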
5021 static bool
5022 aarch64_tls_referenced_p (rtx x)
5024 if (!TARGET_HAVE_TLS)
5025 return false;
5026 subrtx_iterator::array_type array;
5027 FOR_EACH_SUBRTX (iter, array, x, ALL)
5029 const_rtx x = *iter;
5030 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5031 return true;
5032 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5033 TLS offsets, not real symbol references. */
5034 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5035 iter.skip_subrtxes ();
5037 return false;
5041 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5042 a left shift of 0 or 12 bits. */
5043 bool
5044 aarch64_uimm12_shift (HOST_WIDE_INT val)
5046 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5047 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
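/* Examples (illustrative): 0xabc and 0xabc000 are accepted (shift 0 and
   shift 12 respectively), whereas 0xabc001 is rejected because its set bits
   straddle both 12-bit halves.  */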
5052 /* Return true if val is an immediate that can be loaded into a
5053 register by a MOVZ instruction. */
5054 static bool
5055 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5057 if (GET_MODE_SIZE (mode) > 4)
5059 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5060 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5061 return 1;
5063 else
5065 /* Ignore sign extension. */
5066 val &= (HOST_WIDE_INT) 0xffffffff;
5068 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5069 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5072 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5073 64-bit (DImode) integer. */
5075 static unsigned HOST_WIDE_INT
5076 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5078 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5079 while (size < 64)
5081 val &= (HOST_WIDE_INT_1U << size) - 1;
5082 val |= val << size;
5083 size *= 2;
5085 return val;
5088 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5090 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5092 0x0000000100000001ull,
5093 0x0001000100010001ull,
5094 0x0101010101010101ull,
5095 0x1111111111111111ull,
5096 0x5555555555555555ull,
5100 /* Return true if val is a valid bitmask immediate. */
5102 bool
5103 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5105 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5106 int bits;
5108 /* Check for a single sequence of one bits and return quickly if so.
5109 The special cases of all ones and all zeroes return false. */
5110 val = aarch64_replicate_bitmask_imm (val_in, mode);
5111 tmp = val + (val & -val);
5113 if (tmp == (tmp & -tmp))
5114 return (val + 1) > 1;
5116 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5117 if (mode == SImode)
5118 val = (val << 32) | (val & 0xffffffff);
5120 /* Invert if the immediate doesn't start with a zero bit - this means we
5121 only need to search for sequences of one bits. */
5122 if (val & 1)
5123 val = ~val;
5125 /* Find the first set bit and set tmp to val with the first sequence of one
5126 bits removed. Return success if there is a single sequence of ones. */
5127 first_one = val & -val;
5128 tmp = val & (val + first_one);
5130 if (tmp == 0)
5131 return true;
5133 /* Find the next set bit and compute the difference in bit position. */
5134 next_one = tmp & -tmp;
5135 bits = clz_hwi (first_one) - clz_hwi (next_one);
5136 mask = val ^ tmp;
5138 /* Check the bit position difference is a power of 2, and that the first
5139 sequence of one bits fits within 'bits' bits. */
5140 if ((mask >> bits) != 0 || bits != (bits & -bits))
5141 return false;
5143 /* Check the sequence of one bits is repeated 64/bits times. */
5144 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
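/* Worked example (illustrative): 0x00ff00ff00ff00ff is accepted because it
   is the 16-bit element 0x00ff (a single run of eight ones) replicated
   across the register, whereas 0xabc123 is rejected because its ones do not
   form a single run repeated at a power-of-two period.  */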
5147 /* Create a mask of ones covering the range from the lowest to the highest
5148 bit set in VAL_IN.  Assumed precondition: VAL_IN is not zero. */
5150 unsigned HOST_WIDE_INT
5151 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5153 int lowest_bit_set = ctz_hwi (val_in);
5154 int highest_bit_set = floor_log2 (val_in);
5155 gcc_assert (val_in != 0);
5157 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5158 (HOST_WIDE_INT_1U << lowest_bit_set));
5161 /* Create a constant in which all bits outside the range from the lowest set
5162 bit to the highest set bit of VAL_IN are set to 1. */
5164 unsigned HOST_WIDE_INT
5165 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5167 return val_in | ~aarch64_and_split_imm1 (val_in);
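/* Worked example (illustrative): for VAL_IN == 0x000000ffff00ffff,
   aarch64_and_split_imm1 gives 0x000000ffffffffff (ones from bit 0 up to
   the highest set bit, 39) and aarch64_and_split_imm2 gives
   0xffffffffff00ffff.  Both are valid bitmask immediates, and ANDing with
   them in sequence is equivalent to ANDing with the original value, which
   is not itself encodable in a single instruction.  */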
5170 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5172 bool
5173 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5175 scalar_int_mode int_mode;
5176 if (!is_a <scalar_int_mode> (mode, &int_mode))
5177 return false;
5179 if (aarch64_bitmask_imm (val_in, int_mode))
5180 return false;
5182 if (aarch64_move_imm (val_in, int_mode))
5183 return false;
5185 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5187 return aarch64_bitmask_imm (imm2, int_mode);
5190 /* Return true if val is an immediate that can be loaded into a
5191 register in a single instruction. */
5192 bool
5193 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5195 scalar_int_mode int_mode;
5196 if (!is_a <scalar_int_mode> (mode, &int_mode))
5197 return false;
5199 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5200 return 1;
5201 return aarch64_bitmask_imm (val, int_mode);
5204 static bool
5205 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5207 rtx base, offset;
5209 if (GET_CODE (x) == HIGH)
5210 return true;
5212 /* There's no way to calculate VL-based values using relocations. */
5213 subrtx_iterator::array_type array;
5214 FOR_EACH_SUBRTX (iter, array, x, ALL)
5215 if (GET_CODE (*iter) == CONST_POLY_INT)
5216 return true;
5218 split_const (x, &base, &offset);
5219 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5221 if (aarch64_classify_symbol (base, INTVAL (offset))
5222 != SYMBOL_FORCE_TO_MEM)
5223 return true;
5224 else
5225 /* Avoid generating a 64-bit relocation in ILP32; leave
5226 to aarch64_expand_mov_immediate to handle it properly. */
5227 return mode != ptr_mode;
5230 return aarch64_tls_referenced_p (x);
5233 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5234 The expansion for a table switch is quite expensive due to the number
5235 of instructions, the table lookup and the hard-to-predict indirect jump.
5236 When optimizing for speed with -O3, use the per-core tuning if
5237 set, otherwise use tables for > 16 cases as a tradeoff between size and
5238 performance. When optimizing for size, use the default setting. */
5240 static unsigned int
5241 aarch64_case_values_threshold (void)
5243 /* Use the specified limit for the number of cases before using jump
5244 tables at higher optimization levels. */
5245 if (optimize > 2
5246 && selected_cpu->tune->max_case_values != 0)
5247 return selected_cpu->tune->max_case_values;
5248 else
5249 return optimize_size ? default_case_values_threshold () : 17;
5252 /* Return true if register REGNO is a valid index register.
5253 STRICT_P is true if REG_OK_STRICT is in effect. */
5255 bool
5256 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5258 if (!HARD_REGISTER_NUM_P (regno))
5260 if (!strict_p)
5261 return true;
5263 if (!reg_renumber)
5264 return false;
5266 regno = reg_renumber[regno];
5268 return GP_REGNUM_P (regno);
5271 /* Return true if register REGNO is a valid base register.
5272 STRICT_P is true if REG_OK_STRICT is in effect. */
5274 bool
5275 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5277 if (!HARD_REGISTER_NUM_P (regno))
5279 if (!strict_p)
5280 return true;
5282 if (!reg_renumber)
5283 return false;
5285 regno = reg_renumber[regno];
5288 /* The fake registers will be eliminated to either the stack or
5289 hard frame pointer, both of which are usually valid base registers.
5290 Reload deals with the cases where the eliminated form isn't valid. */
5291 return (GP_REGNUM_P (regno)
5292 || regno == SP_REGNUM
5293 || regno == FRAME_POINTER_REGNUM
5294 || regno == ARG_POINTER_REGNUM);
5297 /* Return true if X is a valid base register.
5298 STRICT_P is true if REG_OK_STRICT is in effect. */
5300 static bool
5301 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5303 if (!strict_p
5304 && GET_CODE (x) == SUBREG
5305 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5306 x = SUBREG_REG (x);
5308 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5311 /* Return true if address offset is a valid index. If it is, fill in INFO
5312 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5314 static bool
5315 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5316 machine_mode mode, bool strict_p)
5318 enum aarch64_address_type type;
5319 rtx index;
5320 int shift;
5322 /* (reg:P) */
5323 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5324 && GET_MODE (x) == Pmode)
5326 type = ADDRESS_REG_REG;
5327 index = x;
5328 shift = 0;
5330 /* (sign_extend:DI (reg:SI)) */
5331 else if ((GET_CODE (x) == SIGN_EXTEND
5332 || GET_CODE (x) == ZERO_EXTEND)
5333 && GET_MODE (x) == DImode
5334 && GET_MODE (XEXP (x, 0)) == SImode)
5336 type = (GET_CODE (x) == SIGN_EXTEND)
5337 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5338 index = XEXP (x, 0);
5339 shift = 0;
5341 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5342 else if (GET_CODE (x) == MULT
5343 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5344 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5345 && GET_MODE (XEXP (x, 0)) == DImode
5346 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5347 && CONST_INT_P (XEXP (x, 1)))
5349 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5350 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5351 index = XEXP (XEXP (x, 0), 0);
5352 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5354 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5355 else if (GET_CODE (x) == ASHIFT
5356 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5357 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5358 && GET_MODE (XEXP (x, 0)) == DImode
5359 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5360 && CONST_INT_P (XEXP (x, 1)))
5362 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5363 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5364 index = XEXP (XEXP (x, 0), 0);
5365 shift = INTVAL (XEXP (x, 1));
5367 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5368 else if ((GET_CODE (x) == SIGN_EXTRACT
5369 || GET_CODE (x) == ZERO_EXTRACT)
5370 && GET_MODE (x) == DImode
5371 && GET_CODE (XEXP (x, 0)) == MULT
5372 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5373 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5375 type = (GET_CODE (x) == SIGN_EXTRACT)
5376 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5377 index = XEXP (XEXP (x, 0), 0);
5378 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5379 if (INTVAL (XEXP (x, 1)) != 32 + shift
5380 || INTVAL (XEXP (x, 2)) != 0)
5381 shift = -1;
5383 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5384 (const_int 0xffffffff<<shift)) */
5385 else if (GET_CODE (x) == AND
5386 && GET_MODE (x) == DImode
5387 && GET_CODE (XEXP (x, 0)) == MULT
5388 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5389 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5390 && CONST_INT_P (XEXP (x, 1)))
5392 type = ADDRESS_REG_UXTW;
5393 index = XEXP (XEXP (x, 0), 0);
5394 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5395 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5396 shift = -1;
5398 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5399 else if ((GET_CODE (x) == SIGN_EXTRACT
5400 || GET_CODE (x) == ZERO_EXTRACT)
5401 && GET_MODE (x) == DImode
5402 && GET_CODE (XEXP (x, 0)) == ASHIFT
5403 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5404 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5406 type = (GET_CODE (x) == SIGN_EXTRACT)
5407 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5408 index = XEXP (XEXP (x, 0), 0);
5409 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5410 if (INTVAL (XEXP (x, 1)) != 32 + shift
5411 || INTVAL (XEXP (x, 2)) != 0)
5412 shift = -1;
5414 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5415 (const_int 0xffffffff<<shift)) */
5416 else if (GET_CODE (x) == AND
5417 && GET_MODE (x) == DImode
5418 && GET_CODE (XEXP (x, 0)) == ASHIFT
5419 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5420 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5421 && CONST_INT_P (XEXP (x, 1)))
5423 type = ADDRESS_REG_UXTW;
5424 index = XEXP (XEXP (x, 0), 0);
5425 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5426 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5427 shift = -1;
5429 /* (mult:P (reg:P) (const_int scale)) */
5430 else if (GET_CODE (x) == MULT
5431 && GET_MODE (x) == Pmode
5432 && GET_MODE (XEXP (x, 0)) == Pmode
5433 && CONST_INT_P (XEXP (x, 1)))
5435 type = ADDRESS_REG_REG;
5436 index = XEXP (x, 0);
5437 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5439 /* (ashift:P (reg:P) (const_int shift)) */
5440 else if (GET_CODE (x) == ASHIFT
5441 && GET_MODE (x) == Pmode
5442 && GET_MODE (XEXP (x, 0)) == Pmode
5443 && CONST_INT_P (XEXP (x, 1)))
5445 type = ADDRESS_REG_REG;
5446 index = XEXP (x, 0);
5447 shift = INTVAL (XEXP (x, 1));
5449 else
5450 return false;
5452 if (!strict_p
5453 && GET_CODE (index) == SUBREG
5454 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5455 index = SUBREG_REG (index);
5457 if (aarch64_sve_data_mode_p (mode))
5459 if (type != ADDRESS_REG_REG
5460 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5461 return false;
5463 else
5465 if (shift != 0
5466 && !(IN_RANGE (shift, 1, 3)
5467 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5468 return false;
5471 if (REG_P (index)
5472 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5474 info->type = type;
5475 info->offset = index;
5476 info->shift = shift;
5477 return true;
5480 return false;
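/* Examples of index forms accepted above (illustrative), for an 8-byte MODE:

	[x0, x1]		base + register
	[x0, x1, lsl 3]		base + scaled register
	[x0, w1, sxtw 3]	base + scaled sign-extended 32-bit register
	[x0, w1, uxtw 3]	base + scaled zero-extended 32-bit register

   The shift must be 0 or log2 of the mode size (3 here); for SVE data modes
   it must match the vector element size instead.  */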
5483 /* Return true if MODE is one of the modes for which we
5484 support LDP/STP operations. */
5486 static bool
5487 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5489 return mode == SImode || mode == DImode
5490 || mode == SFmode || mode == DFmode
5491 || (aarch64_vector_mode_supported_p (mode)
5492 && known_eq (GET_MODE_SIZE (mode), 8));
5495 /* Return true if REGNO is a virtual pointer register, or an eliminable
5496 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5497 include stack_pointer or hard_frame_pointer. */
5498 static bool
5499 virt_or_elim_regno_p (unsigned regno)
5501 return ((regno >= FIRST_VIRTUAL_REGISTER
5502 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5503 || regno == FRAME_POINTER_REGNUM
5504 || regno == ARG_POINTER_REGNUM);
5507 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5508 If it is, fill in INFO appropriately. STRICT_P is true if
5509 REG_OK_STRICT is in effect. */
5511 static bool
5512 aarch64_classify_address (struct aarch64_address_info *info,
5513 rtx x, machine_mode mode, bool strict_p,
5514 aarch64_addr_query_type type = ADDR_QUERY_M)
5516 enum rtx_code code = GET_CODE (x);
5517 rtx op0, op1;
5518 poly_int64 offset;
5520 HOST_WIDE_INT const_size;
5522 /* On BE, we use load/store pair for all large int mode load/stores.
5523 TI/TFmode may also use a load/store pair. */
5524 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5525 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5526 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5527 || mode == TImode
5528 || mode == TFmode
5529 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5531 bool allow_reg_index_p = (!load_store_pair_p
5532 && (known_lt (GET_MODE_SIZE (mode), 16)
5533 || vec_flags == VEC_ADVSIMD
5534 || vec_flags == VEC_SVE_DATA));
5536 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5537 [Rn, #offset, MUL VL]. */
5538 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5539 && (code != REG && code != PLUS))
5540 return false;
5542 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5543 REG addressing. */
5544 if (advsimd_struct_p
5545 && !BYTES_BIG_ENDIAN
5546 && (code != POST_INC && code != REG))
5547 return false;
5549 gcc_checking_assert (GET_MODE (x) == VOIDmode
5550 || SCALAR_INT_MODE_P (GET_MODE (x)));
5552 switch (code)
5554 case REG:
5555 case SUBREG:
5556 info->type = ADDRESS_REG_IMM;
5557 info->base = x;
5558 info->offset = const0_rtx;
5559 info->const_offset = 0;
5560 return aarch64_base_register_rtx_p (x, strict_p);
5562 case PLUS:
5563 op0 = XEXP (x, 0);
5564 op1 = XEXP (x, 1);
5566 if (! strict_p
5567 && REG_P (op0)
5568 && virt_or_elim_regno_p (REGNO (op0))
5569 && poly_int_rtx_p (op1, &offset))
5571 info->type = ADDRESS_REG_IMM;
5572 info->base = op0;
5573 info->offset = op1;
5574 info->const_offset = offset;
5576 return true;
5579 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5580 && aarch64_base_register_rtx_p (op0, strict_p)
5581 && poly_int_rtx_p (op1, &offset))
5583 info->type = ADDRESS_REG_IMM;
5584 info->base = op0;
5585 info->offset = op1;
5586 info->const_offset = offset;
5588 /* TImode and TFmode values are allowed in both pairs of X
5589 registers and individual Q registers. The available
5590 address modes are:
5591 X,X: 7-bit signed scaled offset
5592 Q: 9-bit signed offset
5593 We conservatively require an offset representable in either mode.
5594 When performing the check for pairs of X registers i.e. LDP/STP
5595 pass down DImode since that is the natural size of the LDP/STP
5596 instruction memory accesses. */
5597 if (mode == TImode || mode == TFmode)
5598 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5599 && (offset_9bit_signed_unscaled_p (mode, offset)
5600 || offset_12bit_unsigned_scaled_p (mode, offset)));
5602 /* A 7-bit offset check because OImode will emit an ldp/stp
5603 instruction (only big endian will get here).
5604 For ldp/stp instructions, the offset is scaled for the size of a
5605 single element of the pair. */
5606 if (mode == OImode)
5607 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5609 /* A 7-bit scaled offset check plus a 9/12-bit check at offset + 32,
5610 because CImode moves are split into multiple accesses (only big endian will get here).
5611 if (mode == CImode)
5612 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5613 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5614 || offset_12bit_unsigned_scaled_p (V16QImode,
5615 offset + 32)));
5617 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5618 instructions (only big endian will get here). */
5619 if (mode == XImode)
5620 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5621 && aarch64_offset_7bit_signed_scaled_p (TImode,
5622 offset + 32));
5624 /* Make "m" use the LD1 offset range for SVE data modes, so
5625 that pre-RTL optimizers like ivopts will target that range
5626 instead of the wider LDR/STR range. */
5627 if (vec_flags == VEC_SVE_DATA)
5628 return (type == ADDR_QUERY_M
5629 ? offset_4bit_signed_scaled_p (mode, offset)
5630 : offset_9bit_signed_scaled_p (mode, offset));
5632 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5634 poly_int64 end_offset = (offset
5635 + GET_MODE_SIZE (mode)
5636 - BYTES_PER_SVE_VECTOR);
5637 return (type == ADDR_QUERY_M
5638 ? offset_4bit_signed_scaled_p (mode, offset)
5639 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5640 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5641 end_offset)));
5644 if (vec_flags == VEC_SVE_PRED)
5645 return offset_9bit_signed_scaled_p (mode, offset);
5647 if (load_store_pair_p)
5648 return ((known_eq (GET_MODE_SIZE (mode), 4)
5649 || known_eq (GET_MODE_SIZE (mode), 8))
5650 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5651 else
5652 return (offset_9bit_signed_unscaled_p (mode, offset)
5653 || offset_12bit_unsigned_scaled_p (mode, offset));
5656 if (allow_reg_index_p)
5658 /* Look for base + (scaled/extended) index register. */
5659 if (aarch64_base_register_rtx_p (op0, strict_p)
5660 && aarch64_classify_index (info, op1, mode, strict_p))
5662 info->base = op0;
5663 return true;
5665 if (aarch64_base_register_rtx_p (op1, strict_p)
5666 && aarch64_classify_index (info, op0, mode, strict_p))
5668 info->base = op1;
5669 return true;
5673 return false;
5675 case POST_INC:
5676 case POST_DEC:
5677 case PRE_INC:
5678 case PRE_DEC:
5679 info->type = ADDRESS_REG_WB;
5680 info->base = XEXP (x, 0);
5681 info->offset = NULL_RTX;
5682 return aarch64_base_register_rtx_p (info->base, strict_p);
5684 case POST_MODIFY:
5685 case PRE_MODIFY:
5686 info->type = ADDRESS_REG_WB;
5687 info->base = XEXP (x, 0);
5688 if (GET_CODE (XEXP (x, 1)) == PLUS
5689 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5690 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5691 && aarch64_base_register_rtx_p (info->base, strict_p))
5693 info->offset = XEXP (XEXP (x, 1), 1);
5694 info->const_offset = offset;
5696 /* TImode and TFmode values are allowed in both pairs of X
5697 registers and individual Q registers. The available
5698 address modes are:
5699 X,X: 7-bit signed scaled offset
5700 Q: 9-bit signed offset
5701 We conservatively require an offset representable in either mode.
5702 */
5703 if (mode == TImode || mode == TFmode)
5704 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5705 && offset_9bit_signed_unscaled_p (mode, offset));
5707 if (load_store_pair_p)
5708 return ((known_eq (GET_MODE_SIZE (mode), 4)
5709 || known_eq (GET_MODE_SIZE (mode), 8))
5710 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5711 else
5712 return offset_9bit_signed_unscaled_p (mode, offset);
5714 return false;
5716 case CONST:
5717 case SYMBOL_REF:
5718 case LABEL_REF:
5719 /* load literal: pc-relative constant pool entry. Only supported
5720 for SI mode or larger. */
5721 info->type = ADDRESS_SYMBOLIC;
5723 if (!load_store_pair_p
5724 && GET_MODE_SIZE (mode).is_constant (&const_size)
5725 && const_size >= 4)
5727 rtx sym, addend;
5729 split_const (x, &sym, &addend);
5730 return ((GET_CODE (sym) == LABEL_REF
5731 || (GET_CODE (sym) == SYMBOL_REF
5732 && CONSTANT_POOL_ADDRESS_P (sym)
5733 && aarch64_pcrelative_literal_loads)));
5735 return false;
5737 case LO_SUM:
5738 info->type = ADDRESS_LO_SUM;
5739 info->base = XEXP (x, 0);
5740 info->offset = XEXP (x, 1);
5741 if (allow_reg_index_p
5742 && aarch64_base_register_rtx_p (info->base, strict_p))
5744 rtx sym, offs;
5745 split_const (info->offset, &sym, &offs);
5746 if (GET_CODE (sym) == SYMBOL_REF
5747 && (aarch64_classify_symbol (sym, INTVAL (offs))
5748 == SYMBOL_SMALL_ABSOLUTE))
5750 /* The symbol and offset must be aligned to the access size. */
5751 unsigned int align;
5753 if (CONSTANT_POOL_ADDRESS_P (sym))
5754 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5755 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5757 tree exp = SYMBOL_REF_DECL (sym);
5758 align = TYPE_ALIGN (TREE_TYPE (exp));
5759 align = aarch64_constant_alignment (exp, align);
5761 else if (SYMBOL_REF_DECL (sym))
5762 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5763 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5764 && SYMBOL_REF_BLOCK (sym) != NULL)
5765 align = SYMBOL_REF_BLOCK (sym)->alignment;
5766 else
5767 align = BITS_PER_UNIT;
5769 poly_int64 ref_size = GET_MODE_SIZE (mode);
5770 if (known_eq (ref_size, 0))
5771 ref_size = GET_MODE_SIZE (DImode);
5773 return (multiple_p (INTVAL (offs), ref_size)
5774 && multiple_p (align / BITS_PER_UNIT, ref_size));
5777 return false;
5779 default:
5780 return false;
5784 /* Return true if the address X is valid for a PRFM instruction.
5785 STRICT_P is true if we should do strict checking with
5786 aarch64_classify_address. */
5788 bool
5789 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5791 struct aarch64_address_info addr;
5793 /* PRFM accepts the same addresses as DImode... */
5794 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5795 if (!res)
5796 return false;
5798 /* ... except writeback forms. */
5799 return addr.type != ADDRESS_REG_WB;
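/* Return true if X is a symbolic address: a SYMBOL_REF or LABEL_REF,
   possibly offset by a constant.  */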
5802 bool
5803 aarch64_symbolic_address_p (rtx x)
5805 rtx offset;
5807 split_const (x, &x, &offset);
5808 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5811 /* Classify the base of symbolic expression X. */
5813 enum aarch64_symbol_type
5814 aarch64_classify_symbolic_expression (rtx x)
5816 rtx offset;
5818 split_const (x, &x, &offset);
5819 return aarch64_classify_symbol (x, INTVAL (offset));
5823 /* Return TRUE if X is a legitimate address for accessing memory in
5824 mode MODE. */
5825 static bool
5826 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5828 struct aarch64_address_info addr;
5830 return aarch64_classify_address (&addr, x, mode, strict_p);
5833 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5834 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5835 bool
5836 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5837 aarch64_addr_query_type type)
5839 struct aarch64_address_info addr;
5841 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5844 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5846 static bool
5847 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5848 poly_int64 orig_offset,
5849 machine_mode mode)
5851 HOST_WIDE_INT size;
5852 if (GET_MODE_SIZE (mode).is_constant (&size))
5854 HOST_WIDE_INT const_offset, second_offset;
5856 /* A general SVE offset is A * VQ + B. Remove the A component from
5857 coefficient 0 in order to get the constant B. */
5858 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
5860 /* Split an out-of-range address displacement into a base and
5861 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
5862 range otherwise to increase opportunities for sharing the base
5863 address of different sizes. Unaligned accesses use the signed
5864 9-bit range, TImode/TFmode use the intersection of signed
5865 scaled 7-bit and signed 9-bit offset. */
5866 if (mode == TImode || mode == TFmode)
5867 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
5868 else if ((const_offset & (size - 1)) != 0)
5869 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
5870 else
5871 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
5873 if (second_offset == 0 || known_eq (orig_offset, second_offset))
5874 return false;
5876 /* Split the offset into second_offset and the rest. */
5877 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
5878 *offset2 = gen_int_mode (second_offset, Pmode);
5879 return true;
5881 else
5883 /* Get the mode we should use as the basis of the range. For structure
5884 modes this is the mode of one vector. */
5885 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5886 machine_mode step_mode
5887 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
5889 /* Get the "mul vl" multiplier we'd like to use. */
5890 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
5891 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
5892 if (vec_flags & VEC_SVE_DATA)
5893 /* LDR supports a 9-bit range, but the move patterns for
5894 structure modes require all vectors to be in range of the
5895 same base. The simplest way of accommodating that while still
5896 promoting reuse of anchor points between different modes is
5897 to use an 8-bit range unconditionally. */
5898 vnum = ((vnum + 128) & 255) - 128;
5899 else
5900 /* Predicates are only handled singly, so we might as well use
5901 the full range. */
5902 vnum = ((vnum + 256) & 511) - 256;
5903 if (vnum == 0)
5904 return false;
5906 /* Convert the "mul vl" multiplier into a byte offset. */
5907 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
5908 if (known_eq (second_offset, orig_offset))
5909 return false;
5911 /* Split the offset into second_offset and the rest. */
5912 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
5913 *offset2 = gen_int_mode (second_offset, Pmode);
5914 return true;
5918 /* Return the binary representation of floating point constant VALUE in INTVAL.
5919 If the value cannot be converted, return false without setting INTVAL.
5920 The conversion is done in the floating-point mode of VALUE. */
5921 bool
5922 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
5925 /* We make a general exception for 0. */
5926 if (aarch64_float_const_zero_rtx_p (value))
5928 *intval = 0;
5929 return true;
5932 scalar_float_mode mode;
5933 if (GET_CODE (value) != CONST_DOUBLE
5934 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
5935 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
5936 /* Only support up to DF mode. */
5937 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
5938 return false;
5940 unsigned HOST_WIDE_INT ival = 0;
5942 long res[2];
5943 real_to_target (res,
5944 CONST_DOUBLE_REAL_VALUE (value),
5945 REAL_MODE_FORMAT (mode));
5947 if (mode == DFmode)
5949 int order = BYTES_BIG_ENDIAN ? 1 : 0;
5950 ival = zext_hwi (res[order], 32);
5951 ival |= (zext_hwi (res[1 - order], 32) << 32);
5953 else
5954 ival = zext_hwi (res[0], 32);
5956 *intval = ival;
5957 return true;
5960 /* Return TRUE if rtx X is an immediate constant that can be moved using a
5961 single MOV(+MOVK) followed by an FMOV. */
5962 bool
5963 aarch64_float_const_rtx_p (rtx x)
5965 machine_mode mode = GET_MODE (x);
5966 if (mode == VOIDmode)
5967 return false;
5969 /* Determine whether it's cheaper to write float constants as
5970 mov/movk pairs rather than as ldr/adrp pairs. */
5971 unsigned HOST_WIDE_INT ival;
5973 if (GET_CODE (x) == CONST_DOUBLE
5974 && SCALAR_FLOAT_MODE_P (mode)
5975 && aarch64_reinterpret_float_as_int (x, &ival))
5977 scalar_int_mode imode = (mode == HFmode
5978 ? SImode
5979 : int_mode_for_mode (mode).require ());
5980 int num_instr = aarch64_internal_mov_immediate
5981 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
5982 return num_instr < 3;
5985 return false;
5988 /* Return TRUE if rtx X is the immediate constant 0.0. */
5989 bool
5990 aarch64_float_const_zero_rtx_p (rtx x)
5992 if (GET_MODE (x) == VOIDmode)
5993 return false;
5995 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
5996 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
5997 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6000 /* Return TRUE if rtx X is an immediate constant that fits in a single
6001 MOVI immediate operation. */
6002 bool
6003 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6005 if (!TARGET_SIMD)
6006 return false;
6008 machine_mode vmode;
6009 scalar_int_mode imode;
6010 unsigned HOST_WIDE_INT ival;
6012 if (GET_CODE (x) == CONST_DOUBLE
6013 && SCALAR_FLOAT_MODE_P (mode))
6015 if (!aarch64_reinterpret_float_as_int (x, &ival))
6016 return false;
6018 /* We make a general exception for 0. */
6019 if (aarch64_float_const_zero_rtx_p (x))
6020 return true;
6022 imode = int_mode_for_mode (mode).require ();
6024 else if (GET_CODE (x) == CONST_INT
6025 && is_a <scalar_int_mode> (mode, &imode))
6026 ival = INTVAL (x);
6027 else
6028 return false;
6030 /* Use a 64-bit mode for everything except DI/DF mode, where we use
6031 a 128-bit vector mode. */
6032 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6034 vmode = aarch64_simd_container_mode (imode, width);
6035 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6037 return aarch64_simd_valid_immediate (v_op, NULL);
6041 /* Return the fixed registers used for condition codes. */
6043 static bool
6044 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6046 *p1 = CC_REGNUM;
6047 *p2 = INVALID_REGNUM;
6048 return true;
6051 /* This function is used by the call expanders of the machine description.
6052 RESULT is the register in which the result is returned. It's NULL for
6053 "call" and "sibcall".
6054 MEM is the location of the function call.
6055 SIBCALL indicates whether this function call is normal call or sibling call.
6056 It will generate different pattern accordingly. */
6058 void
6059 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6061 rtx call, callee, tmp;
6062 rtvec vec;
6063 machine_mode mode;
6065 gcc_assert (MEM_P (mem));
6066 callee = XEXP (mem, 0);
6067 mode = GET_MODE (callee);
6068 gcc_assert (mode == Pmode);
6070 /* Decide if we should generate indirect calls by loading the
6071 address of the callee into a register before performing
6072 the branch-and-link. */
6073 if (SYMBOL_REF_P (callee)
6074 ? (aarch64_is_long_call_p (callee)
6075 || aarch64_is_noplt_call_p (callee))
6076 : !REG_P (callee))
6077 XEXP (mem, 0) = force_reg (mode, callee);
6079 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6081 if (result != NULL_RTX)
6082 call = gen_rtx_SET (result, call);
6084 if (sibcall)
6085 tmp = ret_rtx;
6086 else
6087 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6089 vec = gen_rtvec (2, call, tmp);
6090 call = gen_rtx_PARALLEL (VOIDmode, vec);
6092 aarch64_emit_call_insn (call);
6095 /* Emit call insn with PAT and do aarch64-specific handling. */
6097 void
6098 aarch64_emit_call_insn (rtx pat)
6100 rtx insn = emit_call_insn (pat);
6102 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6103 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6104 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
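/* Implement SELECT_CC_MODE: return the condition-code mode to use for
   a comparison of X and Y using comparison code CODE.  */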
6107 machine_mode
6108 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6110 /* All floating point compares return CCFP if it is an equality
6111 comparison, and CCFPE otherwise. */
6112 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6114 switch (code)
6116 case EQ:
6117 case NE:
6118 case UNORDERED:
6119 case ORDERED:
6120 case UNLT:
6121 case UNLE:
6122 case UNGT:
6123 case UNGE:
6124 case UNEQ:
6125 return CCFPmode;
6127 case LT:
6128 case LE:
6129 case GT:
6130 case GE:
6131 case LTGT:
6132 return CCFPEmode;
6134 default:
6135 gcc_unreachable ();
6139 /* Equality comparisons of short modes against zero can be performed
6140 using the TST instruction with the appropriate bitmask. */
6141 if (y == const0_rtx && REG_P (x)
6142 && (code == EQ || code == NE)
6143 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6144 return CC_NZmode;
6146 /* Similarly, comparisons of zero_extends from shorter modes can
6147 be performed using an ANDS with an immediate mask. */
6148 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6149 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6150 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6151 && (code == EQ || code == NE))
6152 return CC_NZmode;
6154 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6155 && y == const0_rtx
6156 && (code == EQ || code == NE || code == LT || code == GE)
6157 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6158 || GET_CODE (x) == NEG
6159 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6160 && CONST_INT_P (XEXP (x, 2)))))
6161 return CC_NZmode;
6163 /* A compare with a shifted operand. Because of canonicalization,
6164 the comparison will have to be swapped when we emit the assembly
6165 code. */
6166 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6167 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6168 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6169 || GET_CODE (x) == LSHIFTRT
6170 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6171 return CC_SWPmode;
6173 /* Similarly for a negated operand, but we can only do this for
6174 equalities. */
6175 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6176 && (REG_P (y) || GET_CODE (y) == SUBREG)
6177 && (code == EQ || code == NE)
6178 && GET_CODE (x) == NEG)
6179 return CC_Zmode;
6181 /* A test for unsigned overflow. */
6182 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6183 && code == NE
6184 && GET_CODE (x) == PLUS
6185 && GET_CODE (y) == ZERO_EXTEND)
6186 return CC_Cmode;
6188 /* For everything else, return CCmode. */
6189 return CCmode;
6192 static int
6193 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
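/* Return the AArch64 condition code (AARCH64_EQ, AARCH64_NE, ...) that
   corresponds to comparison rtx X, or -1 if there is no single
   corresponding condition.  */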
6195 int
6196 aarch64_get_condition_code (rtx x)
6198 machine_mode mode = GET_MODE (XEXP (x, 0));
6199 enum rtx_code comp_code = GET_CODE (x);
6201 if (GET_MODE_CLASS (mode) != MODE_CC)
6202 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6203 return aarch64_get_condition_code_1 (mode, comp_code);
6206 static int
6207 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6209 switch (mode)
6211 case E_CCFPmode:
6212 case E_CCFPEmode:
6213 switch (comp_code)
6215 case GE: return AARCH64_GE;
6216 case GT: return AARCH64_GT;
6217 case LE: return AARCH64_LS;
6218 case LT: return AARCH64_MI;
6219 case NE: return AARCH64_NE;
6220 case EQ: return AARCH64_EQ;
6221 case ORDERED: return AARCH64_VC;
6222 case UNORDERED: return AARCH64_VS;
6223 case UNLT: return AARCH64_LT;
6224 case UNLE: return AARCH64_LE;
6225 case UNGT: return AARCH64_HI;
6226 case UNGE: return AARCH64_PL;
6227 default: return -1;
6229 break;
6231 case E_CCmode:
6232 switch (comp_code)
6234 case NE: return AARCH64_NE;
6235 case EQ: return AARCH64_EQ;
6236 case GE: return AARCH64_GE;
6237 case GT: return AARCH64_GT;
6238 case LE: return AARCH64_LE;
6239 case LT: return AARCH64_LT;
6240 case GEU: return AARCH64_CS;
6241 case GTU: return AARCH64_HI;
6242 case LEU: return AARCH64_LS;
6243 case LTU: return AARCH64_CC;
6244 default: return -1;
6246 break;
6248 case E_CC_SWPmode:
6249 switch (comp_code)
6251 case NE: return AARCH64_NE;
6252 case EQ: return AARCH64_EQ;
6253 case GE: return AARCH64_LE;
6254 case GT: return AARCH64_LT;
6255 case LE: return AARCH64_GE;
6256 case LT: return AARCH64_GT;
6257 case GEU: return AARCH64_LS;
6258 case GTU: return AARCH64_CC;
6259 case LEU: return AARCH64_CS;
6260 case LTU: return AARCH64_HI;
6261 default: return -1;
6263 break;
6265 case E_CC_NZmode:
6266 switch (comp_code)
6268 case NE: return AARCH64_NE;
6269 case EQ: return AARCH64_EQ;
6270 case GE: return AARCH64_PL;
6271 case LT: return AARCH64_MI;
6272 default: return -1;
6274 break;
6276 case E_CC_Zmode:
6277 switch (comp_code)
6279 case NE: return AARCH64_NE;
6280 case EQ: return AARCH64_EQ;
6281 default: return -1;
6283 break;
6285 case E_CC_Cmode:
6286 switch (comp_code)
6288 case NE: return AARCH64_CS;
6289 case EQ: return AARCH64_CC;
6290 default: return -1;
6292 break;
6294 default:
6295 return -1;
6298 return -1;
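/* Return true if X is a duplicated constant vector whose replicated
   element is a CONST_INT in the range [MINVAL, MAXVAL].  */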
6301 bool
6302 aarch64_const_vec_all_same_in_range_p (rtx x,
6303 HOST_WIDE_INT minval,
6304 HOST_WIDE_INT maxval)
6306 rtx elt;
6307 return (const_vec_duplicate_p (x, &elt)
6308 && CONST_INT_P (elt)
6309 && IN_RANGE (INTVAL (elt), minval, maxval));
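/* Return true if X is a duplicated constant vector whose replicated
   element equals VAL.  */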
6312 bool
6313 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6315 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6318 /* Return true if VEC is a constant in which every element is in the range
6319 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6321 static bool
6322 aarch64_const_vec_all_in_range_p (rtx vec,
6323 HOST_WIDE_INT minval,
6324 HOST_WIDE_INT maxval)
6326 if (GET_CODE (vec) != CONST_VECTOR
6327 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6328 return false;
6330 int nunits;
6331 if (!CONST_VECTOR_STEPPED_P (vec))
6332 nunits = const_vector_encoded_nelts (vec);
6333 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6334 return false;
6336 for (int i = 0; i < nunits; i++)
6338 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6339 if (!CONST_INT_P (vec_elem)
6340 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6341 return false;
6343 return true;
6346 /* N Z C V. */
6347 #define AARCH64_CC_V 1
6348 #define AARCH64_CC_C (1 << 1)
6349 #define AARCH64_CC_Z (1 << 2)
6350 #define AARCH64_CC_N (1 << 3)
6352 /* N Z C V flags for ccmp. Indexed by aarch64_cond_code. */
6353 static const int aarch64_nzcv_codes[] =
6355 0, /* EQ, Z == 1. */
6356 AARCH64_CC_Z, /* NE, Z == 0. */
6357 0, /* CS, C == 1. */
6358 AARCH64_CC_C, /* CC, C == 0. */
6359 0, /* MI, N == 1. */
6360 AARCH64_CC_N, /* PL, N == 0. */
6361 0, /* VS, V == 1. */
6362 AARCH64_CC_V, /* VC, V == 0. */
6363 0, /* HI, C == 1 && Z == 0. */
6364 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6365 AARCH64_CC_V, /* GE, N == V. */
6366 0, /* LT, N != V. */
6367 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6368 0, /* LE, !(Z == 0 && N == V). */
6369 0, /* AL, Any. */
6370 0 /* NV, Any. */
6373 /* Print floating-point vector immediate operand X to F, negating it
6374 first if NEGATE is true. Return true on success, false if it isn't
6375 a constant we can handle. */
6377 static bool
6378 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6380 rtx elt;
6382 if (!const_vec_duplicate_p (x, &elt))
6383 return false;
6385 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6386 if (negate)
6387 r = real_value_negate (&r);
6389 /* We only handle the SVE single-bit immediates here. */
6390 if (real_equal (&r, &dconst0))
6391 asm_fprintf (f, "0.0");
6392 else if (real_equal (&r, &dconst1))
6393 asm_fprintf (f, "1.0");
6394 else if (real_equal (&r, &dconsthalf))
6395 asm_fprintf (f, "0.5");
6396 else
6397 return false;
6399 return true;
6402 /* Return the equivalent letter for size. */
6403 static char
6404 sizetochar (int size)
6406 switch (size)
6408 case 64: return 'd';
6409 case 32: return 's';
6410 case 16: return 'h';
6411 case 8 : return 'b';
6412 default: gcc_unreachable ();
6416 /* Print operand X to file F in a target specific manner according to CODE.
6417 The acceptable formatting commands given by CODE are:
6418 'c': An integer or symbol address without a preceding #
6419 sign.
6420 'C': Take the duplicated element in a vector constant
6421 and print it in hex.
6422 'D': Take the duplicated element in a vector constant
6423 and print it as an unsigned integer, in decimal.
6424 'e': Print the sign/zero-extend size as a character 8->b,
6425 16->h, 32->w.
6426 'p': Prints N such that 2^N == X (X must be power of 2 and
6427 const int).
6428 'P': Print the number of non-zero bits in X (a const_int).
6429 'H': Print the higher numbered register of a pair (TImode)
6430 of regs.
6431 'm': Print a condition (eq, ne, etc).
6432 'M': Same as 'm', but invert condition.
6433 'N': Take the duplicated element in a vector constant
6434 and print the negative of it in decimal.
6435 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6436 'S/T/U/V': Print a FP/SIMD register name for a register list.
6437 The register printed is the FP/SIMD register name
6438 of X + 0/1/2/3 for S/T/U/V.
6439 'R': Print a scalar FP/SIMD register name + 1.
6440 'X': Print bottom 16 bits of integer constant in hex.
6441 'w/x': Print a general register name or the zero register
6442 (32-bit or 64-bit).
6443 '0': Print a normal operand; if it's a general register,
6444 then we assume DImode.
6445 'k': Print NZCV for conditional compare instructions.
6446 'A': Output address constant representing the first
6447 argument of X, specifying a relocation offset
6448 if appropriate.
6449 'L': Output constant address specified by X
6450 with a relocation offset if appropriate.
6451 'G': Prints address of X, specifying a PC relative
6452 relocation mode if appropriate.
6453 'y': Output address of LDP or STP - this is used for
6454 some LDP/STPs which don't use a PARALLEL in their
6455 pattern (so the mode needs to be adjusted).
6456 'z': Output address of a typical LDP or STP. */
6458 static void
6459 aarch64_print_operand (FILE *f, rtx x, int code)
6461 rtx elt;
6462 switch (code)
6464 case 'c':
6465 switch (GET_CODE (x))
6467 case CONST_INT:
6468 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6469 break;
6471 case SYMBOL_REF:
6472 output_addr_const (f, x);
6473 break;
6475 case CONST:
6476 if (GET_CODE (XEXP (x, 0)) == PLUS
6477 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6479 output_addr_const (f, x);
6480 break;
6482 /* Fall through. */
6484 default:
6485 output_operand_lossage ("unsupported operand for code '%c'", code);
6487 break;
6489 case 'e':
6491 int n;
6493 if (!CONST_INT_P (x)
6494 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6496 output_operand_lossage ("invalid operand for '%%%c'", code);
6497 return;
6500 switch (n)
6502 case 3:
6503 fputc ('b', f);
6504 break;
6505 case 4:
6506 fputc ('h', f);
6507 break;
6508 case 5:
6509 fputc ('w', f);
6510 break;
6511 default:
6512 output_operand_lossage ("invalid operand for '%%%c'", code);
6513 return;
6516 break;
6518 case 'p':
6520 int n;
6522 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6524 output_operand_lossage ("invalid operand for '%%%c'", code);
6525 return;
6528 asm_fprintf (f, "%d", n);
6530 break;
6532 case 'P':
6533 if (!CONST_INT_P (x))
6535 output_operand_lossage ("invalid operand for '%%%c'", code);
6536 return;
6539 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6540 break;
6542 case 'H':
6543 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6545 output_operand_lossage ("invalid operand for '%%%c'", code);
6546 return;
6549 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6550 break;
6552 case 'M':
6553 case 'm':
6555 int cond_code;
6556 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6557 if (x == const_true_rtx)
6559 if (code == 'M')
6560 fputs ("nv", f);
6561 return;
6564 if (!COMPARISON_P (x))
6566 output_operand_lossage ("invalid operand for '%%%c'", code);
6567 return;
6570 cond_code = aarch64_get_condition_code (x);
6571 gcc_assert (cond_code >= 0);
6572 if (code == 'M')
6573 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6574 fputs (aarch64_condition_codes[cond_code], f);
6576 break;
6578 case 'N':
6579 if (!const_vec_duplicate_p (x, &elt))
6581 output_operand_lossage ("invalid vector constant");
6582 return;
6585 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6586 asm_fprintf (f, "%wd", -INTVAL (elt));
6587 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6588 && aarch64_print_vector_float_operand (f, x, true))
6590 else
6592 output_operand_lossage ("invalid vector constant");
6593 return;
6595 break;
6597 case 'b':
6598 case 'h':
6599 case 's':
6600 case 'd':
6601 case 'q':
6602 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6604 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6605 return;
6607 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6608 break;
6610 case 'S':
6611 case 'T':
6612 case 'U':
6613 case 'V':
6614 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6616 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6617 return;
6619 asm_fprintf (f, "%c%d",
6620 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6621 REGNO (x) - V0_REGNUM + (code - 'S'));
6622 break;
6624 case 'R':
6625 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6627 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6628 return;
6630 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6631 break;
6633 case 'X':
6634 if (!CONST_INT_P (x))
6636 output_operand_lossage ("invalid operand for '%%%c'", code);
6637 return;
6639 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6640 break;
6642 case 'C':
6644 /* Print a replicated constant in hex. */
6645 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6647 output_operand_lossage ("invalid operand for '%%%c'", code);
6648 return;
6650 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6651 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6653 break;
6655 case 'D':
6657 /* Print a replicated constant in decimal, treating it as
6658 unsigned. */
6659 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6661 output_operand_lossage ("invalid operand for '%%%c'", code);
6662 return;
6664 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6665 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6667 break;
6669 case 'w':
6670 case 'x':
6671 if (x == const0_rtx
6672 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6674 asm_fprintf (f, "%czr", code);
6675 break;
6678 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6680 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6681 break;
6684 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6686 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6687 break;
6690 /* Fall through */
6692 case 0:
6693 if (x == NULL)
6695 output_operand_lossage ("missing operand");
6696 return;
6699 switch (GET_CODE (x))
6701 case REG:
6702 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6704 if (REG_NREGS (x) == 1)
6705 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6706 else
6708 char suffix
6709 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6710 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6711 REGNO (x) - V0_REGNUM, suffix,
6712 END_REGNO (x) - V0_REGNUM - 1, suffix);
6715 else
6716 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6717 break;
6719 case MEM:
6720 output_address (GET_MODE (x), XEXP (x, 0));
6721 break;
6723 case LABEL_REF:
6724 case SYMBOL_REF:
6725 output_addr_const (asm_out_file, x);
6726 break;
6728 case CONST_INT:
6729 asm_fprintf (f, "%wd", INTVAL (x));
6730 break;
6732 case CONST:
6733 if (!VECTOR_MODE_P (GET_MODE (x)))
6735 output_addr_const (asm_out_file, x);
6736 break;
6738 /* fall through */
6740 case CONST_VECTOR:
6741 if (!const_vec_duplicate_p (x, &elt))
6743 output_operand_lossage ("invalid vector constant");
6744 return;
6747 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6748 asm_fprintf (f, "%wd", INTVAL (elt));
6749 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6750 && aarch64_print_vector_float_operand (f, x, false))
6752 else
6754 output_operand_lossage ("invalid vector constant");
6755 return;
6757 break;
6759 case CONST_DOUBLE:
6760 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6761 be getting CONST_DOUBLEs holding integers. */
6762 gcc_assert (GET_MODE (x) != VOIDmode);
6763 if (aarch64_float_const_zero_rtx_p (x))
6765 fputc ('0', f);
6766 break;
6768 else if (aarch64_float_const_representable_p (x))
6770 #define buf_size 20
6771 char float_buf[buf_size] = {'\0'};
6772 real_to_decimal_for_mode (float_buf,
6773 CONST_DOUBLE_REAL_VALUE (x),
6774 buf_size, buf_size,
6775 1, GET_MODE (x));
6776 asm_fprintf (asm_out_file, "%s", float_buf);
6777 break;
6778 #undef buf_size
6780 output_operand_lossage ("invalid constant");
6781 return;
6782 default:
6783 output_operand_lossage ("invalid operand");
6784 return;
6786 break;
6788 case 'A':
6789 if (GET_CODE (x) == HIGH)
6790 x = XEXP (x, 0);
6792 switch (aarch64_classify_symbolic_expression (x))
6794 case SYMBOL_SMALL_GOT_4G:
6795 asm_fprintf (asm_out_file, ":got:");
6796 break;
6798 case SYMBOL_SMALL_TLSGD:
6799 asm_fprintf (asm_out_file, ":tlsgd:");
6800 break;
6802 case SYMBOL_SMALL_TLSDESC:
6803 asm_fprintf (asm_out_file, ":tlsdesc:");
6804 break;
6806 case SYMBOL_SMALL_TLSIE:
6807 asm_fprintf (asm_out_file, ":gottprel:");
6808 break;
6810 case SYMBOL_TLSLE24:
6811 asm_fprintf (asm_out_file, ":tprel:");
6812 break;
6814 case SYMBOL_TINY_GOT:
6815 gcc_unreachable ();
6816 break;
6818 default:
6819 break;
6821 output_addr_const (asm_out_file, x);
6822 break;
6824 case 'L':
6825 switch (aarch64_classify_symbolic_expression (x))
6827 case SYMBOL_SMALL_GOT_4G:
6828 asm_fprintf (asm_out_file, ":lo12:");
6829 break;
6831 case SYMBOL_SMALL_TLSGD:
6832 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6833 break;
6835 case SYMBOL_SMALL_TLSDESC:
6836 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6837 break;
6839 case SYMBOL_SMALL_TLSIE:
6840 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6841 break;
6843 case SYMBOL_TLSLE12:
6844 asm_fprintf (asm_out_file, ":tprel_lo12:");
6845 break;
6847 case SYMBOL_TLSLE24:
6848 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6849 break;
6851 case SYMBOL_TINY_GOT:
6852 asm_fprintf (asm_out_file, ":got:");
6853 break;
6855 case SYMBOL_TINY_TLSIE:
6856 asm_fprintf (asm_out_file, ":gottprel:");
6857 break;
6859 default:
6860 break;
6862 output_addr_const (asm_out_file, x);
6863 break;
6865 case 'G':
6866 switch (aarch64_classify_symbolic_expression (x))
6868 case SYMBOL_TLSLE24:
6869 asm_fprintf (asm_out_file, ":tprel_hi12:");
6870 break;
6871 default:
6872 break;
6874 output_addr_const (asm_out_file, x);
6875 break;
6877 case 'k':
6879 HOST_WIDE_INT cond_code;
6881 if (!CONST_INT_P (x))
6883 output_operand_lossage ("invalid operand for '%%%c'", code);
6884 return;
6887 cond_code = INTVAL (x);
6888 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
6889 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
6891 break;
6893 case 'y':
6894 case 'z':
6896 machine_mode mode = GET_MODE (x);
6898 if (GET_CODE (x) != MEM
6899 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
6901 output_operand_lossage ("invalid operand for '%%%c'", code);
6902 return;
6905 if (code == 'y')
6906 /* LDP/STP which uses a single double-width memory operand.
6907 Adjust the mode to appear like a typical LDP/STP.
6908 Currently this is supported for 16-byte accesses only. */
6909 mode = DFmode;
6911 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
6912 output_operand_lossage ("invalid operand prefix '%%%c'", code);
6914 break;
6916 default:
6917 output_operand_lossage ("invalid operand prefix '%%%c'", code);
6918 return;
6922 /* Print address 'x' of a memory access with mode 'mode'.
6923 'type' is the query context required by aarch64_classify_address: for example
6924 ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
6925 static bool
6926 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
6927 aarch64_addr_query_type type)
6929 struct aarch64_address_info addr;
6930 unsigned int size;
6932 /* Check all addresses are Pmode - including ILP32. */
6933 gcc_assert (GET_MODE (x) == Pmode);
6935 if (aarch64_classify_address (&addr, x, mode, true, type))
6936 switch (addr.type)
6938 case ADDRESS_REG_IMM:
6939 if (known_eq (addr.const_offset, 0))
6940 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
6941 else if (aarch64_sve_data_mode_p (mode))
6943 HOST_WIDE_INT vnum
6944 = exact_div (addr.const_offset,
6945 BYTES_PER_SVE_VECTOR).to_constant ();
6946 asm_fprintf (f, "[%s, #%wd, mul vl]",
6947 reg_names[REGNO (addr.base)], vnum);
6949 else if (aarch64_sve_pred_mode_p (mode))
6951 HOST_WIDE_INT vnum
6952 = exact_div (addr.const_offset,
6953 BYTES_PER_SVE_PRED).to_constant ();
6954 asm_fprintf (f, "[%s, #%wd, mul vl]",
6955 reg_names[REGNO (addr.base)], vnum);
6957 else
6958 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
6959 INTVAL (addr.offset));
6960 return true;
6962 case ADDRESS_REG_REG:
6963 if (addr.shift == 0)
6964 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
6965 reg_names [REGNO (addr.offset)]);
6966 else
6967 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
6968 reg_names [REGNO (addr.offset)], addr.shift);
6969 return true;
6971 case ADDRESS_REG_UXTW:
6972 if (addr.shift == 0)
6973 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
6974 REGNO (addr.offset) - R0_REGNUM);
6975 else
6976 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
6977 REGNO (addr.offset) - R0_REGNUM, addr.shift);
6978 return true;
6980 case ADDRESS_REG_SXTW:
6981 if (addr.shift == 0)
6982 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
6983 REGNO (addr.offset) - R0_REGNUM);
6984 else
6985 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
6986 REGNO (addr.offset) - R0_REGNUM, addr.shift);
6987 return true;
6989 case ADDRESS_REG_WB:
6990 /* Writeback is only supported for fixed-width modes. */
6991 size = GET_MODE_SIZE (mode).to_constant ();
6992 switch (GET_CODE (x))
6994 case PRE_INC:
6995 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
6996 return true;
6997 case POST_INC:
6998 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
6999 return true;
7000 case PRE_DEC:
7001 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7002 return true;
7003 case POST_DEC:
7004 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7005 return true;
7006 case PRE_MODIFY:
7007 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7008 INTVAL (addr.offset));
7009 return true;
7010 case POST_MODIFY:
7011 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7012 INTVAL (addr.offset));
7013 return true;
7014 default:
7015 break;
7017 break;
7019 case ADDRESS_LO_SUM:
7020 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7021 output_addr_const (f, addr.offset);
7022 asm_fprintf (f, "]");
7023 return true;
7025 case ADDRESS_SYMBOLIC:
7026 output_addr_const (f, x);
7027 return true;
7030 return false;
7033 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7034 static bool
7035 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7037 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7040 /* Print address 'x' of a memory access with mode 'mode'. */
7041 static void
7042 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7044 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7045 output_addr_const (f, x);
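/* Return nonzero if the rtx X mentions a label (LABEL_REF), searching
   recursively through its operands.  Labels inside UNSPEC_TLS operands
   are ignored, since they are constant offsets rather than symbols.  */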
7048 bool
7049 aarch64_label_mentioned_p (rtx x)
7051 const char *fmt;
7052 int i;
7054 if (GET_CODE (x) == LABEL_REF)
7055 return true;
7057 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7058 referencing instruction, but they are constant offsets, not
7059 symbols. */
7060 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7061 return false;
7063 fmt = GET_RTX_FORMAT (GET_CODE (x));
7064 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7066 if (fmt[i] == 'E')
7068 int j;
7070 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7071 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7072 return 1;
7074 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7075 return 1;
7078 return 0;
7081 /* Implement REGNO_REG_CLASS. */
7083 enum reg_class
7084 aarch64_regno_regclass (unsigned regno)
7086 if (GP_REGNUM_P (regno))
7087 return GENERAL_REGS;
7089 if (regno == SP_REGNUM)
7090 return STACK_REG;
7092 if (regno == FRAME_POINTER_REGNUM
7093 || regno == ARG_POINTER_REGNUM)
7094 return POINTER_REGS;
7096 if (FP_REGNUM_P (regno))
7097 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7099 if (PR_REGNUM_P (regno))
7100 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7102 return NO_REGS;
7105 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7106 If OFFSET is out of range, return an offset of an anchor point
7107 that is in range. Return 0 otherwise. */
7109 static HOST_WIDE_INT
7110 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7111 machine_mode mode)
7113 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7114 if (size > 16)
7115 return (offset + 0x400) & ~0x7f0;
7117 /* For offsets that aren't a multiple of the access size, the limit is
7118 -256...255. */
7119 if (offset & (size - 1))
7121 /* BLKmode typically uses LDP of X-registers. */
7122 if (mode == BLKmode)
7123 return (offset + 512) & ~0x3ff;
7124 return (offset + 0x100) & ~0x1ff;
7127 /* Small negative offsets are supported. */
7128 if (IN_RANGE (offset, -256, 0))
7129 return 0;
7131 if (mode == TImode || mode == TFmode)
7132 return (offset + 0x100) & ~0x1ff;
7134 /* Use a 12-bit offset, scaled by the access size. */
7135 return offset & (~0xfff * size);
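/* Implement TARGET_LEGITIMIZE_ADDRESS: try to rewrite address X, used to
   access MODE, into a form that is both legitimate and CSE-friendly.  */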
7138 static rtx
7139 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7141 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7142 where mask is selected by alignment and size of the offset.
7143 We try to pick as large a range for the offset as possible to
7144 maximize the chance of a CSE. However, for aligned addresses
7145 we limit the range to 4k so that structures with different sized
7146 elements are likely to use the same base. We need to be careful
7147 not to split a CONST for some forms of address expression, otherwise
7148 it will generate sub-optimal code. */
7150 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7152 rtx base = XEXP (x, 0);
7153 rtx offset_rtx = XEXP (x, 1);
7154 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7156 if (GET_CODE (base) == PLUS)
7158 rtx op0 = XEXP (base, 0);
7159 rtx op1 = XEXP (base, 1);
7161 /* Force any scaling into a temp for CSE. */
7162 op0 = force_reg (Pmode, op0);
7163 op1 = force_reg (Pmode, op1);
7165 /* Let the pointer register be in op0. */
7166 if (REG_POINTER (op1))
7167 std::swap (op0, op1);
7169 /* If the pointer is virtual or frame related, then we know that
7170 virtual register instantiation or register elimination is going
7171 to apply a second constant. We want the two constants folded
7172 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7173 if (virt_or_elim_regno_p (REGNO (op0)))
7175 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7176 NULL_RTX, true, OPTAB_DIRECT);
7177 return gen_rtx_PLUS (Pmode, base, op1);
7180 /* Otherwise, in order to encourage CSE (and thence loop strength
7181 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7182 base = expand_binop (Pmode, add_optab, op0, op1,
7183 NULL_RTX, true, OPTAB_DIRECT);
7184 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7187 HOST_WIDE_INT size;
7188 if (GET_MODE_SIZE (mode).is_constant (&size))
7190 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7191 mode);
7192 if (base_offset != 0)
7194 base = plus_constant (Pmode, base, base_offset);
7195 base = force_operand (base, NULL_RTX);
7196 return plus_constant (Pmode, base, offset - base_offset);
7201 return x;
7204 /* Return the reload icode required for a constant pool load in mode MODE. */
7205 static enum insn_code
7206 aarch64_constant_pool_reload_icode (machine_mode mode)
7208 switch (mode)
7210 case E_SFmode:
7211 return CODE_FOR_aarch64_reload_movcpsfdi;
7213 case E_DFmode:
7214 return CODE_FOR_aarch64_reload_movcpdfdi;
7216 case E_TFmode:
7217 return CODE_FOR_aarch64_reload_movcptfdi;
7219 case E_V8QImode:
7220 return CODE_FOR_aarch64_reload_movcpv8qidi;
7222 case E_V16QImode:
7223 return CODE_FOR_aarch64_reload_movcpv16qidi;
7225 case E_V4HImode:
7226 return CODE_FOR_aarch64_reload_movcpv4hidi;
7228 case E_V8HImode:
7229 return CODE_FOR_aarch64_reload_movcpv8hidi;
7231 case E_V2SImode:
7232 return CODE_FOR_aarch64_reload_movcpv2sidi;
7234 case E_V4SImode:
7235 return CODE_FOR_aarch64_reload_movcpv4sidi;
7237 case E_V2DImode:
7238 return CODE_FOR_aarch64_reload_movcpv2didi;
7240 case E_V2DFmode:
7241 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7243 default:
7244 gcc_unreachable ();
7247 gcc_unreachable ();
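/* Implement TARGET_SECONDARY_RELOAD: return the register class required
   to copy X of mode MODE into or out of class RCLASS, setting SRI->icode
   when a scratch-based reload pattern is needed instead.  */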
7249 static reg_class_t
7250 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7251 reg_class_t rclass,
7252 machine_mode mode,
7253 secondary_reload_info *sri)
7255 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7256 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7257 comment at the head of aarch64-sve.md for more details about the
7258 big-endian handling. */
7259 if (BYTES_BIG_ENDIAN
7260 && reg_class_subset_p (rclass, FP_REGS)
7261 && !((REG_P (x) && HARD_REGISTER_P (x))
7262 || aarch64_simd_valid_immediate (x, NULL))
7263 && aarch64_sve_data_mode_p (mode))
7265 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7266 return NO_REGS;
7269 /* If we have to disable direct literal pool loads and stores because the
7270 function is too big, then we need a scratch register. */
7271 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7272 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7273 || targetm.vector_mode_supported_p (GET_MODE (x)))
7274 && !aarch64_pcrelative_literal_loads)
7276 sri->icode = aarch64_constant_pool_reload_icode (mode);
7277 return NO_REGS;
7280 /* Without the TARGET_SIMD instructions we cannot move a Q register
7281 to a Q register directly. We need a scratch. */
7282 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7283 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7284 && reg_class_subset_p (rclass, FP_REGS))
7286 if (mode == TFmode)
7287 sri->icode = CODE_FOR_aarch64_reload_movtf;
7288 else if (mode == TImode)
7289 sri->icode = CODE_FOR_aarch64_reload_movti;
7290 return NO_REGS;
7293 /* A TFmode or TImode memory access should be handled via an FP register
7294 because AArch64 has richer addressing modes for LDR/STR instructions
7295 than LDP/STP instructions. */
7296 if (TARGET_FLOAT && rclass == GENERAL_REGS
7297 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7298 return FP_REGS;
7300 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7301 return GENERAL_REGS;
7303 return NO_REGS;
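/* Implement TARGET_CAN_ELIMINATE: return true if register FROM can
   currently be eliminated in favour of register TO.  */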
7306 static bool
7307 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7309 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7311 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7312 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7313 if (frame_pointer_needed)
7314 return to == HARD_FRAME_POINTER_REGNUM;
7315 return true;
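/* Implement INITIAL_ELIMINATION_OFFSET: return the offset to add when
   eliminating register FROM in favour of register TO.  */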
7318 poly_int64
7319 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7321 aarch64_layout_frame ();
7323 if (to == HARD_FRAME_POINTER_REGNUM)
7325 if (from == ARG_POINTER_REGNUM)
7326 return cfun->machine->frame.hard_fp_offset;
7328 if (from == FRAME_POINTER_REGNUM)
7329 return cfun->machine->frame.hard_fp_offset
7330 - cfun->machine->frame.locals_offset;
7333 if (to == STACK_POINTER_REGNUM)
7335 if (from == FRAME_POINTER_REGNUM)
7336 return cfun->machine->frame.frame_size
7337 - cfun->machine->frame.locals_offset;
7340 return cfun->machine->frame.frame_size;
7343 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7344 previous frame. */
7346 rtx
7347 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7349 if (count != 0)
7350 return const0_rtx;
7351 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
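/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE: output the fixed code part
   of a trampoline to F.  */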
7355 static void
7356 aarch64_asm_trampoline_template (FILE *f)
7358 if (TARGET_ILP32)
7360 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7361 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7363 else
7365 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7366 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7368 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7369 assemble_aligned_integer (4, const0_rtx);
7370 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7371 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
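/* Implement TARGET_TRAMPOLINE_INIT: initialize trampoline M_TRAMP so
   that it calls FNDECL with static chain value CHAIN_VALUE.  */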
7374 static void
7375 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7377 rtx fnaddr, mem, a_tramp;
7378 const int tramp_code_sz = 16;
7380 /* We don't need to copy the trailing D-words; we fill those in below. */
7381 emit_block_move (m_tramp, assemble_trampoline_template (),
7382 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7383 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7384 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7385 if (GET_MODE (fnaddr) != ptr_mode)
7386 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7387 emit_move_insn (mem, fnaddr);
7389 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7390 emit_move_insn (mem, chain_value);
7392 /* XXX We should really define a "clear_cache" pattern and use
7393 gen_clear_cache(). */
7394 a_tramp = XEXP (m_tramp, 0);
7395 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7396 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7397 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7398 ptr_mode);
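/* Implement TARGET_CLASS_MAX_NREGS: return the maximum number of
   registers of class REGCLASS needed to hold a value of mode MODE.  */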
7401 static unsigned char
7402 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7404 /* ??? Logically we should only need to provide a value when
7405 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7406 can hold MODE, but at the moment we need to handle all modes.
7407 Just ignore any runtime parts for registers that can't store them. */
7408 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7409 unsigned int nregs;
7410 switch (regclass)
7412 case CALLER_SAVE_REGS:
7413 case POINTER_REGS:
7414 case GENERAL_REGS:
7415 case ALL_REGS:
7416 case POINTER_AND_FP_REGS:
7417 case FP_REGS:
7418 case FP_LO_REGS:
7419 if (aarch64_sve_data_mode_p (mode)
7420 && constant_multiple_p (GET_MODE_SIZE (mode),
7421 BYTES_PER_SVE_VECTOR, &nregs))
7422 return nregs;
7423 return (aarch64_vector_data_mode_p (mode)
7424 ? CEIL (lowest_size, UNITS_PER_VREG)
7425 : CEIL (lowest_size, UNITS_PER_WORD));
7426 case STACK_REG:
7427 case PR_REGS:
7428 case PR_LO_REGS:
7429 case PR_HI_REGS:
7430 return 1;
7432 case NO_REGS:
7433 return 0;
7435 default:
7436 break;
7438 gcc_unreachable ();
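/* Implement TARGET_PREFERRED_RELOAD_CLASS: return the register class to
   use when reloading X into a register of class REGCLASS.  */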
7441 static reg_class_t
7442 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7444 if (regclass == POINTER_REGS)
7445 return GENERAL_REGS;
7447 if (regclass == STACK_REG)
7449 if (REG_P(x)
7450 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7451 return regclass;
7453 return NO_REGS;
7456 /* Register elimination can result in a request for
7457 SP+constant->FP_REGS. We cannot support such operations, which
7458 use SP as the source and an FP_REG as the destination, so reject
7459 them outright here. */
7460 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7462 rtx lhs = XEXP (x, 0);
7464 /* Look through a possible SUBREG introduced by ILP32. */
7465 if (GET_CODE (lhs) == SUBREG)
7466 lhs = SUBREG_REG (lhs);
7468 gcc_assert (REG_P (lhs));
7469 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7470 POINTER_REGS));
7471 return NO_REGS;
7474 return regclass;
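/* Implement ASM_OUTPUT_LABELREF: print the assembler reference to NAME,
   applying the user label prefix.  */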
7477 void
7478 aarch64_asm_output_labelref (FILE* f, const char *name)
7480 asm_fprintf (f, "%U%s", name);
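/* Implement TARGET_ASM_CONSTRUCTOR: output a constructor entry for
   SYMBOL, using a priority-specific .init_array section when PRIORITY is
   not the default.  */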
7483 static void
7484 aarch64_elf_asm_constructor (rtx symbol, int priority)
7486 if (priority == DEFAULT_INIT_PRIORITY)
7487 default_ctor_section_asm_out_constructor (symbol, priority);
7488 else
7490 section *s;
7491 /* Although priority is known to be in the range [0, 65535], so 18 bytes
7492 would be enough, the compiler might not know that. To avoid a
7493 -Wformat-truncation false positive, use a larger buffer size. */
7494 char buf[23];
7495 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7496 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7497 switch_to_section (s);
7498 assemble_align (POINTER_SIZE);
7499 assemble_aligned_integer (POINTER_BYTES, symbol);
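/* Implement TARGET_ASM_DESTRUCTOR: output a destructor entry for SYMBOL,
   using a priority-specific .fini_array section when PRIORITY is not the
   default.  */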
7503 static void
7504 aarch64_elf_asm_destructor (rtx symbol, int priority)
7506 if (priority == DEFAULT_INIT_PRIORITY)
7507 default_dtor_section_asm_out_destructor (symbol, priority);
7508 else
7510 section *s;
7511 /* Although priority is known to be in the range [0, 65535], so 18 bytes
7512 would be enough, the compiler might not know that. To avoid a
7513 -Wformat-truncation false positive, use a larger buffer size. */
7514 char buf[23];
7515 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7516 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7517 switch_to_section (s);
7518 assemble_align (POINTER_SIZE);
7519 assemble_aligned_integer (POINTER_BYTES, symbol);
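/* Output the assembly for a casesi dispatch sequence: load the jump
   table entry selected by OPERANDS[1] (with the load width taken from
   the mode of the ADDR_DIFF_VEC) and branch to it.  */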
7523 const char*
7524 aarch64_output_casesi (rtx *operands)
7526 char buf[100];
7527 char label[100];
7528 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7529 int index;
7530 static const char *const patterns[4][2] =
7533 "ldrb\t%w3, [%0,%w1,uxtw]",
7534 "add\t%3, %4, %w3, sxtb #2"
7537 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7538 "add\t%3, %4, %w3, sxth #2"
7541 "ldr\t%w3, [%0,%w1,uxtw #2]",
7542 "add\t%3, %4, %w3, sxtw #2"
7544 /* We assume that DImode is only generated when not optimizing and
7545 that we don't really need 64-bit address offsets. That would
7546 imply an object file with 8GB of code in a single function! */
7548 "ldr\t%w3, [%0,%w1,uxtw #2]",
7549 "add\t%3, %4, %w3, sxtw #2"
7553 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7555 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7556 index = exact_log2 (GET_MODE_SIZE (mode));
7558 gcc_assert (index >= 0 && index <= 3);
7560 /* Need to implement table size reduction, by changing the code below. */
7561 output_asm_insn (patterns[index][0], operands);
7562 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7563 snprintf (buf, sizeof (buf),
7564 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7565 output_asm_insn (buf, operands);
7566 output_asm_insn (patterns[index][1], operands);
7567 output_asm_insn ("br\t%3", operands);
7568 assemble_label (asm_out_file, label);
7569 return "";
7573 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7574 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7575 operator. */
7577 int
7578 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7580 if (shift >= 0 && shift <= 3)
7582 int size;
7583 for (size = 8; size <= 32; size *= 2)
7585 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7586 if (mask == bits << shift)
7587 return size;
7590 return 0;
7593 /* Constant pools are per-function only when PC-relative
7594 literal loads are enabled or we are using the large memory
7595 model. */
7597 static inline bool
7598 aarch64_can_use_per_function_literal_pools_p (void)
7600 return (aarch64_pcrelative_literal_loads
7601 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7604 static bool
7605 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7607 /* FIXME: In an ideal world this would work similarly
7608 to the logic in aarch64_select_rtx_section, but that
7609 breaks bootstrap in gccgo. For now we work around
7610 this by returning false here. */
7611 return false;
7614 /* Select appropriate section for constants depending
7615 on where we place literal pools. */
7617 static section *
7618 aarch64_select_rtx_section (machine_mode mode,
7619 rtx x,
7620 unsigned HOST_WIDE_INT align)
7622 if (aarch64_can_use_per_function_literal_pools_p ())
7623 return function_section (current_function_decl);
7625 return default_elf_select_rtx_section (mode, x, align);
7628 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7629 void
7630 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7631 HOST_WIDE_INT offset)
7633 /* When using per-function literal pools, we must ensure that any code
7634 section is aligned to the minimal instruction length, lest we get
7635 errors from the assembler about "unaligned instructions". */
7636 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7637 ASM_OUTPUT_ALIGN (f, 2);
7640 /* Costs. */
7642 /* Helper function for rtx cost calculation. Strip a shift expression
7643 from X. Returns the inner operand if successful, or the original
7644 expression on failure. */
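/* For example, both (ashift:DI (reg:DI x) (const_int 3)) and
   (mult:DI (reg:DI x) (const_int 8)) strip down to (reg:DI x). */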
7645 static rtx
7646 aarch64_strip_shift (rtx x)
7648 rtx op = x;
7650 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7651 we can convert both to ROR during final output. */
7652 if ((GET_CODE (op) == ASHIFT
7653 || GET_CODE (op) == ASHIFTRT
7654 || GET_CODE (op) == LSHIFTRT
7655 || GET_CODE (op) == ROTATERT
7656 || GET_CODE (op) == ROTATE)
7657 && CONST_INT_P (XEXP (op, 1)))
7658 return XEXP (op, 0);
7660 if (GET_CODE (op) == MULT
7661 && CONST_INT_P (XEXP (op, 1))
7662 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7663 return XEXP (op, 0);
7665 return x;
7668 /* Helper function for rtx cost calculation. Strip an extend
7669 expression from X. Returns the inner operand if successful, or the
7670 original expression on failure. We deal with a number of possible
7671 canonicalization variations here. If STRIP_SHIFT is true, then
7672 we can strip off a shift also. */
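/* For example, (zero_extend:DI (reg:SI x)) strips to (reg:SI x), and the
   AND form (and:DI (mult:DI (reg:DI x) (const_int 4)) (const_int 0x3fc)),
   which represents a zero-extended byte scaled by 4, also strips to
   (reg:DI x). */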
7673 static rtx
7674 aarch64_strip_extend (rtx x, bool strip_shift)
7676 scalar_int_mode mode;
7677 rtx op = x;
7679 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7680 return op;
7682 /* Zero and sign extraction of a widened value. */
7683 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7684 && XEXP (op, 2) == const0_rtx
7685 && GET_CODE (XEXP (op, 0)) == MULT
7686 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7687 XEXP (op, 1)))
7688 return XEXP (XEXP (op, 0), 0);
7690 /* It can also be represented (for zero-extend) as an AND with an
7691 immediate. */
7692 if (GET_CODE (op) == AND
7693 && GET_CODE (XEXP (op, 0)) == MULT
7694 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7695 && CONST_INT_P (XEXP (op, 1))
7696 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7697 INTVAL (XEXP (op, 1))) != 0)
7698 return XEXP (XEXP (op, 0), 0);
7700 /* Now handle extended register, as this may also have an optional
7701 left shift by 1..4. */
7702 if (strip_shift
7703 && GET_CODE (op) == ASHIFT
7704 && CONST_INT_P (XEXP (op, 1))
7705 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7706 op = XEXP (op, 0);
7708 if (GET_CODE (op) == ZERO_EXTEND
7709 || GET_CODE (op) == SIGN_EXTEND)
7710 op = XEXP (op, 0);
7712 if (op != x)
7713 return op;
7715 return x;
7718 /* Return true iff CODE is a shift supported in combination
7719 with arithmetic instructions. */
7721 static bool
7722 aarch64_shift_p (enum rtx_code code)
7724 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7728 /* Return true iff X is a cheap shift without a sign extend. */
7730 static bool
7731 aarch64_cheap_mult_shift_p (rtx x)
7733 rtx op0, op1;
7735 op0 = XEXP (x, 0);
7736 op1 = XEXP (x, 1);
7738 if (!(aarch64_tune_params.extra_tuning_flags
7739 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7740 return false;
7742 if (GET_CODE (op0) == SIGN_EXTEND)
7743 return false;
7745 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7746 && UINTVAL (op1) <= 4)
7747 return true;
7749 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7750 return false;
7752 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7754 if (l2 > 0 && l2 <= 4)
7755 return true;
7757 return false;
7760 /* Helper function for rtx cost calculation. Calculate the cost of
7761 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7762 Return the calculated cost of the expression, recursing manually in to
7763 operands where needed. */
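/* For example, for (plus (mult (reg x) (const_int 4)) (reg y)) this is
   called on the MULT with OUTER == PLUS; for an integer mode and SPEED
   costing it adds the ADD-with-LSL cost (alu.arith_shift, unless the
   tuning marks such shifts as cheap) plus the cost of the multiplicand. */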
7765 static int
7766 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7768 rtx op0, op1;
7769 const struct cpu_cost_table *extra_cost
7770 = aarch64_tune_params.insn_extra_cost;
7771 int cost = 0;
7772 bool compound_p = (outer == PLUS || outer == MINUS);
7773 machine_mode mode = GET_MODE (x);
7775 gcc_checking_assert (code == MULT);
7777 op0 = XEXP (x, 0);
7778 op1 = XEXP (x, 1);
7780 if (VECTOR_MODE_P (mode))
7781 mode = GET_MODE_INNER (mode);
7783 /* Integer multiply/fma. */
7784 if (GET_MODE_CLASS (mode) == MODE_INT)
7786 /* The multiply will be canonicalized as a shift, so cost it as such. */
7787 if (aarch64_shift_p (GET_CODE (x))
7788 || (CONST_INT_P (op1)
7789 && exact_log2 (INTVAL (op1)) > 0))
7791 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7792 || GET_CODE (op0) == SIGN_EXTEND;
7793 if (speed)
7795 if (compound_p)
7797 /* If the shift is considered cheap,
7798 then don't add any cost. */
7799 if (aarch64_cheap_mult_shift_p (x))
7801 else if (REG_P (op1))
7802 /* ARITH + shift-by-register. */
7803 cost += extra_cost->alu.arith_shift_reg;
7804 else if (is_extend)
7805 /* ARITH + extended register. We don't have a cost field
7806 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7807 cost += extra_cost->alu.extend_arith;
7808 else
7809 /* ARITH + shift-by-immediate. */
7810 cost += extra_cost->alu.arith_shift;
7812 else
7813 /* LSL (immediate). */
7814 cost += extra_cost->alu.shift;
7817 /* Strip extends as we will have costed them in the case above. */
7818 if (is_extend)
7819 op0 = aarch64_strip_extend (op0, true);
7821 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7823 return cost;
7826 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7827 compound and let the below cases handle it. After all, MNEG is a
7828 special-case alias of MSUB. */
7829 if (GET_CODE (op0) == NEG)
7831 op0 = XEXP (op0, 0);
7832 compound_p = true;
7835 /* Integer multiplies or FMAs have zero/sign extending variants. */
7836 if ((GET_CODE (op0) == ZERO_EXTEND
7837 && GET_CODE (op1) == ZERO_EXTEND)
7838 || (GET_CODE (op0) == SIGN_EXTEND
7839 && GET_CODE (op1) == SIGN_EXTEND))
7841 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7842 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7844 if (speed)
7846 if (compound_p)
7847 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7848 cost += extra_cost->mult[0].extend_add;
7849 else
7850 /* MUL/SMULL/UMULL. */
7851 cost += extra_cost->mult[0].extend;
7854 return cost;
7857 /* This is either an integer multiply or a MADD. In both cases
7858 we want to recurse and cost the operands. */
7859 cost += rtx_cost (op0, mode, MULT, 0, speed);
7860 cost += rtx_cost (op1, mode, MULT, 1, speed);
7862 if (speed)
7864 if (compound_p)
7865 /* MADD/MSUB. */
7866 cost += extra_cost->mult[mode == DImode].add;
7867 else
7868 /* MUL. */
7869 cost += extra_cost->mult[mode == DImode].simple;
7872 return cost;
7874 else
7876 if (speed)
7878 /* Floating-point FMA/FMUL can also support negations of the
7879 operands, unless the rounding mode is upward or downward in
7880 which case FNMUL is different from FMUL with operand negation. */
7881 bool neg0 = GET_CODE (op0) == NEG;
7882 bool neg1 = GET_CODE (op1) == NEG;
7883 if (compound_p || !flag_rounding_math || (neg0 && neg1))
7885 if (neg0)
7886 op0 = XEXP (op0, 0);
7887 if (neg1)
7888 op1 = XEXP (op1, 0);
7891 if (compound_p)
7892 /* FMADD/FNMADD/FNMSUB/FMSUB. */
7893 cost += extra_cost->fp[mode == DFmode].fma;
7894 else
7895 /* FMUL/FNMUL. */
7896 cost += extra_cost->fp[mode == DFmode].mult;
7899 cost += rtx_cost (op0, mode, MULT, 0, speed);
7900 cost += rtx_cost (op1, mode, MULT, 1, speed);
7901 return cost;
7905 static int
7906 aarch64_address_cost (rtx x,
7907 machine_mode mode,
7908 addr_space_t as ATTRIBUTE_UNUSED,
7909 bool speed)
7911 enum rtx_code c = GET_CODE (x);
7912 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
7913 struct aarch64_address_info info;
7914 int cost = 0;
7915 info.shift = 0;
7917 if (!aarch64_classify_address (&info, x, mode, false))
7919 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
7921 /* This is a CONST or SYMBOL ref which will be split
7922 in a different way depending on the code model in use.
7923 Cost it through the generic infrastructure. */
7924 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
7925 /* Divide through by the cost of one instruction to
7926 bring it to the same units as the address costs. */
7927 cost_symbol_ref /= COSTS_N_INSNS (1);
7928 /* The cost is then the cost of preparing the address,
7929 followed by an immediate (possibly 0) offset. */
7930 return cost_symbol_ref + addr_cost->imm_offset;
7932 else
7934 /* This is most likely a jump table from a case
7935 statement. */
7936 return addr_cost->register_offset;
7940 switch (info.type)
7942 case ADDRESS_LO_SUM:
7943 case ADDRESS_SYMBOLIC:
7944 case ADDRESS_REG_IMM:
7945 cost += addr_cost->imm_offset;
7946 break;
7948 case ADDRESS_REG_WB:
7949 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
7950 cost += addr_cost->pre_modify;
7951 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
7952 cost += addr_cost->post_modify;
7953 else
7954 gcc_unreachable ();
7956 break;
7958 case ADDRESS_REG_REG:
7959 cost += addr_cost->register_offset;
7960 break;
7962 case ADDRESS_REG_SXTW:
7963 cost += addr_cost->register_sextend;
7964 break;
7966 case ADDRESS_REG_UXTW:
7967 cost += addr_cost->register_zextend;
7968 break;
7970 default:
7971 gcc_unreachable ();
7975 if (info.shift > 0)
7977 /* For the sake of calculating the cost of the shifted register
7978 component, we can treat same sized modes in the same way. */
7979 if (known_eq (GET_MODE_BITSIZE (mode), 16))
7980 cost += addr_cost->addr_scale_costs.hi;
7981 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
7982 cost += addr_cost->addr_scale_costs.si;
7983 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
7984 cost += addr_cost->addr_scale_costs.di;
7985 else
7986 /* We can't tell, or this is a 128-bit vector. */
7987 cost += addr_cost->addr_scale_costs.ti;
7990 return cost;
7993 /* Return the cost of a branch. If SPEED_P is true then the compiler is
7994 optimizing for speed. If PREDICTABLE_P is true then the branch is expected
7995 to be well predicted. */
7998 aarch64_branch_cost (bool speed_p, bool predictable_p)
8000 /* When optimizing for speed, use the cost of unpredictable branches. */
8001 const struct cpu_branch_cost *branch_costs =
8002 aarch64_tune_params.branch_costs;
8004 if (!speed_p || predictable_p)
8005 return branch_costs->predictable;
8006 else
8007 return branch_costs->unpredictable;
8010 /* Return true if the RTX X in mode MODE is a zero or sign extract
8011 usable in an ADD or SUB (extended register) instruction. */
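/* For example, (sign_extend:DI (reg:SI x)) used as the extended operand
   of a DImode addition corresponds to ADD (extended register) with an
   SXTW operand, e.g. "add x0, x1, w2, sxtw". */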
8012 static bool
8013 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8015 /* Catch add with a sign extract.
8016 This is add_<optab><mode>_multp2. */
8017 if (GET_CODE (x) == SIGN_EXTRACT
8018 || GET_CODE (x) == ZERO_EXTRACT)
8020 rtx op0 = XEXP (x, 0);
8021 rtx op1 = XEXP (x, 1);
8022 rtx op2 = XEXP (x, 2);
8024 if (GET_CODE (op0) == MULT
8025 && CONST_INT_P (op1)
8026 && op2 == const0_rtx
8027 && CONST_INT_P (XEXP (op0, 1))
8028 && aarch64_is_extend_from_extract (mode,
8029 XEXP (op0, 1),
8030 op1))
8032 return true;
8035 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8036 No shift. */
8037 else if (GET_CODE (x) == SIGN_EXTEND
8038 || GET_CODE (x) == ZERO_EXTEND)
8039 return REG_P (XEXP (x, 0));
8041 return false;
8044 static bool
8045 aarch64_frint_unspec_p (unsigned int u)
8047 switch (u)
8049 case UNSPEC_FRINTZ:
8050 case UNSPEC_FRINTP:
8051 case UNSPEC_FRINTM:
8052 case UNSPEC_FRINTA:
8053 case UNSPEC_FRINTN:
8054 case UNSPEC_FRINTX:
8055 case UNSPEC_FRINTI:
8056 return true;
8058 default:
8059 return false;
8063 /* Return true iff X is an rtx that will match an extr instruction
8064 i.e. as described in the *extr<mode>5_insn family of patterns.
8065 OP0 and OP1 will be set to the operands of the shifts involved
8066 on success and will be NULL_RTX otherwise. */
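/* For example, (ior:DI (ashift:DI (reg:DI a) (const_int 10))
   (lshiftrt:DI (reg:DI b) (const_int 54))) matches, since the two shift
   amounts sum to 64, and can be implemented by a single EXTR of A and B. */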
8068 static bool
8069 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8071 rtx op0, op1;
8072 scalar_int_mode mode;
8073 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8074 return false;
8076 *res_op0 = NULL_RTX;
8077 *res_op1 = NULL_RTX;
8079 if (GET_CODE (x) != IOR)
8080 return false;
8082 op0 = XEXP (x, 0);
8083 op1 = XEXP (x, 1);
8085 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8086 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8088 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8089 if (GET_CODE (op1) == ASHIFT)
8090 std::swap (op0, op1);
8092 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8093 return false;
8095 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8096 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8098 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8099 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8101 *res_op0 = XEXP (op0, 0);
8102 *res_op1 = XEXP (op1, 0);
8103 return true;
8107 return false;
8110 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8111 storing it in *COST. Result is true if the total cost of the operation
8112 has now been calculated. */
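/* For example, a branch on (ne (reg x) (const_int 0)) is costed as a
   CBZ/CBNZ, a branch on a single-bit ZERO_EXTRACT or on
   (lt (reg x) (const_int 0)) as a TBZ/TBNZ, and value-selecting forms as
   CSEL, whose CSINC/CSINV/CSNEG aliases are treated as free. */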
8113 static bool
8114 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8116 rtx inner;
8117 rtx comparator;
8118 enum rtx_code cmpcode;
8120 if (COMPARISON_P (op0))
8122 inner = XEXP (op0, 0);
8123 comparator = XEXP (op0, 1);
8124 cmpcode = GET_CODE (op0);
8126 else
8128 inner = op0;
8129 comparator = const0_rtx;
8130 cmpcode = NE;
8133 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8135 /* Conditional branch. */
8136 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8137 return true;
8138 else
8140 if (cmpcode == NE || cmpcode == EQ)
8142 if (comparator == const0_rtx)
8144 /* TBZ/TBNZ/CBZ/CBNZ. */
8145 if (GET_CODE (inner) == ZERO_EXTRACT)
8146 /* TBZ/TBNZ. */
8147 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8148 ZERO_EXTRACT, 0, speed);
8149 else
8150 /* CBZ/CBNZ. */
8151 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8153 return true;
8156 else if (cmpcode == LT || cmpcode == GE)
8158 /* TBZ/TBNZ. */
8159 if (comparator == const0_rtx)
8160 return true;
8164 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8166 /* CCMP. */
8167 if (GET_CODE (op1) == COMPARE)
8169 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8170 if (XEXP (op1, 1) == const0_rtx)
8171 *cost += 1;
8172 if (speed)
8174 machine_mode mode = GET_MODE (XEXP (op1, 0));
8175 const struct cpu_cost_table *extra_cost
8176 = aarch64_tune_params.insn_extra_cost;
8178 if (GET_MODE_CLASS (mode) == MODE_INT)
8179 *cost += extra_cost->alu.arith;
8180 else
8181 *cost += extra_cost->fp[mode == DFmode].compare;
8183 return true;
8186 /* It's a conditional operation based on the status flags,
8187 so it must be some flavor of CSEL. */
8189 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8190 if (GET_CODE (op1) == NEG
8191 || GET_CODE (op1) == NOT
8192 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8193 op1 = XEXP (op1, 0);
8194 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8196 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8197 op1 = XEXP (op1, 0);
8198 op2 = XEXP (op2, 0);
8201 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8202 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8203 return true;
8206 /* We don't know what this is, cost all operands. */
8207 return false;
8210 /* Check whether X is a bitfield operation of the form shift + extend that
8211 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8212 operand to which the bitfield operation is applied. Otherwise return
8213 NULL_RTX. */
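/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   roughly corresponds to a UBFX of X, while (sign_extend:SI (ashift:QI
   (reg:QI x) (const_int 2))) roughly corresponds to an SBFIZ; in both
   cases X is returned. */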
8215 static rtx
8216 aarch64_extend_bitfield_pattern_p (rtx x)
8218 rtx_code outer_code = GET_CODE (x);
8219 machine_mode outer_mode = GET_MODE (x);
8221 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8222 && outer_mode != SImode && outer_mode != DImode)
8223 return NULL_RTX;
8225 rtx inner = XEXP (x, 0);
8226 rtx_code inner_code = GET_CODE (inner);
8227 machine_mode inner_mode = GET_MODE (inner);
8228 rtx op = NULL_RTX;
8230 switch (inner_code)
8232 case ASHIFT:
8233 if (CONST_INT_P (XEXP (inner, 1))
8234 && (inner_mode == QImode || inner_mode == HImode))
8235 op = XEXP (inner, 0);
8236 break;
8237 case LSHIFTRT:
8238 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8239 && (inner_mode == QImode || inner_mode == HImode))
8240 op = XEXP (inner, 0);
8241 break;
8242 case ASHIFTRT:
8243 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8244 && (inner_mode == QImode || inner_mode == HImode))
8245 op = XEXP (inner, 0);
8246 break;
8247 default:
8248 break;
8251 return op;
8254 /* Return true if the mask and a shift amount from an RTX of the form
8255 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8256 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
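/* For example, in SImode a shift amount of 8 and a mask of 0xff00 are
   valid: (x << 8) & 0xff00 places the low byte of X at bit 8, which is
   UBFIZ w0, w1, #8, #8. */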
8258 bool
8259 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8260 rtx shft_amnt)
8262 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8263 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8264 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8265 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8268 /* Calculate the cost of calculating X, storing it in *COST. Result
8269 is true if the total cost of the operation has now been calculated. */
8270 static bool
8271 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8272 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8274 rtx op0, op1, op2;
8275 const struct cpu_cost_table *extra_cost
8276 = aarch64_tune_params.insn_extra_cost;
8277 int code = GET_CODE (x);
8278 scalar_int_mode int_mode;
8280 /* By default, assume that everything has equivalent cost to the
8281 cheapest instruction. Any additional costs are applied as a delta
8282 above this default. */
8283 *cost = COSTS_N_INSNS (1);
8285 switch (code)
8287 case SET:
8288 /* The cost depends entirely on the operands to SET. */
8289 *cost = 0;
8290 op0 = SET_DEST (x);
8291 op1 = SET_SRC (x);
8293 switch (GET_CODE (op0))
8295 case MEM:
8296 if (speed)
8298 rtx address = XEXP (op0, 0);
8299 if (VECTOR_MODE_P (mode))
8300 *cost += extra_cost->ldst.storev;
8301 else if (GET_MODE_CLASS (mode) == MODE_INT)
8302 *cost += extra_cost->ldst.store;
8303 else if (mode == SFmode)
8304 *cost += extra_cost->ldst.storef;
8305 else if (mode == DFmode)
8306 *cost += extra_cost->ldst.stored;
8308 *cost +=
8309 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8310 0, speed));
8313 *cost += rtx_cost (op1, mode, SET, 1, speed);
8314 return true;
8316 case SUBREG:
8317 if (! REG_P (SUBREG_REG (op0)))
8318 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8320 /* Fall through. */
8321 case REG:
8322 /* The cost is one per vector-register copied. */
8323 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8325 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8326 *cost = COSTS_N_INSNS (nregs);
8328 /* const0_rtx is in general free, but we will use an
8329 instruction to set a register to 0. */
8330 else if (REG_P (op1) || op1 == const0_rtx)
8332 /* The cost is 1 per register copied. */
8333 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8334 *cost = COSTS_N_INSNS (nregs);
8336 else
8337 /* Cost is just the cost of the RHS of the set. */
8338 *cost += rtx_cost (op1, mode, SET, 1, speed);
8339 return true;
8341 case ZERO_EXTRACT:
8342 case SIGN_EXTRACT:
8343 /* Bit-field insertion. Strip any redundant widening of
8344 the RHS to meet the width of the target. */
8345 if (GET_CODE (op1) == SUBREG)
8346 op1 = SUBREG_REG (op1);
8347 if ((GET_CODE (op1) == ZERO_EXTEND
8348 || GET_CODE (op1) == SIGN_EXTEND)
8349 && CONST_INT_P (XEXP (op0, 1))
8350 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8351 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8352 op1 = XEXP (op1, 0);
8354 if (CONST_INT_P (op1))
8356 /* MOV immediate is assumed to always be cheap. */
8357 *cost = COSTS_N_INSNS (1);
8359 else
8361 /* BFM. */
8362 if (speed)
8363 *cost += extra_cost->alu.bfi;
8364 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8367 return true;
8369 default:
8370 /* We can't make sense of this, assume default cost. */
8371 *cost = COSTS_N_INSNS (1);
8372 return false;
8374 return false;
8376 case CONST_INT:
8377 /* If an instruction can incorporate a constant within the
8378 instruction, the instruction's expression avoids calling
8379 rtx_cost() on the constant. If rtx_cost() is called on a
8380 constant, then it is usually because the constant must be
8381 moved into a register by one or more instructions.
8383 The exception is constant 0, which can be expressed
8384 as XZR/WZR and is therefore free. The one case that is not free is
8385 if we have (set (reg) (const0_rtx)), in which case we must cost
8386 the move. However, we can catch that when we cost the SET, so
8387 we don't need to consider that here. */
8388 if (x == const0_rtx)
8389 *cost = 0;
8390 else
8392 /* To an approximation, the cost of building any other constant
8393 is proportional to the number of instructions required to
8394 build that constant. This is true whether we
8395 are compiling for SPEED or otherwise. */
8396 if (!is_a <scalar_int_mode> (mode, &int_mode))
8397 int_mode = word_mode;
8398 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8399 (NULL_RTX, x, false, int_mode));
8401 return true;
8403 case CONST_DOUBLE:
8405 /* First determine the number of instructions to do the move
8406 as an integer constant. */
8407 if (!aarch64_float_const_representable_p (x)
8408 && !aarch64_can_const_movi_rtx_p (x, mode)
8409 && aarch64_float_const_rtx_p (x))
8411 unsigned HOST_WIDE_INT ival;
8412 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8413 gcc_assert (succeed);
8415 scalar_int_mode imode = (mode == HFmode
8416 ? SImode
8417 : int_mode_for_mode (mode).require ());
8418 int ncost = aarch64_internal_mov_immediate
8419 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8420 *cost += COSTS_N_INSNS (ncost);
8421 return true;
8424 if (speed)
8426 /* mov[df,sf]_aarch64. */
8427 if (aarch64_float_const_representable_p (x))
8428 /* FMOV (scalar immediate). */
8429 *cost += extra_cost->fp[mode == DFmode].fpconst;
8430 else if (!aarch64_float_const_zero_rtx_p (x))
8432 /* This will be a load from memory. */
8433 if (mode == DFmode)
8434 *cost += extra_cost->ldst.loadd;
8435 else
8436 *cost += extra_cost->ldst.loadf;
8438 else
8439 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8440 or MOV v0.s[0], wzr - neither of which is modeled by the
8441 cost tables. Just use the default cost. */
8446 return true;
8448 case MEM:
8449 if (speed)
8451 /* For loads we want the base cost of a load, plus an
8452 approximation for the additional cost of the addressing
8453 mode. */
8454 rtx address = XEXP (x, 0);
8455 if (VECTOR_MODE_P (mode))
8456 *cost += extra_cost->ldst.loadv;
8457 else if (GET_MODE_CLASS (mode) == MODE_INT)
8458 *cost += extra_cost->ldst.load;
8459 else if (mode == SFmode)
8460 *cost += extra_cost->ldst.loadf;
8461 else if (mode == DFmode)
8462 *cost += extra_cost->ldst.loadd;
8464 *cost +=
8465 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8466 0, speed));
8469 return true;
8471 case NEG:
8472 op0 = XEXP (x, 0);
8474 if (VECTOR_MODE_P (mode))
8476 if (speed)
8478 /* FNEG. */
8479 *cost += extra_cost->vect.alu;
8481 return false;
8484 if (GET_MODE_CLASS (mode) == MODE_INT)
8486 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8487 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8489 /* CSETM. */
8490 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8491 return true;
8494 /* Cost this as SUB wzr, X. */
8495 op0 = CONST0_RTX (mode);
8496 op1 = XEXP (x, 0);
8497 goto cost_minus;
8500 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8502 /* Support (neg(fma...)) as a single instruction only if
8503 sign of zeros is unimportant. This matches the decision
8504 making in aarch64.md. */
8505 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8507 /* FNMADD. */
8508 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8509 return true;
8511 if (GET_CODE (op0) == MULT)
8513 /* FNMUL. */
8514 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8515 return true;
8517 if (speed)
8518 /* FNEG. */
8519 *cost += extra_cost->fp[mode == DFmode].neg;
8520 return false;
8523 return false;
8525 case CLRSB:
8526 case CLZ:
8527 if (speed)
8529 if (VECTOR_MODE_P (mode))
8530 *cost += extra_cost->vect.alu;
8531 else
8532 *cost += extra_cost->alu.clz;
8535 return false;
8537 case COMPARE:
8538 op0 = XEXP (x, 0);
8539 op1 = XEXP (x, 1);
8541 if (op1 == const0_rtx
8542 && GET_CODE (op0) == AND)
8544 x = op0;
8545 mode = GET_MODE (op0);
8546 goto cost_logic;
8549 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8551 /* TODO: A write to the CC flags possibly costs extra; this
8552 needs encoding in the cost tables. */
8554 mode = GET_MODE (op0);
8555 /* ANDS. */
8556 if (GET_CODE (op0) == AND)
8558 x = op0;
8559 goto cost_logic;
8562 if (GET_CODE (op0) == PLUS)
8564 /* ADDS (and CMN alias). */
8565 x = op0;
8566 goto cost_plus;
8569 if (GET_CODE (op0) == MINUS)
8571 /* SUBS. */
8572 x = op0;
8573 goto cost_minus;
8576 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8577 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8578 && CONST_INT_P (XEXP (op0, 2)))
8580 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8581 Handle it here directly rather than going to cost_logic
8582 since we know the immediate generated for the TST is valid
8583 so we can avoid creating an intermediate rtx for it only
8584 for costing purposes. */
8585 if (speed)
8586 *cost += extra_cost->alu.logical;
8588 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8589 ZERO_EXTRACT, 0, speed);
8590 return true;
8593 if (GET_CODE (op1) == NEG)
8595 /* CMN. */
8596 if (speed)
8597 *cost += extra_cost->alu.arith;
8599 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8600 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8601 return true;
8604 /* CMP.
8606 Compare can freely swap the order of operands, and
8607 canonicalization puts the more complex operation first.
8608 But the integer MINUS logic expects the shift/extend
8609 operation in op1. */
8610 if (! (REG_P (op0)
8611 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8613 op0 = XEXP (x, 1);
8614 op1 = XEXP (x, 0);
8616 goto cost_minus;
8619 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8621 /* FCMP. */
8622 if (speed)
8623 *cost += extra_cost->fp[mode == DFmode].compare;
8625 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8627 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8628 /* FCMP supports constant 0.0 for no extra cost. */
8629 return true;
8631 return false;
8634 if (VECTOR_MODE_P (mode))
8636 /* Vector compare. */
8637 if (speed)
8638 *cost += extra_cost->vect.alu;
8640 if (aarch64_float_const_zero_rtx_p (op1))
8642 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8643 cost. */
8644 return true;
8646 return false;
8648 return false;
8650 case MINUS:
8652 op0 = XEXP (x, 0);
8653 op1 = XEXP (x, 1);
8655 cost_minus:
8656 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8658 /* Detect valid immediates. */
8659 if ((GET_MODE_CLASS (mode) == MODE_INT
8660 || (GET_MODE_CLASS (mode) == MODE_CC
8661 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8662 && CONST_INT_P (op1)
8663 && aarch64_uimm12_shift (INTVAL (op1)))
8665 if (speed)
8666 /* SUB(S) (immediate). */
8667 *cost += extra_cost->alu.arith;
8668 return true;
8671 /* Look for SUB (extended register). */
8672 if (is_a <scalar_int_mode> (mode, &int_mode)
8673 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8675 if (speed)
8676 *cost += extra_cost->alu.extend_arith;
8678 op1 = aarch64_strip_extend (op1, true);
8679 *cost += rtx_cost (op1, VOIDmode,
8680 (enum rtx_code) GET_CODE (op1), 0, speed);
8681 return true;
8684 rtx new_op1 = aarch64_strip_extend (op1, false);
8686 /* Cost this as an FMA-alike operation. */
8687 if ((GET_CODE (new_op1) == MULT
8688 || aarch64_shift_p (GET_CODE (new_op1)))
8689 && code != COMPARE)
8691 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8692 (enum rtx_code) code,
8693 speed);
8694 return true;
8697 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8699 if (speed)
8701 if (VECTOR_MODE_P (mode))
8703 /* Vector SUB. */
8704 *cost += extra_cost->vect.alu;
8706 else if (GET_MODE_CLASS (mode) == MODE_INT)
8708 /* SUB(S). */
8709 *cost += extra_cost->alu.arith;
8711 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8713 /* FSUB. */
8714 *cost += extra_cost->fp[mode == DFmode].addsub;
8717 return true;
8720 case PLUS:
8722 rtx new_op0;
8724 op0 = XEXP (x, 0);
8725 op1 = XEXP (x, 1);
8727 cost_plus:
8728 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8729 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8731 /* CSINC. */
8732 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8733 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8734 return true;
8737 if (GET_MODE_CLASS (mode) == MODE_INT
8738 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8739 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8741 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8743 if (speed)
8744 /* ADD (immediate). */
8745 *cost += extra_cost->alu.arith;
8746 return true;
8749 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8751 /* Look for ADD (extended register). */
8752 if (is_a <scalar_int_mode> (mode, &int_mode)
8753 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8755 if (speed)
8756 *cost += extra_cost->alu.extend_arith;
8758 op0 = aarch64_strip_extend (op0, true);
8759 *cost += rtx_cost (op0, VOIDmode,
8760 (enum rtx_code) GET_CODE (op0), 0, speed);
8761 return true;
8764 /* Strip any extend, leaving shifts behind, as we will
8765 cost them through mult_cost. */
8766 new_op0 = aarch64_strip_extend (op0, false);
8768 if (GET_CODE (new_op0) == MULT
8769 || aarch64_shift_p (GET_CODE (new_op0)))
8771 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8772 speed);
8773 return true;
8776 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8778 if (speed)
8780 if (VECTOR_MODE_P (mode))
8782 /* Vector ADD. */
8783 *cost += extra_cost->vect.alu;
8785 else if (GET_MODE_CLASS (mode) == MODE_INT)
8787 /* ADD. */
8788 *cost += extra_cost->alu.arith;
8790 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8792 /* FADD. */
8793 *cost += extra_cost->fp[mode == DFmode].addsub;
8796 return true;
8799 case BSWAP:
8800 *cost = COSTS_N_INSNS (1);
8802 if (speed)
8804 if (VECTOR_MODE_P (mode))
8805 *cost += extra_cost->vect.alu;
8806 else
8807 *cost += extra_cost->alu.rev;
8809 return false;
8811 case IOR:
8812 if (aarch_rev16_p (x))
8814 *cost = COSTS_N_INSNS (1);
8816 if (speed)
8818 if (VECTOR_MODE_P (mode))
8819 *cost += extra_cost->vect.alu;
8820 else
8821 *cost += extra_cost->alu.rev;
8823 return true;
8826 if (aarch64_extr_rtx_p (x, &op0, &op1))
8828 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8829 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8830 if (speed)
8831 *cost += extra_cost->alu.shift;
8833 return true;
8835 /* Fall through. */
8836 case XOR:
8837 case AND:
8838 cost_logic:
8839 op0 = XEXP (x, 0);
8840 op1 = XEXP (x, 1);
8842 if (VECTOR_MODE_P (mode))
8844 if (speed)
8845 *cost += extra_cost->vect.alu;
8846 return true;
8849 if (code == AND
8850 && GET_CODE (op0) == MULT
8851 && CONST_INT_P (XEXP (op0, 1))
8852 && CONST_INT_P (op1)
8853 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8854 INTVAL (op1)) != 0)
8856 /* This is a UBFM/SBFM. */
8857 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8858 if (speed)
8859 *cost += extra_cost->alu.bfx;
8860 return true;
8863 if (is_int_mode (mode, &int_mode))
8865 if (CONST_INT_P (op1))
8867 /* We have a mask + shift version of a UBFIZ
8868 i.e. the *andim_ashift<mode>_bfiz pattern. */
8869 if (GET_CODE (op0) == ASHIFT
8870 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
8871 XEXP (op0, 1)))
8873 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8874 (enum rtx_code) code, 0, speed);
8875 if (speed)
8876 *cost += extra_cost->alu.bfx;
8878 return true;
8880 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8882 /* We possibly get the immediate for free; this is not
8883 modelled. */
8884 *cost += rtx_cost (op0, int_mode,
8885 (enum rtx_code) code, 0, speed);
8886 if (speed)
8887 *cost += extra_cost->alu.logical;
8889 return true;
8892 else
8894 rtx new_op0 = op0;
8896 /* Handle ORN, EON, or BIC. */
8897 if (GET_CODE (op0) == NOT)
8898 op0 = XEXP (op0, 0);
8900 new_op0 = aarch64_strip_shift (op0);
8902 /* If we had a shift on op0 then this is a logical-shift-
8903 by-register/immediate operation. Otherwise, this is just
8904 a logical operation. */
8905 if (speed)
8907 if (new_op0 != op0)
8909 /* Shift by immediate. */
8910 if (CONST_INT_P (XEXP (op0, 1)))
8911 *cost += extra_cost->alu.log_shift;
8912 else
8913 *cost += extra_cost->alu.log_shift_reg;
8915 else
8916 *cost += extra_cost->alu.logical;
8919 /* In both cases we want to cost both operands. */
8920 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
8921 0, speed);
8922 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
8923 1, speed);
8925 return true;
8928 return false;
8930 case NOT:
8931 x = XEXP (x, 0);
8932 op0 = aarch64_strip_shift (x);
8934 if (VECTOR_MODE_P (mode))
8936 /* Vector NOT. */
8937 *cost += extra_cost->vect.alu;
8938 return false;
8941 /* MVN-shifted-reg. */
8942 if (op0 != x)
8944 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
8946 if (speed)
8947 *cost += extra_cost->alu.log_shift;
8949 return true;
8951 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
8952 Handle the second form here taking care that 'a' in the above can
8953 be a shift. */
8954 else if (GET_CODE (op0) == XOR)
8956 rtx newop0 = XEXP (op0, 0);
8957 rtx newop1 = XEXP (op0, 1);
8958 rtx op0_stripped = aarch64_strip_shift (newop0);
8960 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
8961 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
8963 if (speed)
8965 if (op0_stripped != newop0)
8966 *cost += extra_cost->alu.log_shift;
8967 else
8968 *cost += extra_cost->alu.logical;
8971 return true;
8973 /* MVN. */
8974 if (speed)
8975 *cost += extra_cost->alu.logical;
8977 return false;
8979 case ZERO_EXTEND:
8981 op0 = XEXP (x, 0);
8982 /* If a value is written in SI mode, then zero extended to DI
8983 mode, the operation will in general be free as a write to
8984 a 'w' register implicitly zeroes the upper bits of an 'x'
8985 register. However, if this is
8987 (set (reg) (zero_extend (reg)))
8989 we must cost the explicit register move. */
8990 if (mode == DImode
8991 && GET_MODE (op0) == SImode
8992 && outer == SET)
8994 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
8996 /* If OP_COST is non-zero, then the cost of the zero extend
8997 is effectively the cost of the inner operation. Otherwise
8998 we have a MOV instruction and we take the cost from the MOV
8999 itself. This is true independently of whether we are
9000 optimizing for space or time. */
9001 if (op_cost)
9002 *cost = op_cost;
9004 return true;
9006 else if (MEM_P (op0))
9008 /* All loads can zero extend to any size for free. */
9009 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9010 return true;
9013 op0 = aarch64_extend_bitfield_pattern_p (x);
9014 if (op0)
9016 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9017 if (speed)
9018 *cost += extra_cost->alu.bfx;
9019 return true;
9022 if (speed)
9024 if (VECTOR_MODE_P (mode))
9026 /* UMOV. */
9027 *cost += extra_cost->vect.alu;
9029 else
9031 /* We generate an AND instead of UXTB/UXTH. */
9032 *cost += extra_cost->alu.logical;
9035 return false;
9037 case SIGN_EXTEND:
9038 if (MEM_P (XEXP (x, 0)))
9040 /* LDRSH. */
9041 if (speed)
9043 rtx address = XEXP (XEXP (x, 0), 0);
9044 *cost += extra_cost->ldst.load_sign_extend;
9046 *cost +=
9047 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9048 0, speed));
9050 return true;
9053 op0 = aarch64_extend_bitfield_pattern_p (x);
9054 if (op0)
9056 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9057 if (speed)
9058 *cost += extra_cost->alu.bfx;
9059 return true;
9062 if (speed)
9064 if (VECTOR_MODE_P (mode))
9065 *cost += extra_cost->vect.alu;
9066 else
9067 *cost += extra_cost->alu.extend;
9069 return false;
9071 case ASHIFT:
9072 op0 = XEXP (x, 0);
9073 op1 = XEXP (x, 1);
9075 if (CONST_INT_P (op1))
9077 if (speed)
9079 if (VECTOR_MODE_P (mode))
9081 /* Vector shift (immediate). */
9082 *cost += extra_cost->vect.alu;
9084 else
9086 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9087 aliases. */
9088 *cost += extra_cost->alu.shift;
9092 /* We can incorporate zero/sign extend for free. */
9093 if (GET_CODE (op0) == ZERO_EXTEND
9094 || GET_CODE (op0) == SIGN_EXTEND)
9095 op0 = XEXP (op0, 0);
9097 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9098 return true;
9100 else
9102 if (VECTOR_MODE_P (mode))
9104 if (speed)
9105 /* Vector shift (register). */
9106 *cost += extra_cost->vect.alu;
9108 else
9110 if (speed)
9111 /* LSLV. */
9112 *cost += extra_cost->alu.shift_reg;
9114 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9115 && CONST_INT_P (XEXP (op1, 1))
9116 && known_eq (INTVAL (XEXP (op1, 1)),
9117 GET_MODE_BITSIZE (mode) - 1))
9119 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9120 /* We already demanded XEXP (op1, 0) to be REG_P, so
9121 don't recurse into it. */
9122 return true;
9125 return false; /* All arguments need to be in registers. */
9128 case ROTATE:
9129 case ROTATERT:
9130 case LSHIFTRT:
9131 case ASHIFTRT:
9132 op0 = XEXP (x, 0);
9133 op1 = XEXP (x, 1);
9135 if (CONST_INT_P (op1))
9137 /* ASR (immediate) and friends. */
9138 if (speed)
9140 if (VECTOR_MODE_P (mode))
9141 *cost += extra_cost->vect.alu;
9142 else
9143 *cost += extra_cost->alu.shift;
9146 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9147 return true;
9149 else
9151 if (VECTOR_MODE_P (mode))
9153 if (speed)
9154 /* Vector shift (register). */
9155 *cost += extra_cost->vect.alu;
9157 else
9159 if (speed)
9160 /* ASR (register) and friends. */
9161 *cost += extra_cost->alu.shift_reg;
9163 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9164 && CONST_INT_P (XEXP (op1, 1))
9165 && known_eq (INTVAL (XEXP (op1, 1)),
9166 GET_MODE_BITSIZE (mode) - 1))
9168 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9169 /* We already demanded XEXP (op1, 0) to be REG_P, so
9170 don't recurse into it. */
9171 return true;
9174 return false; /* All arguments need to be in registers. */
9177 case SYMBOL_REF:
9179 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9180 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9182 /* LDR. */
9183 if (speed)
9184 *cost += extra_cost->ldst.load;
9186 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9187 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9189 /* ADRP, followed by ADD. */
9190 *cost += COSTS_N_INSNS (1);
9191 if (speed)
9192 *cost += 2 * extra_cost->alu.arith;
9194 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9195 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9197 /* ADR. */
9198 if (speed)
9199 *cost += extra_cost->alu.arith;
9202 if (flag_pic)
9204 /* One extra load instruction, after accessing the GOT. */
9205 *cost += COSTS_N_INSNS (1);
9206 if (speed)
9207 *cost += extra_cost->ldst.load;
9209 return true;
9211 case HIGH:
9212 case LO_SUM:
9213 /* ADRP/ADD (immediate). */
9214 if (speed)
9215 *cost += extra_cost->alu.arith;
9216 return true;
9218 case ZERO_EXTRACT:
9219 case SIGN_EXTRACT:
9220 /* UBFX/SBFX. */
9221 if (speed)
9223 if (VECTOR_MODE_P (mode))
9224 *cost += extra_cost->vect.alu;
9225 else
9226 *cost += extra_cost->alu.bfx;
9229 /* We can trust that the immediates used will be correct (there
9230 are no by-register forms), so we need only cost op0. */
9231 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9232 return true;
9234 case MULT:
9235 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9236 /* aarch64_rtx_mult_cost always handles recursion to its
9237 operands. */
9238 return true;
9240 case MOD:
9241 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9242 ANDs and a CSNEG. Assume here that CSNEG costs the same as
9243 an unconditional negate. This case should only ever be reached through
9244 the set_smod_pow2_cheap check in expmed.c. */
9245 if (CONST_INT_P (XEXP (x, 1))
9246 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9247 && (mode == SImode || mode == DImode))
9249 /* We expand to 4 instructions. Reset the baseline. */
9250 *cost = COSTS_N_INSNS (4);
9252 if (speed)
9253 *cost += 2 * extra_cost->alu.logical
9254 + 2 * extra_cost->alu.arith;
9256 return true;
9259 /* Fall-through. */
9260 case UMOD:
9261 if (speed)
9263 /* Slightly prefer UMOD over SMOD. */
9264 if (VECTOR_MODE_P (mode))
9265 *cost += extra_cost->vect.alu;
9266 else if (GET_MODE_CLASS (mode) == MODE_INT)
9267 *cost += (extra_cost->mult[mode == DImode].add
9268 + extra_cost->mult[mode == DImode].idiv
9269 + (code == MOD ? 1 : 0));
9271 return false; /* All arguments need to be in registers. */
9273 case DIV:
9274 case UDIV:
9275 case SQRT:
9276 if (speed)
9278 if (VECTOR_MODE_P (mode))
9279 *cost += extra_cost->vect.alu;
9280 else if (GET_MODE_CLASS (mode) == MODE_INT)
9281 /* There is no integer SQRT, so only DIV and UDIV can get
9282 here. */
9283 *cost += (extra_cost->mult[mode == DImode].idiv
9284 /* Slightly prefer UDIV over SDIV. */
9285 + (code == DIV ? 1 : 0));
9286 else
9287 *cost += extra_cost->fp[mode == DFmode].div;
9289 return false; /* All arguments need to be in registers. */
9291 case IF_THEN_ELSE:
9292 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9293 XEXP (x, 2), cost, speed);
9295 case EQ:
9296 case NE:
9297 case GT:
9298 case GTU:
9299 case LT:
9300 case LTU:
9301 case GE:
9302 case GEU:
9303 case LE:
9304 case LEU:
9306 return false; /* All arguments must be in registers. */
9308 case FMA:
9309 op0 = XEXP (x, 0);
9310 op1 = XEXP (x, 1);
9311 op2 = XEXP (x, 2);
9313 if (speed)
9315 if (VECTOR_MODE_P (mode))
9316 *cost += extra_cost->vect.alu;
9317 else
9318 *cost += extra_cost->fp[mode == DFmode].fma;
9321 /* FMSUB, FNMADD, and FNMSUB are free. */
9322 if (GET_CODE (op0) == NEG)
9323 op0 = XEXP (op0, 0);
9325 if (GET_CODE (op2) == NEG)
9326 op2 = XEXP (op2, 0);
9328 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9329 and the by-element operand as operand 0. */
9330 if (GET_CODE (op1) == NEG)
9331 op1 = XEXP (op1, 0);
9333 /* Catch vector-by-element operations. The by-element operand can
9334 either be (vec_duplicate (vec_select (x))) or just
9335 (vec_select (x)), depending on whether we are multiplying by
9336 a vector or a scalar.
9338 Canonicalization is not very good in these cases: FMA4 will put the
9339 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9340 if (GET_CODE (op0) == VEC_DUPLICATE)
9341 op0 = XEXP (op0, 0);
9342 else if (GET_CODE (op1) == VEC_DUPLICATE)
9343 op1 = XEXP (op1, 0);
9345 if (GET_CODE (op0) == VEC_SELECT)
9346 op0 = XEXP (op0, 0);
9347 else if (GET_CODE (op1) == VEC_SELECT)
9348 op1 = XEXP (op1, 0);
9350 /* If the remaining parameters are not registers,
9351 get the cost to put them into registers. */
9352 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9353 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9354 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9355 return true;
9357 case FLOAT:
9358 case UNSIGNED_FLOAT:
9359 if (speed)
9360 *cost += extra_cost->fp[mode == DFmode].fromint;
9361 return false;
9363 case FLOAT_EXTEND:
9364 if (speed)
9366 if (VECTOR_MODE_P (mode))
9368 /* Vector widening conversion. */
9369 *cost += extra_cost->vect.alu;
9371 else
9372 *cost += extra_cost->fp[mode == DFmode].widen;
9374 return false;
9376 case FLOAT_TRUNCATE:
9377 if (speed)
9379 if (VECTOR_MODE_P (mode))
9381 /* Vector narrowing conversion. */
9382 *cost += extra_cost->vect.alu;
9384 else
9385 *cost += extra_cost->fp[mode == DFmode].narrow;
9387 return false;
9389 case FIX:
9390 case UNSIGNED_FIX:
9391 x = XEXP (x, 0);
9392 /* Strip the rounding part. They will all be implemented
9393 by the fcvt* family of instructions anyway. */
9394 if (GET_CODE (x) == UNSPEC)
9396 unsigned int uns_code = XINT (x, 1);
9398 if (uns_code == UNSPEC_FRINTA
9399 || uns_code == UNSPEC_FRINTM
9400 || uns_code == UNSPEC_FRINTN
9401 || uns_code == UNSPEC_FRINTP
9402 || uns_code == UNSPEC_FRINTZ)
9403 x = XVECEXP (x, 0, 0);
9406 if (speed)
9408 if (VECTOR_MODE_P (mode))
9409 *cost += extra_cost->vect.alu;
9410 else
9411 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9414 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9415 fixed-point fcvt. */
9416 if (GET_CODE (x) == MULT
9417 && ((VECTOR_MODE_P (mode)
9418 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9419 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9421 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9422 0, speed);
9423 return true;
9426 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9427 return true;
9429 case ABS:
9430 if (VECTOR_MODE_P (mode))
9432 /* ABS (vector). */
9433 if (speed)
9434 *cost += extra_cost->vect.alu;
9436 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9438 op0 = XEXP (x, 0);
9440 /* FABD, which is analogous to FADD. */
9441 if (GET_CODE (op0) == MINUS)
9443 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9444 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9445 if (speed)
9446 *cost += extra_cost->fp[mode == DFmode].addsub;
9448 return true;
9450 /* Simple FABS is analogous to FNEG. */
9451 if (speed)
9452 *cost += extra_cost->fp[mode == DFmode].neg;
9454 else
9456 /* Integer ABS will either be split to
9457 two arithmetic instructions, or will be an ABS
9458 (scalar), which we don't model. */
9459 *cost = COSTS_N_INSNS (2);
9460 if (speed)
9461 *cost += 2 * extra_cost->alu.arith;
9463 return false;
9465 case SMAX:
9466 case SMIN:
9467 if (speed)
9469 if (VECTOR_MODE_P (mode))
9470 *cost += extra_cost->vect.alu;
9471 else
9473 /* FMAXNM/FMINNM/FMAX/FMIN.
9474 TODO: This may not be accurate for all implementations, but
9475 we do not model this in the cost tables. */
9476 *cost += extra_cost->fp[mode == DFmode].addsub;
9479 return false;
9481 case UNSPEC:
9482 /* The floating point round to integer frint* instructions. */
9483 if (aarch64_frint_unspec_p (XINT (x, 1)))
9485 if (speed)
9486 *cost += extra_cost->fp[mode == DFmode].roundint;
9488 return false;
9491 if (XINT (x, 1) == UNSPEC_RBIT)
9493 if (speed)
9494 *cost += extra_cost->alu.rev;
9496 return false;
9498 break;
9500 case TRUNCATE:
9502 /* Decompose <su>muldi3_highpart. */
9503 if (/* (truncate:DI */
9504 mode == DImode
9505 /* (lshiftrt:TI */
9506 && GET_MODE (XEXP (x, 0)) == TImode
9507 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9508 /* (mult:TI */
9509 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9510 /* (ANY_EXTEND:TI (reg:DI))
9511 (ANY_EXTEND:TI (reg:DI))) */
9512 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9513 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9514 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9515 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9516 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9517 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9518 /* (const_int 64) */
9519 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9520 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9522 /* UMULH/SMULH. */
9523 if (speed)
9524 *cost += extra_cost->mult[mode == DImode].extend;
9525 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9526 mode, MULT, 0, speed);
9527 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9528 mode, MULT, 1, speed);
9529 return true;
9532 /* Fall through. */
9533 default:
9534 break;
9537 if (dump_file
9538 && flag_aarch64_verbose_cost)
9539 fprintf (dump_file,
9540 "\nFailed to cost RTX. Assuming default cost.\n");
9542 return true;
9545 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
9546 calculated for X. This cost is stored in *COST. Returns true
9547 if the total cost of X was calculated. */
9548 static bool
9549 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9550 int param, int *cost, bool speed)
9552 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9554 if (dump_file
9555 && flag_aarch64_verbose_cost)
9557 print_rtl_single (dump_file, x);
9558 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9559 speed ? "Hot" : "Cold",
9560 *cost, result ? "final" : "partial");
9563 return result;
9566 static int
9567 aarch64_register_move_cost (machine_mode mode,
9568 reg_class_t from_i, reg_class_t to_i)
9570 enum reg_class from = (enum reg_class) from_i;
9571 enum reg_class to = (enum reg_class) to_i;
9572 const struct cpu_regmove_cost *regmove_cost
9573 = aarch64_tune_params.regmove_cost;
9575 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9576 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
9577 to = GENERAL_REGS;
9579 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
9580 from = GENERAL_REGS;
9582 /* Moving between GPR and stack cost is the same as GP2GP. */
9583 if ((from == GENERAL_REGS && to == STACK_REG)
9584 || (to == GENERAL_REGS && from == STACK_REG))
9585 return regmove_cost->GP2GP;
9587 /* To/From the stack register, we move via the gprs. */
9588 if (to == STACK_REG || from == STACK_REG)
9589 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9590 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9592 if (known_eq (GET_MODE_SIZE (mode), 16))
9594 /* 128-bit operations on general registers require 2 instructions. */
9595 if (from == GENERAL_REGS && to == GENERAL_REGS)
9596 return regmove_cost->GP2GP * 2;
9597 else if (from == GENERAL_REGS)
9598 return regmove_cost->GP2FP * 2;
9599 else if (to == GENERAL_REGS)
9600 return regmove_cost->FP2GP * 2;
9602 /* When AdvSIMD instructions are disabled it is not possible to move
9603 a 128-bit value directly between Q registers. This is handled in
9604 secondary reload. A general register is used as a scratch to move
9605 the upper DI value and the lower DI value is moved directly,
9606 hence the cost is the sum of three moves. */
9607 if (! TARGET_SIMD)
9608 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9610 return regmove_cost->FP2FP;
9613 if (from == GENERAL_REGS && to == GENERAL_REGS)
9614 return regmove_cost->GP2GP;
9615 else if (from == GENERAL_REGS)
9616 return regmove_cost->GP2FP;
9617 else if (to == GENERAL_REGS)
9618 return regmove_cost->FP2GP;
9620 return regmove_cost->FP2FP;
9623 static int
9624 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9625 reg_class_t rclass ATTRIBUTE_UNUSED,
9626 bool in ATTRIBUTE_UNUSED)
9628 return aarch64_tune_params.memmov_cost;
9631 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9632 to optimize 1.0/sqrt. */
9634 static bool
9635 use_rsqrt_p (machine_mode mode)
9637 return (!flag_trapping_math
9638 && flag_unsafe_math_optimizations
9639 && ((aarch64_tune_params.approx_modes->recip_sqrt
9640 & AARCH64_APPROX_MODE (mode))
9641 || flag_mrecip_low_precision_sqrt));
9644 /* Function to decide when to use the approximate reciprocal square root
9645 builtin. */
9647 static tree
9648 aarch64_builtin_reciprocal (tree fndecl)
9650 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9652 if (!use_rsqrt_p (mode))
9653 return NULL_TREE;
9654 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9657 typedef rtx (*rsqrte_type) (rtx, rtx);
9659 /* Select reciprocal square root initial estimate insn depending on machine
9660 mode. */
9662 static rsqrte_type
9663 get_rsqrte_type (machine_mode mode)
9665 switch (mode)
9667 case E_DFmode: return gen_aarch64_rsqrtedf;
9668 case E_SFmode: return gen_aarch64_rsqrtesf;
9669 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9670 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9671 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9672 default: gcc_unreachable ();
9676 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9678 /* Select reciprocal square root series step insn depending on machine mode. */
9680 static rsqrts_type
9681 get_rsqrts_type (machine_mode mode)
9683 switch (mode)
9685 case E_DFmode: return gen_aarch64_rsqrtsdf;
9686 case E_SFmode: return gen_aarch64_rsqrtssf;
9687 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9688 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9689 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9690 default: gcc_unreachable ();
9694 /* Emit instruction sequence to compute either the approximate square root
9695 or its approximate reciprocal, depending on the flag RECP, and return
9696 whether the sequence was emitted or not. */
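/* The sequence uses FRSQRTE for an initial estimate E of 1/sqrt(SRC) and
   FRSQRTS for the Newton-Raphson steps: FRSQRTS (a, b) computes
   (3 - a * b) / 2, so each step refines E to E * (3 - SRC * E * E) / 2.
   For the square root itself the result is multiplied by SRC at the end,
   with a mask forcing the result to 0.0 when SRC is 0.0. */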
9698 bool
9699 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9701 machine_mode mode = GET_MODE (dst);
9703 if (GET_MODE_INNER (mode) == HFmode)
9705 gcc_assert (!recp);
9706 return false;
9709 if (!recp)
9711 if (!(flag_mlow_precision_sqrt
9712 || (aarch64_tune_params.approx_modes->sqrt
9713 & AARCH64_APPROX_MODE (mode))))
9714 return false;
9716 if (flag_finite_math_only
9717 || flag_trapping_math
9718 || !flag_unsafe_math_optimizations
9719 || optimize_function_for_size_p (cfun))
9720 return false;
9722 else
9723 /* Caller assumes we cannot fail. */
9724 gcc_assert (use_rsqrt_p (mode));
9726 machine_mode mmsk = mode_for_int_vector (mode).require ();
9727 rtx xmsk = gen_reg_rtx (mmsk);
9728 if (!recp)
9729 /* When calculating the approximate square root, compare the
9730 argument with 0.0 and create a mask. */
9731 emit_insn (gen_rtx_SET (xmsk,
9732 gen_rtx_NEG (mmsk,
9733 gen_rtx_EQ (mmsk, src,
9734 CONST0_RTX (mode)))));
9736 /* Estimate the approximate reciprocal square root. */
9737 rtx xdst = gen_reg_rtx (mode);
9738 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9740 /* Iterate over the series twice for SF and thrice for DF. */
9741 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9743 /* Optionally iterate over the series once less for faster performance
9744 while sacrificing the accuracy. */
9745 if ((recp && flag_mrecip_low_precision_sqrt)
9746 || (!recp && flag_mlow_precision_sqrt))
9747 iterations--;
9749 /* Iterate over the series to calculate the approximate reciprocal square
9750 root. */
9751 rtx x1 = gen_reg_rtx (mode);
9752 while (iterations--)
9754 rtx x2 = gen_reg_rtx (mode);
9755 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9757 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9759 if (iterations > 0)
9760 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9763 if (!recp)
9765 /* Qualify the approximate reciprocal square root when the argument is
9766 0.0 by squashing the intermediate result to 0.0. */
9767 rtx xtmp = gen_reg_rtx (mmsk);
9768 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9769 gen_rtx_SUBREG (mmsk, xdst, 0)));
9770 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9772 /* Calculate the approximate square root. */
9773 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9776 /* Finalize the approximation. */
9777 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9779 return true;
9782 typedef rtx (*recpe_type) (rtx, rtx);
9784 /* Select reciprocal initial estimate insn depending on machine mode. */
9786 static recpe_type
9787 get_recpe_type (machine_mode mode)
9789 switch (mode)
9791 case E_SFmode: return (gen_aarch64_frecpesf);
9792 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9793 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9794 case E_DFmode: return (gen_aarch64_frecpedf);
9795 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9796 default: gcc_unreachable ();
9800 typedef rtx (*recps_type) (rtx, rtx, rtx);
9802 /* Select reciprocal series step insn depending on machine mode. */
9804 static recps_type
9805 get_recps_type (machine_mode mode)
9807 switch (mode)
9809 case E_SFmode: return (gen_aarch64_frecpssf);
9810 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9811 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9812 case E_DFmode: return (gen_aarch64_frecpsdf);
9813 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9814 default: gcc_unreachable ();
9818 /* Emit the instruction sequence to compute the approximation for the division
9819 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9821 bool
9822 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9824 machine_mode mode = GET_MODE (quo);
9826 if (GET_MODE_INNER (mode) == HFmode)
9827 return false;
9829 bool use_approx_division_p = (flag_mlow_precision_div
9830 || (aarch64_tune_params.approx_modes->division
9831 & AARCH64_APPROX_MODE (mode)));
9833 if (!flag_finite_math_only
9834 || flag_trapping_math
9835 || !flag_unsafe_math_optimizations
9836 || optimize_function_for_size_p (cfun)
9837 || !use_approx_division_p)
9838 return false;
9840 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9841 return false;
9843 /* Estimate the approximate reciprocal. */
9844 rtx xrcp = gen_reg_rtx (mode);
9845 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9847 /* Iterate over the series twice for SF and thrice for DF. */
9848 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9850 /* Optionally iterate over the series once less for faster performance,
9851 at the expense of some accuracy. */
9852 if (flag_mlow_precision_div)
9853 iterations--;
9855 /* Iterate over the series to calculate the approximate reciprocal. */
9856 rtx xtmp = gen_reg_rtx (mode);
9857 while (iterations--)
9859 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
9861 if (iterations > 0)
9862 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9865 if (num != CONST1_RTX (mode))
9867 /* As the approximate reciprocal of DEN is already calculated, only
9868 calculate the approximate division when NUM is not 1.0. */
9869 rtx xnum = force_reg (mode, num);
9870 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
9873 /* Finalize the approximation. */
9874 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
9875 return true;
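/* A note on the sequence above (a sketch, assuming FRECPE (d) ~= 1/d and
   FRECPS (a, b) = 2 - a * b): each pass refines the estimate x of 1/DEN
   with the Newton-Raphson step

     x' = x * (2 - DEN * x)

   written as xtmp = FRECPS (x, DEN), x' = x * xtmp.  As for the square
   root above, the last correction factor XTMP is folded into the final
   multiply that produces QUO, and the optional multiply by NUM turns the
   reciprocal into the full quotient NUM / DEN.  */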
9878 /* Return the number of instructions that can be issued per cycle. */
9879 static int
9880 aarch64_sched_issue_rate (void)
9882 return aarch64_tune_params.issue_rate;
9885 static int
9886 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9888 int issue_rate = aarch64_sched_issue_rate ();
9890 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
9894 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9895 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9896 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9898 static int
9899 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
9900 int ready_index)
9902 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
9906 /* Vectorizer cost model target hooks. */
9908 /* Implement targetm.vectorize.builtin_vectorization_cost. */
9909 static int
9910 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
9911 tree vectype,
9912 int misalign ATTRIBUTE_UNUSED)
9914 unsigned elements;
9915 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
9916 bool fp = false;
9918 if (vectype != NULL)
9919 fp = FLOAT_TYPE_P (vectype);
9921 switch (type_of_cost)
9923 case scalar_stmt:
9924 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
9926 case scalar_load:
9927 return costs->scalar_load_cost;
9929 case scalar_store:
9930 return costs->scalar_store_cost;
9932 case vector_stmt:
9933 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9935 case vector_load:
9936 return costs->vec_align_load_cost;
9938 case vector_store:
9939 return costs->vec_store_cost;
9941 case vec_to_scalar:
9942 return costs->vec_to_scalar_cost;
9944 case scalar_to_vec:
9945 return costs->scalar_to_vec_cost;
9947 case unaligned_load:
9948 case vector_gather_load:
9949 return costs->vec_unalign_load_cost;
9951 case unaligned_store:
9952 case vector_scatter_store:
9953 return costs->vec_unalign_store_cost;
9955 case cond_branch_taken:
9956 return costs->cond_taken_branch_cost;
9958 case cond_branch_not_taken:
9959 return costs->cond_not_taken_branch_cost;
9961 case vec_perm:
9962 return costs->vec_permute_cost;
9964 case vec_promote_demote:
9965 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9967 case vec_construct:
9968 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
9969 return elements / 2 + 1;
9971 default:
9972 gcc_unreachable ();
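/* As a worked example of the vec_construct case above: building a
   four-element vector such as V4SF element by element is costed as
   4 / 2 + 1 = 3, i.e. roughly one statement per pair of elements plus
   one.  */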
9976 /* Implement targetm.vectorize.add_stmt_cost. */
9977 static unsigned
9978 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
9979 struct _stmt_vec_info *stmt_info, int misalign,
9980 enum vect_cost_model_location where)
9982 unsigned *cost = (unsigned *) data;
9983 unsigned retval = 0;
9985 if (flag_vect_cost_model)
9987 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
9988 int stmt_cost =
9989 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
9991 /* Statements in an inner loop relative to the loop being
9992 vectorized are weighted more heavily. The value here is
9993 arbitrary and could potentially be improved with analysis. */
9994 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
9995 count *= 50; /* FIXME */
9997 retval = (unsigned) (count * stmt_cost);
9998 cost[where] += retval;
10001 return retval;
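/* A rough worked example of the hook above: a vector_load statement in the
   body of an inner loop, with COUNT == 1 and (hypothetically) a
   vec_align_load_cost of 1, is accumulated as 1 * 50 * 1 = 50 units into
   COST[vect_body] because of the inner-loop weighting, whereas the same
   statement in the outer loop body would contribute just 1 unit.  */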
10004 static void initialize_aarch64_code_model (struct gcc_options *);
10006 /* Parse the TO_PARSE string and put the architecture struct that it
10007 selects into RES and the architectural features into ISA_FLAGS.
10008 Return an aarch64_parse_opt_result describing the parse result.
10009 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10011 static enum aarch64_parse_opt_result
10012 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10013 unsigned long *isa_flags)
10015 char *ext;
10016 const struct processor *arch;
10017 char *str = (char *) alloca (strlen (to_parse) + 1);
10018 size_t len;
10020 strcpy (str, to_parse);
10022 ext = strchr (str, '+');
10024 if (ext != NULL)
10025 len = ext - str;
10026 else
10027 len = strlen (str);
10029 if (len == 0)
10030 return AARCH64_PARSE_MISSING_ARG;
10033 /* Loop through the list of supported ARCHes to find a match. */
10034 for (arch = all_architectures; arch->name != NULL; arch++)
10036 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10038 unsigned long isa_temp = arch->flags;
10040 if (ext != NULL)
10042 /* TO_PARSE string contains at least one extension. */
10043 enum aarch64_parse_opt_result ext_res
10044 = aarch64_parse_extension (ext, &isa_temp);
10046 if (ext_res != AARCH64_PARSE_OK)
10047 return ext_res;
10049 /* Extension parsing was successful. Confirm the result
10050 arch and ISA flags. */
10051 *res = arch;
10052 *isa_flags = isa_temp;
10053 return AARCH64_PARSE_OK;
10057 /* ARCH name not found in list. */
10058 return AARCH64_PARSE_INVALID_ARG;
10061 /* Parse the TO_PARSE string and put the result tuning in RES and the
10062 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10063 describing the parse result. If there is an error parsing, RES and
10064 ISA_FLAGS are left unchanged. */
10066 static enum aarch64_parse_opt_result
10067 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10068 unsigned long *isa_flags)
10070 char *ext;
10071 const struct processor *cpu;
10072 char *str = (char *) alloca (strlen (to_parse) + 1);
10073 size_t len;
10075 strcpy (str, to_parse);
10077 ext = strchr (str, '+');
10079 if (ext != NULL)
10080 len = ext - str;
10081 else
10082 len = strlen (str);
10084 if (len == 0)
10085 return AARCH64_PARSE_MISSING_ARG;
10088 /* Loop through the list of supported CPUs to find a match. */
10089 for (cpu = all_cores; cpu->name != NULL; cpu++)
10091 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10093 unsigned long isa_temp = cpu->flags;
10096 if (ext != NULL)
10098 /* TO_PARSE string contains at least one extension. */
10099 enum aarch64_parse_opt_result ext_res
10100 = aarch64_parse_extension (ext, &isa_temp);
10102 if (ext_res != AARCH64_PARSE_OK)
10103 return ext_res;
10105 /* Extension parsing was successful. Confirm the result
10106 cpu and ISA flags. */
10107 *res = cpu;
10108 *isa_flags = isa_temp;
10109 return AARCH64_PARSE_OK;
10113 /* CPU name not found in list. */
10114 return AARCH64_PARSE_INVALID_ARG;
10117 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10118 Return an aarch64_parse_opt_result describing the parse result.
10119 If the parsing fails, RES is left unchanged. */
10121 static enum aarch64_parse_opt_result
10122 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10124 const struct processor *cpu;
10125 char *str = (char *) alloca (strlen (to_parse) + 1);
10127 strcpy (str, to_parse);
10129 /* Loop through the list of supported CPUs to find a match. */
10130 for (cpu = all_cores; cpu->name != NULL; cpu++)
10132 if (strcmp (cpu->name, str) == 0)
10134 *res = cpu;
10135 return AARCH64_PARSE_OK;
10139 /* CPU name not found in list. */
10140 return AARCH64_PARSE_INVALID_ARG;
10143 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10144 described in FLAG. If it is, return the index bit for that fusion type.
10145 If not, error (printing OPTION_NAME) and return zero. */
10147 static unsigned int
10148 aarch64_parse_one_option_token (const char *token,
10149 size_t length,
10150 const struct aarch64_flag_desc *flag,
10151 const char *option_name)
10153 for (; flag->name != NULL; flag++)
10155 if (length == strlen (flag->name)
10156 && !strncmp (flag->name, token, length))
10157 return flag->flag;
10160 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10161 return 0;
10164 /* Parse OPTION which is a comma-separated list of flags to enable.
10165 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10166 default state we inherit from the CPU tuning structures. OPTION_NAME
10167 gives the top-level option we are parsing in the -moverride string,
10168 for use in error messages. */
10170 static unsigned int
10171 aarch64_parse_boolean_options (const char *option,
10172 const struct aarch64_flag_desc *flags,
10173 unsigned int initial_state,
10174 const char *option_name)
10176 const char separator = '.';
10177 const char* specs = option;
10178 const char* ntoken = option;
10179 unsigned int found_flags = initial_state;
10181 while ((ntoken = strchr (specs, separator)))
10183 size_t token_length = ntoken - specs;
10184 unsigned token_ops = aarch64_parse_one_option_token (specs,
10185 token_length,
10186 flags,
10187 option_name);
10188 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10189 in the token stream, reset the supported operations. So:
10191 adrp+add.cmp+branch.none.adrp+add
10193 would have the result of turning on only adrp+add fusion. */
10194 if (!token_ops)
10195 found_flags = 0;
10197 found_flags |= token_ops;
10198 specs = ++ntoken;
10201 /* The string ended with a trailing separator; that is ill-formed. */
10202 if (!(*specs))
10204 error ("%s string ill-formed\n", option_name);
10205 return 0;
10208 /* We still have one more token to parse. */
10209 size_t token_length = strlen (specs);
10210 unsigned token_ops = aarch64_parse_one_option_token (specs,
10211 token_length,
10212 flags,
10213 option_name);
10214 if (!token_ops)
10215 found_flags = 0;
10217 found_flags |= token_ops;
10218 return found_flags;
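/* For instance, parsing "adrp+add.cmp+branch" with an INITIAL_STATE of 0
   ORs together the index bits of those two fusion pairs, whereas a string
   such as "none.adrp+add" first resets the inherited state and then enables
   adrp+add alone, matching the example in the comment above.  */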
10221 /* Support for overriding instruction fusion. */
10223 static void
10224 aarch64_parse_fuse_string (const char *fuse_string,
10225 struct tune_params *tune)
10227 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10228 aarch64_fusible_pairs,
10229 tune->fusible_ops,
10230 "fuse=");
10233 /* Support for overriding other tuning flags. */
10235 static void
10236 aarch64_parse_tune_string (const char *tune_string,
10237 struct tune_params *tune)
10239 tune->extra_tuning_flags
10240 = aarch64_parse_boolean_options (tune_string,
10241 aarch64_tuning_flags,
10242 tune->extra_tuning_flags,
10243 "tune=");
10246 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10247 we understand. If it is, extract the option string and hand it off to
10248 the appropriate function. */
10250 void
10251 aarch64_parse_one_override_token (const char* token,
10252 size_t length,
10253 struct tune_params *tune)
10255 const struct aarch64_tuning_override_function *fn
10256 = aarch64_tuning_override_functions;
10258 const char *option_part = strchr (token, '=');
10259 if (!option_part)
10261 error ("tuning string missing in option (%s)", token);
10262 return;
10265 /* Get the length of the option name. */
10266 length = option_part - token;
10267 /* Skip the '=' to get to the option string. */
10268 option_part++;
10270 for (; fn->name != NULL; fn++)
10272 if (!strncmp (fn->name, token, length))
10274 fn->parse_override (option_part, tune);
10275 return;
10279 error ("unknown tuning option (%s)", token);
10280 return;
10283 /* Validate and clamp the TLS offset size for the selected code model. */
10285 static void
10286 initialize_aarch64_tls_size (struct gcc_options *opts)
10288 if (aarch64_tls_size == 0)
10289 aarch64_tls_size = 24;
10291 switch (opts->x_aarch64_cmodel_var)
10293 case AARCH64_CMODEL_TINY:
10294 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
10295 needs two instructions to address, so we clamp the size to 24. */
10296 if (aarch64_tls_size > 24)
10297 aarch64_tls_size = 24;
10298 break;
10299 case AARCH64_CMODEL_SMALL:
10300 /* The maximum TLS size allowed under small is 4G. */
10301 if (aarch64_tls_size > 32)
10302 aarch64_tls_size = 32;
10303 break;
10304 case AARCH64_CMODEL_LARGE:
10305 /* The maximum TLS size allowed under large is 16E.
10306 FIXME: 16E requires a 64-bit offset, but we only support a 48-bit offset for now. */
10307 if (aarch64_tls_size > 48)
10308 aarch64_tls_size = 48;
10309 break;
10310 default:
10311 gcc_unreachable ();
10314 return;
10317 /* Parse STRING looking for options in the format:
10318 string :: option:string
10319 option :: name=substring
10320 name :: {a-z}
10321 substring :: defined by option. */
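/* For example, an override string of the (illustrative) form

     -moverride=fuse=adrp+add.cmp+branch:tune=rename_fma_regs

   is split on ':' below, and each "name=value" token is dispatched through
   aarch64_parse_one_override_token to aarch64_parse_fuse_string or
   aarch64_parse_tune_string; the flag names themselves must of course be
   entries in the fusible-pair and tuning-flag tables.  */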
10323 static void
10324 aarch64_parse_override_string (const char* input_string,
10325 struct tune_params* tune)
10327 const char separator = ':';
10328 size_t string_length = strlen (input_string) + 1;
10329 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10330 char *string = string_root;
10331 strncpy (string, input_string, string_length);
10332 string[string_length - 1] = '\0';
10334 char* ntoken = string;
10336 while ((ntoken = strchr (string, separator)))
10338 size_t token_length = ntoken - string;
10339 /* NUL-terminate this substring so it is a string in its own right. */
10340 *ntoken = '\0';
10341 aarch64_parse_one_override_token (string, token_length, tune);
10342 string = ++ntoken;
10345 /* One last option to parse. */
10346 aarch64_parse_one_override_token (string, strlen (string), tune);
10347 free (string_root);
10351 static void
10352 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10354 /* PR 70044: We have to be careful about being called multiple times for the
10355 same function. This means all changes should be repeatable. */
10357 /* If the frame pointer is enabled, set it to a special value that behaves
10358 similar to frame pointer omission. If we don't do this all leaf functions
10359 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10360 If flag_omit_frame_pointer has this special value, we must force the
10361 frame pointer if not in a leaf function. We also need to force it in a
10362 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10363 if (opts->x_flag_omit_frame_pointer == 0)
10364 opts->x_flag_omit_frame_pointer = 2;
10366 /* If not optimizing for size, set the default
10367 alignment to what the target wants. */
10368 if (!opts->x_optimize_size)
10370 if (opts->x_align_loops <= 0)
10371 opts->x_align_loops = aarch64_tune_params.loop_align;
10372 if (opts->x_align_jumps <= 0)
10373 opts->x_align_jumps = aarch64_tune_params.jump_align;
10374 if (opts->x_align_functions <= 0)
10375 opts->x_align_functions = aarch64_tune_params.function_align;
10378 /* We default to no pc-relative literal loads. */
10380 aarch64_pcrelative_literal_loads = false;
10382 /* If -mpc-relative-literal-loads is set on the command line, this
10383 implies that the user asked for PC relative literal loads. */
10384 if (opts->x_pcrelative_literal_loads == 1)
10385 aarch64_pcrelative_literal_loads = true;
10387 /* In the tiny memory model it makes no sense to disallow PC relative
10388 literal pool loads. */
10389 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10390 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10391 aarch64_pcrelative_literal_loads = true;
10393 /* When enabling the lower precision Newton series for the square root, also
10394 enable it for the reciprocal square root, since the latter is an
10395 intermediate step for the former. */
10396 if (flag_mlow_precision_sqrt)
10397 flag_mrecip_low_precision_sqrt = true;
10400 /* 'Unpack' the internal tuning structs and update the options
10401 in OPTS. The caller must have set up selected_tune and selected_arch
10402 as all the other target-specific codegen decisions are
10403 derived from them. */
10405 void
10406 aarch64_override_options_internal (struct gcc_options *opts)
10408 aarch64_tune_flags = selected_tune->flags;
10409 aarch64_tune = selected_tune->sched_core;
10410 /* Make a copy of the tuning parameters attached to the core, which
10411 we may later overwrite. */
10412 aarch64_tune_params = *(selected_tune->tune);
10413 aarch64_architecture_version = selected_arch->architecture_version;
10415 if (opts->x_aarch64_override_tune_string)
10416 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10417 &aarch64_tune_params);
10419 /* This target defaults to strict volatile bitfields. */
10420 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10421 opts->x_flag_strict_volatile_bitfields = 1;
10423 initialize_aarch64_code_model (opts);
10424 initialize_aarch64_tls_size (opts);
10426 int queue_depth = 0;
10427 switch (aarch64_tune_params.autoprefetcher_model)
10429 case tune_params::AUTOPREFETCHER_OFF:
10430 queue_depth = -1;
10431 break;
10432 case tune_params::AUTOPREFETCHER_WEAK:
10433 queue_depth = 0;
10434 break;
10435 case tune_params::AUTOPREFETCHER_STRONG:
10436 queue_depth = max_insn_queue_index + 1;
10437 break;
10438 default:
10439 gcc_unreachable ();
10442 /* We don't mind passing in global_options_set here as we don't use
10443 the *options_set structs anyway. */
10444 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10445 queue_depth,
10446 opts->x_param_values,
10447 global_options_set.x_param_values);
10449 /* Set up parameters to be used in prefetching algorithm. Do not
10450 override the defaults unless we are tuning for a core we have
10451 researched values for. */
10452 if (aarch64_tune_params.prefetch->num_slots > 0)
10453 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10454 aarch64_tune_params.prefetch->num_slots,
10455 opts->x_param_values,
10456 global_options_set.x_param_values);
10457 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10458 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10459 aarch64_tune_params.prefetch->l1_cache_size,
10460 opts->x_param_values,
10461 global_options_set.x_param_values);
10462 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10463 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10464 aarch64_tune_params.prefetch->l1_cache_line_size,
10465 opts->x_param_values,
10466 global_options_set.x_param_values);
10467 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10468 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10469 aarch64_tune_params.prefetch->l2_cache_size,
10470 opts->x_param_values,
10471 global_options_set.x_param_values);
10473 /* Use the alternative scheduling-pressure algorithm by default. */
10474 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10475 opts->x_param_values,
10476 global_options_set.x_param_values);
10478 /* Enable software prefetching at the specified optimization level for
10479 CPUs that have a prefetch model. Lower the optimization level threshold by 1
10480 when profiling is enabled. */
10481 if (opts->x_flag_prefetch_loop_arrays < 0
10482 && !opts->x_optimize_size
10483 && aarch64_tune_params.prefetch->default_opt_level >= 0
10484 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10485 opts->x_flag_prefetch_loop_arrays = 1;
10487 aarch64_override_options_after_change_1 (opts);
10490 /* Print a hint with a suggestion for a core or architecture name that
10491 most closely resembles what the user passed in STR. ARCH is true if
10492 the user is asking for an architecture name. ARCH is false if the user
10493 is asking for a core name. */
10495 static void
10496 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10498 auto_vec<const char *> candidates;
10499 const struct processor *entry = arch ? all_architectures : all_cores;
10500 for (; entry->name != NULL; entry++)
10501 candidates.safe_push (entry->name);
10502 char *s;
10503 const char *hint = candidates_list_and_hint (str, s, candidates);
10504 if (hint)
10505 inform (input_location, "valid arguments are: %s;"
10506 " did you mean %qs?", s, hint);
10507 XDELETEVEC (s);
10510 /* Print a hint with a suggestion for a core name that most closely resembles
10511 what the user passed in STR. */
10513 inline static void
10514 aarch64_print_hint_for_core (const char *str)
10516 aarch64_print_hint_for_core_or_arch (str, false);
10519 /* Print a hint with a suggestion for an architecture name that most closely
10520 resembles what the user passed in STR. */
10522 inline static void
10523 aarch64_print_hint_for_arch (const char *str)
10525 aarch64_print_hint_for_core_or_arch (str, true);
10528 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10529 specified in STR and throw errors if appropriate. Put the results,
10530 if they are valid, in RES and ISA_FLAGS. Return whether the option is
10531 valid. */
10533 static bool
10534 aarch64_validate_mcpu (const char *str, const struct processor **res,
10535 unsigned long *isa_flags)
10537 enum aarch64_parse_opt_result parse_res
10538 = aarch64_parse_cpu (str, res, isa_flags);
10540 if (parse_res == AARCH64_PARSE_OK)
10541 return true;
10543 switch (parse_res)
10545 case AARCH64_PARSE_MISSING_ARG:
10546 error ("missing cpu name in %<-mcpu=%s%>", str);
10547 break;
10548 case AARCH64_PARSE_INVALID_ARG:
10549 error ("unknown value %qs for -mcpu", str);
10550 aarch64_print_hint_for_core (str);
10551 break;
10552 case AARCH64_PARSE_INVALID_FEATURE:
10553 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10554 break;
10555 default:
10556 gcc_unreachable ();
10559 return false;
10562 /* Validate a command-line -march option. Parse the arch and extensions
10563 (if any) specified in STR and throw errors if appropriate. Put the
10564 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10565 option is valid. */
10567 static bool
10568 aarch64_validate_march (const char *str, const struct processor **res,
10569 unsigned long *isa_flags)
10571 enum aarch64_parse_opt_result parse_res
10572 = aarch64_parse_arch (str, res, isa_flags);
10574 if (parse_res == AARCH64_PARSE_OK)
10575 return true;
10577 switch (parse_res)
10579 case AARCH64_PARSE_MISSING_ARG:
10580 error ("missing arch name in %<-march=%s%>", str);
10581 break;
10582 case AARCH64_PARSE_INVALID_ARG:
10583 error ("unknown value %qs for -march", str);
10584 aarch64_print_hint_for_arch (str);
10585 break;
10586 case AARCH64_PARSE_INVALID_FEATURE:
10587 error ("invalid feature modifier in %<-march=%s%>", str);
10588 break;
10589 default:
10590 gcc_unreachable ();
10593 return false;
10596 /* Validate a command-line -mtune option. Parse the cpu
10597 specified in STR and throw errors if appropriate. Put the
10598 result, if it is valid, in RES. Return whether the option is
10599 valid. */
10601 static bool
10602 aarch64_validate_mtune (const char *str, const struct processor **res)
10604 enum aarch64_parse_opt_result parse_res
10605 = aarch64_parse_tune (str, res);
10607 if (parse_res == AARCH64_PARSE_OK)
10608 return true;
10610 switch (parse_res)
10612 case AARCH64_PARSE_MISSING_ARG:
10613 error ("missing cpu name in %<-mtune=%s%>", str);
10614 break;
10615 case AARCH64_PARSE_INVALID_ARG:
10616 error ("unknown value %qs for -mtune", str);
10617 aarch64_print_hint_for_core (str);
10618 break;
10619 default:
10620 gcc_unreachable ();
10622 return false;
10625 /* Return the CPU corresponding to the enum CPU.
10626 If it doesn't specify a cpu, return the default. */
10628 static const struct processor *
10629 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10631 if (cpu != aarch64_none)
10632 return &all_cores[cpu];
10634 /* The & 0x3f is to extract the bottom 6 bits that encode the
10635 default cpu as selected by the --with-cpu GCC configure option
10636 in config.gcc.
10637 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10638 flags mechanism should be reworked to make it more sane. */
10639 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
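/* That is, TARGET_CPU_DEFAULT is assumed to be packed as
   (default_cpu_index | (default_isa_flags << 6)): the low six bits index
   all_cores here, and aarch64_override_options recovers the ISA flag bits
   with TARGET_CPU_DEFAULT >> 6.  */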
10642 /* Return the architecture corresponding to the enum ARCH.
10643 If it doesn't specify a valid architecture, return the default. */
10645 static const struct processor *
10646 aarch64_get_arch (enum aarch64_arch arch)
10648 if (arch != aarch64_no_arch)
10649 return &all_architectures[arch];
10651 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10653 return &all_architectures[cpu->arch];
10656 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10658 static poly_uint16
10659 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10661 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10662 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10663 deciding which .md file patterns to use and when deciding whether
10664 something is a legitimate address or constant. */
10665 if (value == SVE_SCALABLE || value == SVE_128)
10666 return poly_uint16 (2, 2);
10667 else
10668 return (int) value / 64;
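/* For example, -msve-vector-bits=256 yields the constant VG 256 / 64 = 4
   (four 64-bit granules per vector), while both "scalable" and 128 map to
   the runtime-sized poly_uint16 (2, 2): a minimum of two granules plus two
   more for each additional 128-bit increment of the vector length.  */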
10671 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10672 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10673 tuning structs. In particular it must set selected_tune and
10674 aarch64_isa_flags that define the available ISA features and tuning
10675 decisions. It must also set selected_arch as this will be used to
10676 output the .arch asm tags for each function. */
10678 static void
10679 aarch64_override_options (void)
10681 unsigned long cpu_isa = 0;
10682 unsigned long arch_isa = 0;
10683 aarch64_isa_flags = 0;
10685 bool valid_cpu = true;
10686 bool valid_tune = true;
10687 bool valid_arch = true;
10689 selected_cpu = NULL;
10690 selected_arch = NULL;
10691 selected_tune = NULL;
10693 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10694 If either of -march or -mtune is given, they override their
10695 respective component of -mcpu. */
10696 if (aarch64_cpu_string)
10697 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10698 &cpu_isa);
10700 if (aarch64_arch_string)
10701 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10702 &arch_isa);
10704 if (aarch64_tune_string)
10705 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10707 /* If the user did not specify a processor, choose the default
10708 one for them. This will be the CPU set during configuration using
10709 --with-cpu, otherwise it is "generic". */
10710 if (!selected_cpu)
10712 if (selected_arch)
10714 selected_cpu = &all_cores[selected_arch->ident];
10715 aarch64_isa_flags = arch_isa;
10716 explicit_arch = selected_arch->arch;
10718 else
10720 /* Get default configure-time CPU. */
10721 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10722 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10725 if (selected_tune)
10726 explicit_tune_core = selected_tune->ident;
10728 /* If both -mcpu and -march are specified, check that they are architecturally
10729 compatible; warn if they're not, and prefer the -march ISA flags. */
10730 else if (selected_arch)
10732 if (selected_arch->arch != selected_cpu->arch)
10734 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10735 all_architectures[selected_cpu->arch].name,
10736 selected_arch->name);
10738 aarch64_isa_flags = arch_isa;
10739 explicit_arch = selected_arch->arch;
10740 explicit_tune_core = selected_tune ? selected_tune->ident
10741 : selected_cpu->ident;
10743 else
10745 /* -mcpu but no -march. */
10746 aarch64_isa_flags = cpu_isa;
10747 explicit_tune_core = selected_tune ? selected_tune->ident
10748 : selected_cpu->ident;
10749 gcc_assert (selected_cpu);
10750 selected_arch = &all_architectures[selected_cpu->arch];
10751 explicit_arch = selected_arch->arch;
10754 /* Set the arch as well, as we will need it when outputting
10755 the .arch directive in assembly. */
10756 if (!selected_arch)
10758 gcc_assert (selected_cpu);
10759 selected_arch = &all_architectures[selected_cpu->arch];
10762 if (!selected_tune)
10763 selected_tune = selected_cpu;
10765 #ifndef HAVE_AS_MABI_OPTION
10766 /* The compiler may have been configured with 2.23.* binutils, which does
10767 not have support for ILP32. */
10768 if (TARGET_ILP32)
10769 error ("assembler does not support -mabi=ilp32");
10770 #endif
10772 /* Convert -msve-vector-bits to a VG count. */
10773 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10775 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10776 sorry ("return address signing is only supported for -mabi=lp64");
10778 /* Make sure we properly set up the explicit options. */
10779 if ((aarch64_cpu_string && valid_cpu)
10780 || (aarch64_tune_string && valid_tune))
10781 gcc_assert (explicit_tune_core != aarch64_none);
10783 if ((aarch64_cpu_string && valid_cpu)
10784 || (aarch64_arch_string && valid_arch))
10785 gcc_assert (explicit_arch != aarch64_no_arch);
10787 aarch64_override_options_internal (&global_options);
10789 /* Save these options as the default ones in case we push and pop them later
10790 while processing functions with potential target attributes. */
10791 target_option_default_node = target_option_current_node
10792 = build_target_option_node (&global_options);
10795 /* Implement targetm.override_options_after_change. */
10797 static void
10798 aarch64_override_options_after_change (void)
10800 aarch64_override_options_after_change_1 (&global_options);
10803 static struct machine_function *
10804 aarch64_init_machine_status (void)
10806 struct machine_function *machine;
10807 machine = ggc_cleared_alloc<machine_function> ();
10808 return machine;
10811 void
10812 aarch64_init_expanders (void)
10814 init_machine_status = aarch64_init_machine_status;
10817 /* Validate and finalize the code model selection, adjusting it for PIC. */
10818 static void
10819 initialize_aarch64_code_model (struct gcc_options *opts)
10821 if (opts->x_flag_pic)
10823 switch (opts->x_aarch64_cmodel_var)
10825 case AARCH64_CMODEL_TINY:
10826 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10827 break;
10828 case AARCH64_CMODEL_SMALL:
10829 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10830 aarch64_cmodel = (flag_pic == 2
10831 ? AARCH64_CMODEL_SMALL_PIC
10832 : AARCH64_CMODEL_SMALL_SPIC);
10833 #else
10834 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10835 #endif
10836 break;
10837 case AARCH64_CMODEL_LARGE:
10838 sorry ("code model %qs with -f%s", "large",
10839 opts->x_flag_pic > 1 ? "PIC" : "pic");
10840 break;
10841 default:
10842 gcc_unreachable ();
10845 else
10846 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10849 /* Implement TARGET_OPTION_SAVE. */
10851 static void
10852 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10854 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10857 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10858 using the information saved in PTR. */
10860 static void
10861 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
10863 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
10864 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10865 opts->x_explicit_arch = ptr->x_explicit_arch;
10866 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
10867 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
10869 aarch64_override_options_internal (opts);
10872 /* Implement TARGET_OPTION_PRINT. */
10874 static void
10875 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
10877 const struct processor *cpu
10878 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10879 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
10880 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
10881 std::string extension
10882 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
10884 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
10885 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
10886 arch->name, extension.c_str ());
10889 static GTY(()) tree aarch64_previous_fndecl;
10891 void
10892 aarch64_reset_previous_fndecl (void)
10894 aarch64_previous_fndecl = NULL;
10897 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
10898 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
10899 make sure optab availability predicates are recomputed when necessary. */
10901 void
10902 aarch64_save_restore_target_globals (tree new_tree)
10904 if (TREE_TARGET_GLOBALS (new_tree))
10905 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
10906 else if (new_tree == target_option_default_node)
10907 restore_target_globals (&default_target_globals);
10908 else
10909 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
10912 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
10913 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
10914 of the function, if any. This function may be called multiple
10915 times on a single function so use aarch64_previous_fndecl to avoid
10916 setting up identical state. */
10918 static void
10919 aarch64_set_current_function (tree fndecl)
10921 if (!fndecl || fndecl == aarch64_previous_fndecl)
10922 return;
10924 tree old_tree = (aarch64_previous_fndecl
10925 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
10926 : NULL_TREE);
10928 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10930 /* If current function has no attributes but the previous one did,
10931 use the default node. */
10932 if (!new_tree && old_tree)
10933 new_tree = target_option_default_node;
10935 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
10936 the default have been handled by aarch64_save_restore_target_globals from
10937 aarch64_pragma_target_parse. */
10938 if (old_tree == new_tree)
10939 return;
10941 aarch64_previous_fndecl = fndecl;
10943 /* First set the target options. */
10944 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
10946 aarch64_save_restore_target_globals (new_tree);
10949 /* Enum describing the various ways we can handle attributes.
10950 In many cases we can reuse the generic option handling machinery. */
10952 enum aarch64_attr_opt_type
10954 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
10955 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
10956 aarch64_attr_enum, /* Attribute sets an enum variable. */
10957 aarch64_attr_custom /* Attribute requires a custom handling function. */
10960 /* All the information needed to handle a target attribute.
10961 NAME is the name of the attribute.
10962 ATTR_TYPE specifies the type of behavior of the attribute as described
10963 in the definition of enum aarch64_attr_opt_type.
10964 ALLOW_NEG is true if the attribute supports a "no-" form.
10965 HANDLER is the function that takes the attribute string as an argument.
10966 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
10967 OPT_NUM is the enum specifying the option that the attribute modifies.
10968 This is needed for attributes that mirror the behavior of a command-line
10969 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
10970 aarch64_attr_enum. */
10972 struct aarch64_attribute_info
10974 const char *name;
10975 enum aarch64_attr_opt_type attr_type;
10976 bool allow_neg;
10977 bool (*handler) (const char *);
10978 enum opt_code opt_num;
10981 /* Handle the ARCH_STR argument to the arch= target attribute. */
10983 static bool
10984 aarch64_handle_attr_arch (const char *str)
10986 const struct processor *tmp_arch = NULL;
10987 enum aarch64_parse_opt_result parse_res
10988 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
10990 if (parse_res == AARCH64_PARSE_OK)
10992 gcc_assert (tmp_arch);
10993 selected_arch = tmp_arch;
10994 explicit_arch = selected_arch->arch;
10995 return true;
10998 switch (parse_res)
11000 case AARCH64_PARSE_MISSING_ARG:
11001 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11002 break;
11003 case AARCH64_PARSE_INVALID_ARG:
11004 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11005 aarch64_print_hint_for_arch (str);
11006 break;
11007 case AARCH64_PARSE_INVALID_FEATURE:
11008 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11009 break;
11010 default:
11011 gcc_unreachable ();
11014 return false;
11017 /* Handle the argument CPU_STR to the cpu= target attribute. */
11019 static bool
11020 aarch64_handle_attr_cpu (const char *str)
11022 const struct processor *tmp_cpu = NULL;
11023 enum aarch64_parse_opt_result parse_res
11024 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11026 if (parse_res == AARCH64_PARSE_OK)
11028 gcc_assert (tmp_cpu);
11029 selected_tune = tmp_cpu;
11030 explicit_tune_core = selected_tune->ident;
11032 selected_arch = &all_architectures[tmp_cpu->arch];
11033 explicit_arch = selected_arch->arch;
11034 return true;
11037 switch (parse_res)
11039 case AARCH64_PARSE_MISSING_ARG:
11040 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11041 break;
11042 case AARCH64_PARSE_INVALID_ARG:
11043 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11044 aarch64_print_hint_for_core (str);
11045 break;
11046 case AARCH64_PARSE_INVALID_FEATURE:
11047 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11048 break;
11049 default:
11050 gcc_unreachable ();
11053 return false;
11056 /* Handle the argument STR to the tune= target attribute. */
11058 static bool
11059 aarch64_handle_attr_tune (const char *str)
11061 const struct processor *tmp_tune = NULL;
11062 enum aarch64_parse_opt_result parse_res
11063 = aarch64_parse_tune (str, &tmp_tune);
11065 if (parse_res == AARCH64_PARSE_OK)
11067 gcc_assert (tmp_tune);
11068 selected_tune = tmp_tune;
11069 explicit_tune_core = selected_tune->ident;
11070 return true;
11073 switch (parse_res)
11075 case AARCH64_PARSE_INVALID_ARG:
11076 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11077 aarch64_print_hint_for_core (str);
11078 break;
11079 default:
11080 gcc_unreachable ();
11083 return false;
11086 /* Parse an architecture extensions target attribute string specified in STR.
11087 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11088 if successful. Update aarch64_isa_flags to reflect the ISA features
11089 modified. */
11091 static bool
11092 aarch64_handle_attr_isa_flags (char *str)
11094 enum aarch64_parse_opt_result parse_res;
11095 unsigned long isa_flags = aarch64_isa_flags;
11097 /* We allow "+nothing" in the beginning to clear out all architectural
11098 features if the user wants to handpick specific features. */
11099 if (strncmp ("+nothing", str, 8) == 0)
11101 isa_flags = 0;
11102 str += 8;
11105 parse_res = aarch64_parse_extension (str, &isa_flags);
11107 if (parse_res == AARCH64_PARSE_OK)
11109 aarch64_isa_flags = isa_flags;
11110 return true;
11113 switch (parse_res)
11115 case AARCH64_PARSE_MISSING_ARG:
11116 error ("missing value in %<target()%> pragma or attribute");
11117 break;
11119 case AARCH64_PARSE_INVALID_FEATURE:
11120 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11121 break;
11123 default:
11124 gcc_unreachable ();
11127 return false;
11130 /* The target attributes that we support. On top of these we also support just
11131 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11132 handled explicitly in aarch64_process_one_target_attr. */
11134 static const struct aarch64_attribute_info aarch64_attributes[] =
11136 { "general-regs-only", aarch64_attr_mask, false, NULL,
11137 OPT_mgeneral_regs_only },
11138 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11139 OPT_mfix_cortex_a53_835769 },
11140 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11141 OPT_mfix_cortex_a53_843419 },
11142 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11143 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11144 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11145 OPT_momit_leaf_frame_pointer },
11146 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11147 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11148 OPT_march_ },
11149 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11150 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11151 OPT_mtune_ },
11152 { "sign-return-address", aarch64_attr_enum, false, NULL,
11153 OPT_msign_return_address_ },
11154 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11157 /* Parse ARG_STR which contains the definition of one target attribute.
11158 Show appropriate errors if any or return true if the attribute is valid. */
11160 static bool
11161 aarch64_process_one_target_attr (char *arg_str)
11163 bool invert = false;
11165 size_t len = strlen (arg_str);
11167 if (len == 0)
11169 error ("malformed %<target()%> pragma or attribute");
11170 return false;
11173 char *str_to_check = (char *) alloca (len + 1);
11174 strcpy (str_to_check, arg_str);
11176 /* Skip leading whitespace. */
11177 while (*str_to_check == ' ' || *str_to_check == '\t')
11178 str_to_check++;
11180 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11181 It is easier to detect and handle it explicitly here rather than going
11182 through the machinery for the rest of the target attributes in this
11183 function. */
11184 if (*str_to_check == '+')
11185 return aarch64_handle_attr_isa_flags (str_to_check);
11187 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11189 invert = true;
11190 str_to_check += 3;
11192 char *arg = strchr (str_to_check, '=');
11194 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11195 and point ARG to "foo". */
11196 if (arg)
11198 *arg = '\0';
11199 arg++;
11201 const struct aarch64_attribute_info *p_attr;
11202 bool found = false;
11203 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11205 /* If the names don't match up, or the user has given an argument
11206 to an attribute that doesn't accept one, or didn't give an argument
11207 to an attribute that expects one, fail to match. */
11208 if (strcmp (str_to_check, p_attr->name) != 0)
11209 continue;
11211 found = true;
11212 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11213 || p_attr->attr_type == aarch64_attr_enum;
11215 if (attr_need_arg_p ^ (arg != NULL))
11217 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11218 return false;
11221 /* If the name matches but the attribute does not allow "no-" versions
11222 then we can't match. */
11223 if (invert && !p_attr->allow_neg)
11225 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11226 return false;
11229 switch (p_attr->attr_type)
11231 /* Has a custom handler registered.
11232 For example, cpu=, arch=, tune=. */
11233 case aarch64_attr_custom:
11234 gcc_assert (p_attr->handler);
11235 if (!p_attr->handler (arg))
11236 return false;
11237 break;
11239 /* Either set or unset a boolean option. */
11240 case aarch64_attr_bool:
11242 struct cl_decoded_option decoded;
11244 generate_option (p_attr->opt_num, NULL, !invert,
11245 CL_TARGET, &decoded);
11246 aarch64_handle_option (&global_options, &global_options_set,
11247 &decoded, input_location);
11248 break;
11250 /* Set or unset a bit in the target_flags. aarch64_handle_option
11251 should know what mask to apply given the option number. */
11252 case aarch64_attr_mask:
11254 struct cl_decoded_option decoded;
11255 /* We only need to specify the option number.
11256 aarch64_handle_option will know which mask to apply. */
11257 decoded.opt_index = p_attr->opt_num;
11258 decoded.value = !invert;
11259 aarch64_handle_option (&global_options, &global_options_set,
11260 &decoded, input_location);
11261 break;
11263 /* Use the option setting machinery to set an option to an enum. */
11264 case aarch64_attr_enum:
11266 gcc_assert (arg);
11267 bool valid;
11268 int value;
11269 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11270 &value, CL_TARGET);
11271 if (valid)
11273 set_option (&global_options, NULL, p_attr->opt_num, value,
11274 NULL, DK_UNSPECIFIED, input_location,
11275 global_dc);
11277 else
11279 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11281 break;
11283 default:
11284 gcc_unreachable ();
11288 /* If we reached here we either have found an attribute and validated
11289 it or didn't match any. If we matched an attribute but its arguments
11290 were malformed we will have returned false already. */
11291 return found;
11294 /* Count how many times the character C appears in
11295 NULL-terminated string STR. */
11297 static unsigned int
11298 num_occurences_in_str (char c, char *str)
11300 unsigned int res = 0;
11301 while (*str != '\0')
11303 if (*str == c)
11304 res++;
11306 str++;
11309 return res;
11312 /* Parse the tree in ARGS that contains the target attribute information
11313 and update the global target options space. */
11315 bool
11316 aarch64_process_target_attr (tree args)
11318 if (TREE_CODE (args) == TREE_LIST)
11322 tree head = TREE_VALUE (args);
11323 if (head)
11325 if (!aarch64_process_target_attr (head))
11326 return false;
11328 args = TREE_CHAIN (args);
11329 } while (args);
11331 return true;
11334 if (TREE_CODE (args) != STRING_CST)
11336 error ("attribute %<target%> argument not a string");
11337 return false;
11340 size_t len = strlen (TREE_STRING_POINTER (args));
11341 char *str_to_check = (char *) alloca (len + 1);
11342 strcpy (str_to_check, TREE_STRING_POINTER (args));
11344 if (len == 0)
11346 error ("malformed %<target()%> pragma or attribute");
11347 return false;
11350 /* Used to catch empty tokens between commas, i.e.
11351 attribute ((target ("attr1,,attr2"))). */
11352 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11354 /* Handle multiple target attributes separated by ','. */
11355 char *token = strtok (str_to_check, ",");
11357 unsigned int num_attrs = 0;
11358 while (token)
11360 num_attrs++;
11361 if (!aarch64_process_one_target_attr (token))
11363 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11364 return false;
11367 token = strtok (NULL, ",");
11370 if (num_attrs != num_commas + 1)
11372 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11373 return false;
11376 return true;
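/* As an illustrative use (assuming the architecture name is one the build
   supports), a declaration carrying

     __attribute__ ((target ("arch=armv8.1-a,fix-cortex-a53-835769")))

   reaches this function as that string; it is split on ',' and each token
   is handled by aarch64_process_one_target_attr, with the comma count check
   above rejecting empty entries such as "attr1,,attr2".  */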
11379 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11380 process attribute ((target ("..."))). */
11382 static bool
11383 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11385 struct cl_target_option cur_target;
11386 bool ret;
11387 tree old_optimize;
11388 tree new_target, new_optimize;
11389 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11391 /* If what we're processing is the current pragma string then the
11392 target option node is already stored in target_option_current_node
11393 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11394 having to re-parse the string. This is especially useful to keep
11395 arm_neon.h compile times down since that header contains a lot
11396 of intrinsics enclosed in pragmas. */
11397 if (!existing_target && args == current_target_pragma)
11399 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11400 return true;
11402 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11404 old_optimize = build_optimization_node (&global_options);
11405 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11407 /* If the function changed the optimization levels as well as setting
11408 target options, start with the optimizations specified. */
11409 if (func_optimize && func_optimize != old_optimize)
11410 cl_optimization_restore (&global_options,
11411 TREE_OPTIMIZATION (func_optimize));
11413 /* Save the current target options to restore at the end. */
11414 cl_target_option_save (&cur_target, &global_options);
11416 /* If fndecl already has some target attributes applied to it, unpack
11417 them so that we add this attribute on top of them, rather than
11418 overwriting them. */
11419 if (existing_target)
11421 struct cl_target_option *existing_options
11422 = TREE_TARGET_OPTION (existing_target);
11424 if (existing_options)
11425 cl_target_option_restore (&global_options, existing_options);
11427 else
11428 cl_target_option_restore (&global_options,
11429 TREE_TARGET_OPTION (target_option_current_node));
11431 ret = aarch64_process_target_attr (args);
11433 /* Set up any additional state. */
11434 if (ret)
11436 aarch64_override_options_internal (&global_options);
11437 /* Initialize SIMD builtins if we haven't already.
11438 Set current_target_pragma to NULL for the duration so that
11439 the builtin initialization code doesn't try to tag the functions
11440 being built with the attributes specified by any current pragma, thus
11441 going into an infinite recursion. */
11442 if (TARGET_SIMD)
11444 tree saved_current_target_pragma = current_target_pragma;
11445 current_target_pragma = NULL;
11446 aarch64_init_simd_builtins ();
11447 current_target_pragma = saved_current_target_pragma;
11449 new_target = build_target_option_node (&global_options);
11451 else
11452 new_target = NULL;
11454 new_optimize = build_optimization_node (&global_options);
11456 if (fndecl && ret)
11458 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11460 if (old_optimize != new_optimize)
11461 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11464 cl_target_option_restore (&global_options, &cur_target);
11466 if (old_optimize != new_optimize)
11467 cl_optimization_restore (&global_options,
11468 TREE_OPTIMIZATION (old_optimize));
11469 return ret;
11472 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11473 tri-bool options (yes, no, don't care) and the default value is
11474 DEF, determine whether to reject inlining. */
11476 static bool
11477 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11478 int dont_care, int def)
11480 /* If the callee doesn't care, always allow inlining. */
11481 if (callee == dont_care)
11482 return true;
11484 /* If the caller doesn't care, always allow inlining. */
11485 if (caller == dont_care)
11486 return true;
11488 /* Otherwise, allow inlining if either the callee and caller values
11489 agree, or if the callee is using the default value. */
11490 return (callee == caller || callee == def);
11493 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11494 to inline CALLEE into CALLER based on target-specific info.
11495 Make sure that the caller and callee have compatible architectural
11496 features. Then go through the other possible target attributes
11497 and see if they can block inlining. Try not to reject always_inline
11498 callees unless they are incompatible architecturally. */
11500 static bool
11501 aarch64_can_inline_p (tree caller, tree callee)
11503 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11504 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11506 /* If callee has no option attributes, then it is ok to inline. */
11507 if (!callee_tree)
11508 return true;
11510 struct cl_target_option *caller_opts
11511 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11512 : target_option_default_node);
11514 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11517 /* Callee's ISA flags should be a subset of the caller's. */
11518 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11519 != callee_opts->x_aarch64_isa_flags)
11520 return false;
11522 /* Allow a non-strict-align function to be inlined into a strict-align
11523 one, but not the reverse. */
11524 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11525 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11526 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11527 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11528 return false;
11530 bool always_inline = lookup_attribute ("always_inline",
11531 DECL_ATTRIBUTES (callee));
11533 /* If the architectural features match up and the callee is always_inline
11534 then the other attributes don't matter. */
11535 if (always_inline)
11536 return true;
11538 if (caller_opts->x_aarch64_cmodel_var
11539 != callee_opts->x_aarch64_cmodel_var)
11540 return false;
11542 if (caller_opts->x_aarch64_tls_dialect
11543 != callee_opts->x_aarch64_tls_dialect)
11544 return false;
11546 /* Honour explicit requests to work around errata. */
11547 if (!aarch64_tribools_ok_for_inlining_p (
11548 caller_opts->x_aarch64_fix_a53_err835769,
11549 callee_opts->x_aarch64_fix_a53_err835769,
11550 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11551 return false;
11553 if (!aarch64_tribools_ok_for_inlining_p (
11554 caller_opts->x_aarch64_fix_a53_err843419,
11555 callee_opts->x_aarch64_fix_a53_err843419,
11556 2, TARGET_FIX_ERR_A53_843419))
11557 return false;
11559 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11560 caller and callee and they don't match up, reject inlining. */
11561 if (!aarch64_tribools_ok_for_inlining_p (
11562 caller_opts->x_flag_omit_leaf_frame_pointer,
11563 callee_opts->x_flag_omit_leaf_frame_pointer,
11564 2, 1))
11565 return false;
11567 /* If the callee has specific tuning overrides, respect them. */
11568 if (callee_opts->x_aarch64_override_tune_string != NULL
11569 && caller_opts->x_aarch64_override_tune_string == NULL)
11570 return false;
11572 /* If the user specified tuning override strings for the
11573 caller and callee and they don't match up, reject inlining.
11574 We just do a string compare here, we don't analyze the meaning
11575 of the string, as it would be too costly for little gain. */
11576 if (callee_opts->x_aarch64_override_tune_string
11577 && caller_opts->x_aarch64_override_tune_string
11578 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11579 caller_opts->x_aarch64_override_tune_string) != 0))
11580 return false;
11582 return true;
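/* For example, under the ISA-subset rule above a callee carrying
   __attribute__ ((target ("+crc"))) cannot be inlined into a caller compiled
   without the CRC extension, since the callee's ISA flags would not be a
   subset of the caller's, while a callee with no target attribute at all is
   always considered inlinable.  */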
11585 /* Return true if SYMBOL_REF X binds locally. */
11587 static bool
11588 aarch64_symbol_binds_local_p (const_rtx x)
11590 return (SYMBOL_REF_DECL (x)
11591 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11592 : SYMBOL_REF_LOCAL_P (x));
11595 /* Return true if SYMBOL_REF X is thread local. */
11596 static bool
11597 aarch64_tls_symbol_p (rtx x)
11599 if (! TARGET_HAVE_TLS)
11600 return false;
11602 if (GET_CODE (x) != SYMBOL_REF)
11603 return false;
11605 return SYMBOL_REF_TLS_MODEL (x) != 0;
11608 /* Classify a TLS symbol into one of the TLS kinds. */
11609 enum aarch64_symbol_type
11610 aarch64_classify_tls_symbol (rtx x)
11612 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11614 switch (tls_kind)
11616 case TLS_MODEL_GLOBAL_DYNAMIC:
11617 case TLS_MODEL_LOCAL_DYNAMIC:
11618 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11620 case TLS_MODEL_INITIAL_EXEC:
11621 switch (aarch64_cmodel)
11623 case AARCH64_CMODEL_TINY:
11624 case AARCH64_CMODEL_TINY_PIC:
11625 return SYMBOL_TINY_TLSIE;
11626 default:
11627 return SYMBOL_SMALL_TLSIE;
11630 case TLS_MODEL_LOCAL_EXEC:
11631 if (aarch64_tls_size == 12)
11632 return SYMBOL_TLSLE12;
11633 else if (aarch64_tls_size == 24)
11634 return SYMBOL_TLSLE24;
11635 else if (aarch64_tls_size == 32)
11636 return SYMBOL_TLSLE32;
11637 else if (aarch64_tls_size == 48)
11638 return SYMBOL_TLSLE48;
11639 else
11640 gcc_unreachable ();
11642 case TLS_MODEL_EMULATED:
11643 case TLS_MODEL_NONE:
11644 return SYMBOL_FORCE_TO_MEM;
11646 default:
11647 gcc_unreachable ();
11651 /* Return the correct method for accessing X + OFFSET, where X is either
11652 a SYMBOL_REF or LABEL_REF. */
11654 enum aarch64_symbol_type
11655 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11657 if (GET_CODE (x) == LABEL_REF)
11659 switch (aarch64_cmodel)
11661 case AARCH64_CMODEL_LARGE:
11662 return SYMBOL_FORCE_TO_MEM;
11664 case AARCH64_CMODEL_TINY_PIC:
11665 case AARCH64_CMODEL_TINY:
11666 return SYMBOL_TINY_ABSOLUTE;
11668 case AARCH64_CMODEL_SMALL_SPIC:
11669 case AARCH64_CMODEL_SMALL_PIC:
11670 case AARCH64_CMODEL_SMALL:
11671 return SYMBOL_SMALL_ABSOLUTE;
11673 default:
11674 gcc_unreachable ();
11678 if (GET_CODE (x) == SYMBOL_REF)
11680 if (aarch64_tls_symbol_p (x))
11681 return aarch64_classify_tls_symbol (x);
11683 switch (aarch64_cmodel)
11685 case AARCH64_CMODEL_TINY:
11686 /* When we retrieve symbol + offset address, we have to make sure
11687 the offset does not cause overflow of the final address. But
11688 we have no way of knowing the address of the symbol at compile time,
11689 so we can't accurately say whether the distance between the PC and
11690 symbol + offset is outside the addressable range of +/-1M in the
11691 TINY code model. So we rely on images not being greater than
11692 1M, cap the offset at 1M, and require anything beyond that to
11693 be loaded using an alternative mechanism. Furthermore, if the
11694 symbol is a weak reference to something that isn't known to
11695 resolve to a symbol in this module, then force to memory. */
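/* For example, under -mcmodel=tiny an address such as symbol + 0x200000
   fails the range check below and is forced to the literal pool, whereas
   symbol + 4096 can still be formed directly with ADR. */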
11696 if ((SYMBOL_REF_WEAK (x)
11697 && !aarch64_symbol_binds_local_p (x))
11698 || !IN_RANGE (offset, -1048575, 1048575))
11699 return SYMBOL_FORCE_TO_MEM;
11700 return SYMBOL_TINY_ABSOLUTE;
11702 case AARCH64_CMODEL_SMALL:
11703 /* Same reasoning as the tiny code model, but the offset cap here is
11704 4G. */
11705 if ((SYMBOL_REF_WEAK (x)
11706 && !aarch64_symbol_binds_local_p (x))
11707 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11708 HOST_WIDE_INT_C (4294967264)))
11709 return SYMBOL_FORCE_TO_MEM;
11710 return SYMBOL_SMALL_ABSOLUTE;
11712 case AARCH64_CMODEL_TINY_PIC:
11713 if (!aarch64_symbol_binds_local_p (x))
11714 return SYMBOL_TINY_GOT;
11715 return SYMBOL_TINY_ABSOLUTE;
11717 case AARCH64_CMODEL_SMALL_SPIC:
11718 case AARCH64_CMODEL_SMALL_PIC:
11719 if (!aarch64_symbol_binds_local_p (x))
11720 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11721 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11722 return SYMBOL_SMALL_ABSOLUTE;
11724 case AARCH64_CMODEL_LARGE:
11725 /* This is alright even in PIC code as the constant
11726 pool reference is always PC relative and within
11727 the same translation unit. */
11728 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11729 return SYMBOL_SMALL_ABSOLUTE;
11730 else
11731 return SYMBOL_FORCE_TO_MEM;
11733 default:
11734 gcc_unreachable ();
11738 /* By default push everything into the constant pool. */
11739 return SYMBOL_FORCE_TO_MEM;
11742 bool
11743 aarch64_constant_address_p (rtx x)
11745 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11748 bool
11749 aarch64_legitimate_pic_operand_p (rtx x)
11751 if (GET_CODE (x) == SYMBOL_REF
11752 || (GET_CODE (x) == CONST
11753 && GET_CODE (XEXP (x, 0)) == PLUS
11754 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11755 return false;
11757 return true;
11760 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11761 that should be rematerialized rather than spilled. */
11763 static bool
11764 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11766 /* Support CSE and rematerialization of common constants. */
11767 if (CONST_INT_P (x)
11768 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11769 || GET_CODE (x) == CONST_VECTOR)
11770 return true;
11772 /* Do not allow vector struct mode constants for Advanced SIMD.
11773 We could support 0 and -1 easily, but they need support in
11774 aarch64-simd.md. */
11775 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11776 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11777 return false;
11779 /* Only accept variable-length vector constants if they can be
11780 handled directly.
11782 ??? It would be possible to handle rematerialization of other
11783 constants via secondary reloads. */
11784 if (vec_flags & VEC_ANY_SVE)
11785 return aarch64_simd_valid_immediate (x, NULL);
11787 if (GET_CODE (x) == HIGH)
11788 x = XEXP (x, 0);
11790 /* Accept polynomial constants that can be calculated by using the
11791 destination of a move as the sole temporary. Constants that
11792 require a second temporary cannot be rematerialized (they can't be
11793 forced to memory and also aren't legitimate constants). */
11794 poly_int64 offset;
11795 if (poly_int_rtx_p (x, &offset))
11796 return aarch64_offset_temporaries (false, offset) <= 1;
11798 /* If an offset is being added to something else, we need to allow the
11799 base to be moved into the destination register, meaning that there
11800 are no free temporaries for the offset. */
11801 x = strip_offset (x, &offset);
11802 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11803 return false;
11805 /* Do not allow const (plus (anchor_symbol, const_int)). */
11806 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11807 return false;
11809 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11810 so spilling them is better than rematerialization. */
11811 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11812 return true;
11814 /* Label references are always constant. */
11815 if (GET_CODE (x) == LABEL_REF)
11816 return true;
11818 return false;
11822 aarch64_load_tp (rtx target)
11824 if (!target
11825 || GET_MODE (target) != Pmode
11826 || !register_operand (target, Pmode))
11827 target = gen_reg_rtx (Pmode);
11829 /* Can return in any reg. */
11830 emit_insn (gen_aarch64_load_tp_hard (target));
11831 return target;
11834 /* On AAPCS systems, this is the "struct __va_list". */
11835 static GTY(()) tree va_list_type;
11837 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11838 Return the type to use as __builtin_va_list.
11840 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11842 struct __va_list
11844 void *__stack;
11845 void *__gr_top;
11846 void *__vr_top;
11847 int __gr_offs;
11848 int __vr_offs;
11849 }; */
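/* As a rough worked example of how these fields cooperate (ignoring the
   tree-stdarg optimization below, which can shrink the save areas): for

	int f (int n, ...);

   whose named arguments use only x0, va_start leaves __gr_offs == -56
   (x1..x7 saved in the 56 bytes below __gr_top) and __vr_offs == -128
   (q0..q7 saved below __vr_top). Each va_arg satisfied from a register
   save area advances the corresponding offset towards zero; once an
   offset is non-negative, further arguments of that class are read from
   __stack. */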
11851 static tree
11852 aarch64_build_builtin_va_list (void)
11854 tree va_list_name;
11855 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11857 /* Create the type. */
11858 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11859 /* Give it the required name. */
11860 va_list_name = build_decl (BUILTINS_LOCATION,
11861 TYPE_DECL,
11862 get_identifier ("__va_list"),
11863 va_list_type);
11864 DECL_ARTIFICIAL (va_list_name) = 1;
11865 TYPE_NAME (va_list_type) = va_list_name;
11866 TYPE_STUB_DECL (va_list_type) = va_list_name;
11868 /* Create the fields. */
11869 f_stack = build_decl (BUILTINS_LOCATION,
11870 FIELD_DECL, get_identifier ("__stack"),
11871 ptr_type_node);
11872 f_grtop = build_decl (BUILTINS_LOCATION,
11873 FIELD_DECL, get_identifier ("__gr_top"),
11874 ptr_type_node);
11875 f_vrtop = build_decl (BUILTINS_LOCATION,
11876 FIELD_DECL, get_identifier ("__vr_top"),
11877 ptr_type_node);
11878 f_groff = build_decl (BUILTINS_LOCATION,
11879 FIELD_DECL, get_identifier ("__gr_offs"),
11880 integer_type_node);
11881 f_vroff = build_decl (BUILTINS_LOCATION,
11882 FIELD_DECL, get_identifier ("__vr_offs"),
11883 integer_type_node);
11885 /* Tell tree-stdarg pass about our internal offset fields.
11886 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
11887 purposes, to identify whether the code is updating the va_list internal
11888 offset fields in an irregular way. */
11889 va_list_gpr_counter_field = f_groff;
11890 va_list_fpr_counter_field = f_vroff;
11892 DECL_ARTIFICIAL (f_stack) = 1;
11893 DECL_ARTIFICIAL (f_grtop) = 1;
11894 DECL_ARTIFICIAL (f_vrtop) = 1;
11895 DECL_ARTIFICIAL (f_groff) = 1;
11896 DECL_ARTIFICIAL (f_vroff) = 1;
11898 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
11899 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
11900 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
11901 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
11902 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
11904 TYPE_FIELDS (va_list_type) = f_stack;
11905 DECL_CHAIN (f_stack) = f_grtop;
11906 DECL_CHAIN (f_grtop) = f_vrtop;
11907 DECL_CHAIN (f_vrtop) = f_groff;
11908 DECL_CHAIN (f_groff) = f_vroff;
11910 /* Compute its layout. */
11911 layout_type (va_list_type);
11913 return va_list_type;
11916 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
11917 static void
11918 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
11920 const CUMULATIVE_ARGS *cum;
11921 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11922 tree stack, grtop, vrtop, groff, vroff;
11923 tree t;
11924 int gr_save_area_size = cfun->va_list_gpr_size;
11925 int vr_save_area_size = cfun->va_list_fpr_size;
11926 int vr_offset;
11928 cum = &crtl->args.info;
11929 if (cfun->va_list_gpr_size)
11930 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
11931 cfun->va_list_gpr_size);
11932 if (cfun->va_list_fpr_size)
11933 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
11934 * UNITS_PER_VREG, cfun->va_list_fpr_size);
11936 if (!TARGET_FLOAT)
11938 gcc_assert (cum->aapcs_nvrn == 0);
11939 vr_save_area_size = 0;
11942 f_stack = TYPE_FIELDS (va_list_type_node);
11943 f_grtop = DECL_CHAIN (f_stack);
11944 f_vrtop = DECL_CHAIN (f_grtop);
11945 f_groff = DECL_CHAIN (f_vrtop);
11946 f_vroff = DECL_CHAIN (f_groff);
11948 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
11949 NULL_TREE);
11950 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
11951 NULL_TREE);
11952 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
11953 NULL_TREE);
11954 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
11955 NULL_TREE);
11956 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
11957 NULL_TREE);
11959 /* Emit code to initialize STACK, which points to the next varargs stack
11960 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
11961 by named arguments. STACK is 8-byte aligned. */
11962 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
11963 if (cum->aapcs_stack_size > 0)
11964 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
11965 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
11966 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11968 /* Emit code to initialize GRTOP, the top of the GR save area.
11969 virtual_incoming_args_rtx should have been 16 byte aligned. */
11970 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
11971 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
11972 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11974 /* Emit code to initialize VRTOP, the top of the VR save area.
11975 This address is gr_save_area_bytes below GRTOP, rounded
11976 down to the next 16-byte boundary. */
11977 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
11978 vr_offset = ROUND_UP (gr_save_area_size,
11979 STACK_BOUNDARY / BITS_PER_UNIT);
11981 if (vr_offset)
11982 t = fold_build_pointer_plus_hwi (t, -vr_offset);
11983 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
11984 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11986 /* Emit code to initialize GROFF, the offset from GRTOP of the
11987 next GPR argument. */
11988 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
11989 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
11990 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11992 /* Likewise emit code to initialize VROFF, the offset from VRTOP
11993 of the next VR argument. */
11994 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
11995 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
11996 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11999 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12001 static tree
12002 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12003 gimple_seq *post_p ATTRIBUTE_UNUSED)
12005 tree addr;
12006 bool indirect_p;
12007 bool is_ha; /* is HFA or HVA. */
12008 bool dw_align; /* double-word align. */
12009 machine_mode ag_mode = VOIDmode;
12010 int nregs;
12011 machine_mode mode;
12013 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12014 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12015 HOST_WIDE_INT size, rsize, adjust, align;
12016 tree t, u, cond1, cond2;
12018 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12019 if (indirect_p)
12020 type = build_pointer_type (type);
12022 mode = TYPE_MODE (type);
12024 f_stack = TYPE_FIELDS (va_list_type_node);
12025 f_grtop = DECL_CHAIN (f_stack);
12026 f_vrtop = DECL_CHAIN (f_grtop);
12027 f_groff = DECL_CHAIN (f_vrtop);
12028 f_vroff = DECL_CHAIN (f_groff);
12030 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12031 f_stack, NULL_TREE);
12032 size = int_size_in_bytes (type);
12033 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12035 dw_align = false;
12036 adjust = 0;
12037 if (aarch64_vfp_is_call_or_return_candidate (mode,
12038 type,
12039 &ag_mode,
12040 &nregs,
12041 &is_ha))
12043 /* No frontends can create types with variable-sized modes, so we
12044 shouldn't be asked to pass or return them. */
12045 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12047 /* TYPE passed in fp/simd registers. */
12048 if (!TARGET_FLOAT)
12049 aarch64_err_no_fpadvsimd (mode, "varargs");
12051 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12052 unshare_expr (valist), f_vrtop, NULL_TREE);
12053 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12054 unshare_expr (valist), f_vroff, NULL_TREE);
12056 rsize = nregs * UNITS_PER_VREG;
12058 if (is_ha)
12060 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12061 adjust = UNITS_PER_VREG - ag_size;
12063 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12064 && size < UNITS_PER_VREG)
12066 adjust = UNITS_PER_VREG - size;
12069 else
12071 /* TYPE passed in general registers. */
12072 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12073 unshare_expr (valist), f_grtop, NULL_TREE);
12074 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12075 unshare_expr (valist), f_groff, NULL_TREE);
12076 rsize = ROUND_UP (size, UNITS_PER_WORD);
12077 nregs = rsize / UNITS_PER_WORD;
12079 if (align > 8)
12080 dw_align = true;
12082 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12083 && size < UNITS_PER_WORD)
12085 adjust = UNITS_PER_WORD - size;
12089 /* Get a local temporary for the field value. */
12090 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12092 /* Emit code to branch if off >= 0. */
12093 t = build2 (GE_EXPR, boolean_type_node, off,
12094 build_int_cst (TREE_TYPE (off), 0));
12095 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12097 if (dw_align)
12099 /* Emit: offs = (offs + 15) & -16. */
12100 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12101 build_int_cst (TREE_TYPE (off), 15));
12102 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12103 build_int_cst (TREE_TYPE (off), -16));
12104 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12106 else
12107 roundup = NULL;
12109 /* Update ap.__[g|v]r_offs */
12110 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12111 build_int_cst (TREE_TYPE (off), rsize));
12112 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12114 /* String up. */
12115 if (roundup)
12116 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12118 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12119 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12120 build_int_cst (TREE_TYPE (f_off), 0));
12121 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12123 /* String up: make sure the assignment happens before the use. */
12124 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12125 COND_EXPR_ELSE (cond1) = t;
12127 /* Prepare the trees handling the argument that is passed on the stack;
12128 the top level node will store in ON_STACK. */
12129 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12130 if (align > 8)
12132 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12133 t = fold_convert (intDI_type_node, arg);
12134 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12135 build_int_cst (TREE_TYPE (t), 15));
12136 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12137 build_int_cst (TREE_TYPE (t), -16));
12138 t = fold_convert (TREE_TYPE (arg), t);
12139 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12141 else
12142 roundup = NULL;
12143 /* Advance ap.__stack */
12144 t = fold_convert (intDI_type_node, arg);
12145 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12146 build_int_cst (TREE_TYPE (t), size + 7));
12147 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12148 build_int_cst (TREE_TYPE (t), -8));
12149 t = fold_convert (TREE_TYPE (arg), t);
12150 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12151 /* String up roundup and advance. */
12152 if (roundup)
12153 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12154 /* String up with arg */
12155 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12156 /* Big-endianness related address adjustment. */
12157 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12158 && size < UNITS_PER_WORD)
12160 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12161 size_int (UNITS_PER_WORD - size));
12162 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12165 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12166 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12168 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12169 t = off;
12170 if (adjust)
12171 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12172 build_int_cst (TREE_TYPE (off), adjust));
12174 t = fold_convert (sizetype, t);
12175 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12177 if (is_ha)
12179 /* type ha; // treat as "struct {ftype field[n];}"
12180 ... [computing offs]
12181 for (i = 0; i <nregs; ++i, offs += 16)
12182 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12183 return ha; */
12184 int i;
12185 tree tmp_ha, field_t, field_ptr_t;
12187 /* Declare a local variable. */
12188 tmp_ha = create_tmp_var_raw (type, "ha");
12189 gimple_add_tmp_var (tmp_ha);
12191 /* Establish the base type. */
12192 switch (ag_mode)
12194 case E_SFmode:
12195 field_t = float_type_node;
12196 field_ptr_t = float_ptr_type_node;
12197 break;
12198 case E_DFmode:
12199 field_t = double_type_node;
12200 field_ptr_t = double_ptr_type_node;
12201 break;
12202 case E_TFmode:
12203 field_t = long_double_type_node;
12204 field_ptr_t = long_double_ptr_type_node;
12205 break;
12206 case E_HFmode:
12207 field_t = aarch64_fp16_type_node;
12208 field_ptr_t = aarch64_fp16_ptr_type_node;
12209 break;
12210 case E_V2SImode:
12211 case E_V4SImode:
12213 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12214 field_t = build_vector_type_for_mode (innertype, ag_mode);
12215 field_ptr_t = build_pointer_type (field_t);
12217 break;
12218 default:
12219 gcc_assert (0);
12222 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12223 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12224 addr = t;
12225 t = fold_convert (field_ptr_t, addr);
12226 t = build2 (MODIFY_EXPR, field_t,
12227 build1 (INDIRECT_REF, field_t, tmp_ha),
12228 build1 (INDIRECT_REF, field_t, t));
12230 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12231 for (i = 1; i < nregs; ++i)
12233 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12234 u = fold_convert (field_ptr_t, addr);
12235 u = build2 (MODIFY_EXPR, field_t,
12236 build2 (MEM_REF, field_t, tmp_ha,
12237 build_int_cst (field_ptr_t,
12238 (i *
12239 int_size_in_bytes (field_t)))),
12240 build1 (INDIRECT_REF, field_t, u));
12241 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12244 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12245 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12248 COND_EXPR_ELSE (cond2) = t;
12249 addr = fold_convert (build_pointer_type (type), cond1);
12250 addr = build_va_arg_indirect_ref (addr);
12252 if (indirect_p)
12253 addr = build_va_arg_indirect_ref (addr);
12255 return addr;
12258 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12260 static void
12261 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12262 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12263 int no_rtl)
12265 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12266 CUMULATIVE_ARGS local_cum;
12267 int gr_saved = cfun->va_list_gpr_size;
12268 int vr_saved = cfun->va_list_fpr_size;
12270 /* The caller has advanced CUM up to, but not beyond, the last named
12271 argument. Advance a local copy of CUM past the last "real" named
12272 argument, to find out how many registers are left over. */
12273 local_cum = *cum;
12274 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12276 /* Find out how many registers we need to save.
12277 Honor the tree-stdarg analysis results. */
12278 if (cfun->va_list_gpr_size)
12279 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12280 cfun->va_list_gpr_size / UNITS_PER_WORD);
12281 if (cfun->va_list_fpr_size)
12282 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12283 cfun->va_list_fpr_size / UNITS_PER_VREG);
12285 if (!TARGET_FLOAT)
12287 gcc_assert (local_cum.aapcs_nvrn == 0);
12288 vr_saved = 0;
12291 if (!no_rtl)
12293 if (gr_saved > 0)
12295 rtx ptr, mem;
12297 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12298 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12299 - gr_saved * UNITS_PER_WORD);
12300 mem = gen_frame_mem (BLKmode, ptr);
12301 set_mem_alias_set (mem, get_varargs_alias_set ());
12303 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12304 mem, gr_saved);
12306 if (vr_saved > 0)
12308 /* We can't use move_block_from_reg, because it will use
12309 the wrong mode, storing D regs only. */
12310 machine_mode mode = TImode;
12311 int off, i, vr_start;
12313 /* Set OFF to the offset from virtual_incoming_args_rtx of
12314 the first vector register. The VR save area lies below
12315 the GR one, and is aligned to 16 bytes. */
12316 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12317 STACK_BOUNDARY / BITS_PER_UNIT);
12318 off -= vr_saved * UNITS_PER_VREG;
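/* For example, with gr_saved == 3 and vr_saved == 2, OFF is
   -ROUND_UP (24, 16) - 2 * 16 == -64, so the two Q registers are
   stored at incoming_args - 64 and incoming_args - 48, immediately
   below the GR save area rounded up to 32 bytes. */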
12320 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12321 for (i = 0; i < vr_saved; ++i)
12323 rtx ptr, mem;
12325 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12326 mem = gen_frame_mem (mode, ptr);
12327 set_mem_alias_set (mem, get_varargs_alias_set ());
12328 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12329 off += UNITS_PER_VREG;
12334 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12335 any complication of having crtl->args.pretend_args_size changed. */
12336 cfun->machine->frame.saved_varargs_size
12337 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12338 STACK_BOUNDARY / BITS_PER_UNIT)
12339 + vr_saved * UNITS_PER_VREG);
12342 static void
12343 aarch64_conditional_register_usage (void)
12345 int i;
12346 if (!TARGET_FLOAT)
12348 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12350 fixed_regs[i] = 1;
12351 call_used_regs[i] = 1;
12354 if (!TARGET_SVE)
12355 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12357 fixed_regs[i] = 1;
12358 call_used_regs[i] = 1;
12362 /* Walk down the type tree of TYPE counting consecutive base elements.
12363 If *MODEP is VOIDmode, then set it to the first valid floating point
12364 type. If a non-floating point type is found, or if a floating point
12365 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12366 otherwise return the count in the sub-tree. */
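/* For example, for

	struct hfa { double x, y, z; };

   the walk finds three consecutive DFmode elements and returns 3 with
   *MODEP set to DFmode, whereas

	struct mix { float f; double d; };

   returns -1 because the DFmode field does not match the SFmode
   recorded from the first field. */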
12367 static int
12368 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12370 machine_mode mode;
12371 HOST_WIDE_INT size;
12373 switch (TREE_CODE (type))
12375 case REAL_TYPE:
12376 mode = TYPE_MODE (type);
12377 if (mode != DFmode && mode != SFmode
12378 && mode != TFmode && mode != HFmode)
12379 return -1;
12381 if (*modep == VOIDmode)
12382 *modep = mode;
12384 if (*modep == mode)
12385 return 1;
12387 break;
12389 case COMPLEX_TYPE:
12390 mode = TYPE_MODE (TREE_TYPE (type));
12391 if (mode != DFmode && mode != SFmode
12392 && mode != TFmode && mode != HFmode)
12393 return -1;
12395 if (*modep == VOIDmode)
12396 *modep = mode;
12398 if (*modep == mode)
12399 return 2;
12401 break;
12403 case VECTOR_TYPE:
12404 /* Use V2SImode and V4SImode as representatives of all 64-bit
12405 and 128-bit vector types. */
12406 size = int_size_in_bytes (type);
12407 switch (size)
12409 case 8:
12410 mode = V2SImode;
12411 break;
12412 case 16:
12413 mode = V4SImode;
12414 break;
12415 default:
12416 return -1;
12419 if (*modep == VOIDmode)
12420 *modep = mode;
12422 /* Vector modes are considered to be opaque: two vectors are
12423 equivalent for the purposes of being homogeneous aggregates
12424 if they are the same size. */
12425 if (*modep == mode)
12426 return 1;
12428 break;
12430 case ARRAY_TYPE:
12432 int count;
12433 tree index = TYPE_DOMAIN (type);
12435 /* Can't handle incomplete types nor sizes that are not
12436 fixed. */
12437 if (!COMPLETE_TYPE_P (type)
12438 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12439 return -1;
12441 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12442 if (count == -1
12443 || !index
12444 || !TYPE_MAX_VALUE (index)
12445 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12446 || !TYPE_MIN_VALUE (index)
12447 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12448 || count < 0)
12449 return -1;
12451 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12452 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12454 /* There must be no padding. */
12455 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12456 count * GET_MODE_BITSIZE (*modep)))
12457 return -1;
12459 return count;
12462 case RECORD_TYPE:
12464 int count = 0;
12465 int sub_count;
12466 tree field;
12468 /* Can't handle incomplete types nor sizes that are not
12469 fixed. */
12470 if (!COMPLETE_TYPE_P (type)
12471 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12472 return -1;
12474 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12476 if (TREE_CODE (field) != FIELD_DECL)
12477 continue;
12479 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12480 if (sub_count < 0)
12481 return -1;
12482 count += sub_count;
12485 /* There must be no padding. */
12486 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12487 count * GET_MODE_BITSIZE (*modep)))
12488 return -1;
12490 return count;
12493 case UNION_TYPE:
12494 case QUAL_UNION_TYPE:
12496 /* These aren't very interesting except in a degenerate case. */
12497 int count = 0;
12498 int sub_count;
12499 tree field;
12501 /* Can't handle incomplete types nor sizes that are not
12502 fixed. */
12503 if (!COMPLETE_TYPE_P (type)
12504 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12505 return -1;
12507 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12509 if (TREE_CODE (field) != FIELD_DECL)
12510 continue;
12512 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12513 if (sub_count < 0)
12514 return -1;
12515 count = count > sub_count ? count : sub_count;
12518 /* There must be no padding. */
12519 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12520 count * GET_MODE_BITSIZE (*modep)))
12521 return -1;
12523 return count;
12526 default:
12527 break;
12530 return -1;
12533 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12534 type as described in AAPCS64 \S 4.1.2.
12536 See the comment above aarch64_composite_type_p for the notes on MODE. */
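/* For example, int32x4_t (16 bytes) and uint8x8_t (8 bytes) from
   arm_neon.h are short vector types, while a 32-byte GNU vector is
   not. */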
12538 static bool
12539 aarch64_short_vector_p (const_tree type,
12540 machine_mode mode)
12542 poly_int64 size = -1;
12544 if (type && TREE_CODE (type) == VECTOR_TYPE)
12545 size = int_size_in_bytes (type);
12546 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12547 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12548 size = GET_MODE_SIZE (mode);
12550 return known_eq (size, 8) || known_eq (size, 16);
12553 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12554 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12555 array types. The C99 floating-point complex types are also considered
12556 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12557 types, which are GCC extensions and out of the scope of AAPCS64, are
12558 treated as composite types here as well.
12560 Note that MODE itself is not sufficient in determining whether a type
12561 is such a composite type or not. This is because
12562 stor-layout.c:compute_record_mode may have already changed the MODE
12563 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12564 structure with only one field may have its MODE set to the mode of the
12565 field. Also an integer mode whose size matches the size of the
12566 RECORD_TYPE type may be used to substitute the original mode
12567 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12568 solely relied on. */
12570 static bool
12571 aarch64_composite_type_p (const_tree type,
12572 machine_mode mode)
12574 if (aarch64_short_vector_p (type, mode))
12575 return false;
12577 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12578 return true;
12580 if (mode == BLKmode
12581 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12582 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12583 return true;
12585 return false;
12588 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12589 shall be passed or returned in simd/fp register(s) (providing these
12590 parameter passing registers are available).
12592 Upon successful return, *COUNT returns the number of needed registers,
12593 *BASE_MODE returns the mode of the individual register and when IS_HA
12594 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12595 floating-point aggregate or a homogeneous short-vector aggregate. */
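/* For example, struct { float32x4_t a, b; } is a homogeneous
   short-vector aggregate: *COUNT is 2, *BASE_MODE is V4SImode and
   *IS_HA is set, so it is passed in two consecutive Q registers.
   _Complex double likewise gives *COUNT == 2 with *BASE_MODE ==
   DFmode. */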
12597 static bool
12598 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12599 const_tree type,
12600 machine_mode *base_mode,
12601 int *count,
12602 bool *is_ha)
12604 machine_mode new_mode = VOIDmode;
12605 bool composite_p = aarch64_composite_type_p (type, mode);
12607 if (is_ha != NULL) *is_ha = false;
12609 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12610 || aarch64_short_vector_p (type, mode))
12612 *count = 1;
12613 new_mode = mode;
12615 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12617 if (is_ha != NULL) *is_ha = true;
12618 *count = 2;
12619 new_mode = GET_MODE_INNER (mode);
12621 else if (type && composite_p)
12623 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12625 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12627 if (is_ha != NULL) *is_ha = true;
12628 *count = ag_count;
12630 else
12631 return false;
12633 else
12634 return false;
12636 *base_mode = new_mode;
12637 return true;
12640 /* Implement TARGET_STRUCT_VALUE_RTX. */
12642 static rtx
12643 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12644 int incoming ATTRIBUTE_UNUSED)
12646 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12649 /* Implements target hook vector_mode_supported_p. */
12650 static bool
12651 aarch64_vector_mode_supported_p (machine_mode mode)
12653 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12654 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12657 /* Return appropriate SIMD container
12658 for MODE within a vector of WIDTH bits. */
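/* For example, SImode yields V4SImode for WIDTH == 128, V2SImode for
   WIDTH == 64, and VNx4SImode when SVE is enabled and WIDTH is the SVE
   vector width. */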
12659 static machine_mode
12660 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12662 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12663 switch (mode)
12665 case E_DFmode:
12666 return VNx2DFmode;
12667 case E_SFmode:
12668 return VNx4SFmode;
12669 case E_HFmode:
12670 return VNx8HFmode;
12671 case E_DImode:
12672 return VNx2DImode;
12673 case E_SImode:
12674 return VNx4SImode;
12675 case E_HImode:
12676 return VNx8HImode;
12677 case E_QImode:
12678 return VNx16QImode;
12679 default:
12680 return word_mode;
12683 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12684 if (TARGET_SIMD)
12686 if (known_eq (width, 128))
12687 switch (mode)
12689 case E_DFmode:
12690 return V2DFmode;
12691 case E_SFmode:
12692 return V4SFmode;
12693 case E_HFmode:
12694 return V8HFmode;
12695 case E_SImode:
12696 return V4SImode;
12697 case E_HImode:
12698 return V8HImode;
12699 case E_QImode:
12700 return V16QImode;
12701 case E_DImode:
12702 return V2DImode;
12703 default:
12704 break;
12706 else
12707 switch (mode)
12709 case E_SFmode:
12710 return V2SFmode;
12711 case E_HFmode:
12712 return V4HFmode;
12713 case E_SImode:
12714 return V2SImode;
12715 case E_HImode:
12716 return V4HImode;
12717 case E_QImode:
12718 return V8QImode;
12719 default:
12720 break;
12723 return word_mode;
12726 /* Return the preferred SIMD container mode for MODE: an SVE mode when SVE is enabled, otherwise the 128-bit Advanced SIMD container. */
12727 static machine_mode
12728 aarch64_preferred_simd_mode (scalar_mode mode)
12730 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12731 return aarch64_simd_container_mode (mode, bits);
12734 /* Return a list of possible vector sizes for the vectorizer
12735 to iterate over. */
12736 static void
12737 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12739 if (TARGET_SVE)
12740 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12741 sizes->safe_push (16);
12742 sizes->safe_push (8);
12745 /* Implement TARGET_MANGLE_TYPE. */
12747 static const char *
12748 aarch64_mangle_type (const_tree type)
12750 /* The AArch64 ABI documents say that "__va_list" has to be
12751 mangled as if it were in the "std" namespace. */
12752 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12753 return "St9__va_list";
12755 /* Half-precision float. */
12756 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12757 return "Dh";
12759 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12760 builtin types. */
12761 if (TYPE_NAME (type) != NULL)
12762 return aarch64_mangle_builtin_type (type);
12764 /* Use the default mangling. */
12765 return NULL;
12768 /* Find the first rtx_insn before insn that will generate an assembly
12769 instruction. */
12771 static rtx_insn *
12772 aarch64_prev_real_insn (rtx_insn *insn)
12774 if (!insn)
12775 return NULL;
12779 insn = prev_real_insn (insn);
12781 while (insn && recog_memoized (insn) < 0);
12783 return insn;
12786 static bool
12787 is_madd_op (enum attr_type t1)
12789 unsigned int i;
12790 /* A number of these may be AArch32 only. */
12791 enum attr_type mlatypes[] = {
12792 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12793 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12794 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12797 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12799 if (t1 == mlatypes[i])
12800 return true;
12803 return false;
12806 /* Check if there is a register dependency between a load and the insn
12807 for which we hold recog_data. */
12809 static bool
12810 dep_between_memop_and_curr (rtx memop)
12812 rtx load_reg;
12813 int opno;
12815 gcc_assert (GET_CODE (memop) == SET);
12817 if (!REG_P (SET_DEST (memop)))
12818 return false;
12820 load_reg = SET_DEST (memop);
12821 for (opno = 1; opno < recog_data.n_operands; opno++)
12823 rtx operand = recog_data.operand[opno];
12824 if (REG_P (operand)
12825 && reg_overlap_mentioned_p (load_reg, operand))
12826 return true;
12829 return false;
12833 /* When working around the Cortex-A53 erratum 835769,
12834 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12835 instruction and has a preceding memory instruction such that a NOP
12836 should be inserted between them. */
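/* For instance, for a sequence such as

	ldr	x3, [x1]
	madd	x0, x4, x5, x6

   the workaround causes final to emit

	ldr	x3, [x1]
	nop	// between mem op and mult-accumulate
	madd	x0, x4, x5, x6

   (see aarch64_final_prescan_insn below). */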
12838 bool
12839 aarch64_madd_needs_nop (rtx_insn* insn)
12841 enum attr_type attr_type;
12842 rtx_insn *prev;
12843 rtx body;
12845 if (!TARGET_FIX_ERR_A53_835769)
12846 return false;
12848 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12849 return false;
12851 attr_type = get_attr_type (insn);
12852 if (!is_madd_op (attr_type))
12853 return false;
12855 prev = aarch64_prev_real_insn (insn);
12856 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12857 Restore recog state to INSN to avoid state corruption. */
12858 extract_constrain_insn_cached (insn);
12860 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12861 return false;
12863 body = single_set (prev);
12865 /* If the previous insn is a memory op and there is no dependency between
12866 it and the DImode madd, emit a NOP between them. If body is NULL then we
12867 have a complex memory operation, probably a load/store pair.
12868 Be conservative for now and emit a NOP. */
12869 if (GET_MODE (recog_data.operand[0]) == DImode
12870 && (!body || !dep_between_memop_and_curr (body)))
12871 return true;
12873 return false;
12878 /* Implement FINAL_PRESCAN_INSN. */
12880 void
12881 aarch64_final_prescan_insn (rtx_insn *insn)
12883 if (aarch64_madd_needs_nop (insn))
12884 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
12888 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
12889 instruction. */
12891 bool
12892 aarch64_sve_index_immediate_p (rtx base_or_step)
12894 return (CONST_INT_P (base_or_step)
12895 && IN_RANGE (INTVAL (base_or_step), -16, 15));
12898 /* Return true if X is a valid immediate for the SVE ADD and SUB
12899 instructions. Negate X first if NEGATE_P is true. */
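/* For example, for 32-bit elements, #255 (an unshifted 8-bit immediate)
   and #512 (an 8-bit immediate shifted left by 8) are accepted, but
   #257 is rejected because it has nonzero bits both below and above
   bit 7. */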
12901 bool
12902 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
12904 rtx elt;
12906 if (!const_vec_duplicate_p (x, &elt)
12907 || !CONST_INT_P (elt))
12908 return false;
12910 HOST_WIDE_INT val = INTVAL (elt);
12911 if (negate_p)
12912 val = -val;
12913 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
12915 if (val & 0xff)
12916 return IN_RANGE (val, 0, 0xff);
12917 return IN_RANGE (val, 0, 0xff00);
12920 /* Return true if X is a valid immediate operand for an SVE logical
12921 instruction such as AND. */
12923 bool
12924 aarch64_sve_bitmask_immediate_p (rtx x)
12926 rtx elt;
12928 return (const_vec_duplicate_p (x, &elt)
12929 && CONST_INT_P (elt)
12930 && aarch64_bitmask_imm (INTVAL (elt),
12931 GET_MODE_INNER (GET_MODE (x))));
12934 /* Return true if X is a valid immediate for the SVE DUP and CPY
12935 instructions. */
12937 bool
12938 aarch64_sve_dup_immediate_p (rtx x)
12940 rtx elt;
12942 if (!const_vec_duplicate_p (x, &elt)
12943 || !CONST_INT_P (elt))
12944 return false;
12946 HOST_WIDE_INT val = INTVAL (elt);
12947 if (val & 0xff)
12948 return IN_RANGE (val, -0x80, 0x7f);
12949 return IN_RANGE (val, -0x8000, 0x7f00);
12952 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
12953 SIGNED_P says whether the operand is signed rather than unsigned. */
12955 bool
12956 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
12958 rtx elt;
12960 return (const_vec_duplicate_p (x, &elt)
12961 && CONST_INT_P (elt)
12962 && (signed_p
12963 ? IN_RANGE (INTVAL (elt), -16, 15)
12964 : IN_RANGE (INTVAL (elt), 0, 127)));
12967 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
12968 instruction. Negate X first if NEGATE_P is true. */
12970 bool
12971 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
12973 rtx elt;
12974 REAL_VALUE_TYPE r;
12976 if (!const_vec_duplicate_p (x, &elt)
12977 || GET_CODE (elt) != CONST_DOUBLE)
12978 return false;
12980 r = *CONST_DOUBLE_REAL_VALUE (elt);
12982 if (negate_p)
12983 r = real_value_negate (&r);
12985 if (real_equal (&r, &dconst1))
12986 return true;
12987 if (real_equal (&r, &dconsthalf))
12988 return true;
12989 return false;
12992 /* Return true if X is a valid immediate operand for an SVE FMUL
12993 instruction. */
12995 bool
12996 aarch64_sve_float_mul_immediate_p (rtx x)
12998 rtx elt;
13000 /* GCC will never generate a multiply with an immediate of 2, so there is no
13001 point testing for it (even though it is a valid constant). */
13002 return (const_vec_duplicate_p (x, &elt)
13003 && GET_CODE (elt) == CONST_DOUBLE
13004 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13007 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13008 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13009 is nonnull, use it to describe valid immediates. */
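/* For example, replicating 0x00ab0000 is matched as immediate 0xab with
   LSL #16, and 0x00abffff (low bits all ones) as 0xab with MSL #16,
   whereas 0x00ab00cd matches neither form. */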
13010 static bool
13011 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13012 simd_immediate_info *info,
13013 enum simd_immediate_check which,
13014 simd_immediate_info::insn_type insn)
13016 /* Try a 4-byte immediate with LSL. */
13017 for (unsigned int shift = 0; shift < 32; shift += 8)
13018 if ((val32 & (0xff << shift)) == val32)
13020 if (info)
13021 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13022 simd_immediate_info::LSL, shift);
13023 return true;
13026 /* Try a 2-byte immediate with LSL. */
13027 unsigned int imm16 = val32 & 0xffff;
13028 if (imm16 == (val32 >> 16))
13029 for (unsigned int shift = 0; shift < 16; shift += 8)
13030 if ((imm16 & (0xff << shift)) == imm16)
13032 if (info)
13033 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13034 simd_immediate_info::LSL, shift);
13035 return true;
13038 /* Try a 4-byte immediate with MSL, except for cases that MVN
13039 can handle. */
13040 if (which == AARCH64_CHECK_MOV)
13041 for (unsigned int shift = 8; shift < 24; shift += 8)
13043 unsigned int low = (1 << shift) - 1;
13044 if (((val32 & (0xff << shift)) | low) == val32)
13046 if (info)
13047 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13048 simd_immediate_info::MSL, shift);
13049 return true;
13053 return false;
13056 /* Return true if replicating VAL64 is a valid immediate for the
13057 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13058 use it to describe valid immediates. */
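/* For example, for a MOV-style check, 0x4242424242424242 is matched as
   the replicated byte 0x42, and 0x00ff0000ffff00ff is a valid
   bit-to-bytemask immediate because every byte is either 0x00 or
   0xff. */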
13059 static bool
13060 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13061 simd_immediate_info *info,
13062 enum simd_immediate_check which)
13064 unsigned int val32 = val64 & 0xffffffff;
13065 unsigned int val16 = val64 & 0xffff;
13066 unsigned int val8 = val64 & 0xff;
13068 if (val32 == (val64 >> 32))
13070 if ((which & AARCH64_CHECK_ORR) != 0
13071 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13072 simd_immediate_info::MOV))
13073 return true;
13075 if ((which & AARCH64_CHECK_BIC) != 0
13076 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13077 simd_immediate_info::MVN))
13078 return true;
13080 /* Try using a replicated byte. */
13081 if (which == AARCH64_CHECK_MOV
13082 && val16 == (val32 >> 16)
13083 && val8 == (val16 >> 8))
13085 if (info)
13086 *info = simd_immediate_info (QImode, val8);
13087 return true;
13091 /* Try using a bit-to-bytemask. */
13092 if (which == AARCH64_CHECK_MOV)
13094 unsigned int i;
13095 for (i = 0; i < 64; i += 8)
13097 unsigned char byte = (val64 >> i) & 0xff;
13098 if (byte != 0 && byte != 0xff)
13099 break;
13101 if (i == 64)
13103 if (info)
13104 *info = simd_immediate_info (DImode, val64);
13105 return true;
13108 return false;
13111 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13112 instruction. If INFO is nonnull, use it to describe valid immediates. */
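/* For example, a replicated 0x0000002a narrows to SImode and is matched
   as a DUP of #42, while a replicated 0x00ffff00 is outside the DUP
   range but is accepted as a DUPM bitmask immediate (a contiguous run
   of set bits). */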
13114 static bool
13115 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13116 simd_immediate_info *info)
13118 scalar_int_mode mode = DImode;
13119 unsigned int val32 = val64 & 0xffffffff;
13120 if (val32 == (val64 >> 32))
13122 mode = SImode;
13123 unsigned int val16 = val32 & 0xffff;
13124 if (val16 == (val32 >> 16))
13126 mode = HImode;
13127 unsigned int val8 = val16 & 0xff;
13128 if (val8 == (val16 >> 8))
13129 mode = QImode;
13132 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13133 if (IN_RANGE (val, -0x80, 0x7f))
13135 /* DUP with no shift. */
13136 if (info)
13137 *info = simd_immediate_info (mode, val);
13138 return true;
13140 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13142 /* DUP with LSL #8. */
13143 if (info)
13144 *info = simd_immediate_info (mode, val);
13145 return true;
13147 if (aarch64_bitmask_imm (val64, mode))
13149 /* DUPM. */
13150 if (info)
13151 *info = simd_immediate_info (mode, val);
13152 return true;
13154 return false;
13157 /* Return true if OP is a valid SIMD immediate for the operation
13158 described by WHICH. If INFO is nonnull, use it to describe valid
13159 immediates. */
13160 bool
13161 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13162 enum simd_immediate_check which)
13164 machine_mode mode = GET_MODE (op);
13165 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13166 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13167 return false;
13169 scalar_mode elt_mode = GET_MODE_INNER (mode);
13170 rtx base, step;
13171 unsigned int n_elts;
13172 if (GET_CODE (op) == CONST_VECTOR
13173 && CONST_VECTOR_DUPLICATE_P (op))
13174 n_elts = CONST_VECTOR_NPATTERNS (op);
13175 else if ((vec_flags & VEC_SVE_DATA)
13176 && const_vec_series_p (op, &base, &step))
13178 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13179 if (!aarch64_sve_index_immediate_p (base)
13180 || !aarch64_sve_index_immediate_p (step))
13181 return false;
13183 if (info)
13184 *info = simd_immediate_info (elt_mode, base, step);
13185 return true;
13187 else if (GET_CODE (op) == CONST_VECTOR
13188 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13189 /* N_ELTS set above. */;
13190 else
13191 return false;
13193 /* Handle PFALSE and PTRUE. */
13194 if (vec_flags & VEC_SVE_PRED)
13195 return (op == CONST0_RTX (mode)
13196 || op == CONSTM1_RTX (mode));
13198 scalar_float_mode elt_float_mode;
13199 if (n_elts == 1
13200 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13202 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13203 if (aarch64_float_const_zero_rtx_p (elt)
13204 || aarch64_float_const_representable_p (elt))
13206 if (info)
13207 *info = simd_immediate_info (elt_float_mode, elt);
13208 return true;
13212 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13213 if (elt_size > 8)
13214 return false;
13216 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13218 /* Expand the vector constant out into a byte vector, with the least
13219 significant byte of the register first. */
13220 auto_vec<unsigned char, 16> bytes;
13221 bytes.reserve (n_elts * elt_size);
13222 for (unsigned int i = 0; i < n_elts; i++)
13224 /* The vector is provided in gcc endian-neutral fashion.
13225 For aarch64_be Advanced SIMD, it must be laid out in the vector
13226 register in reverse order. */
13227 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13228 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13230 if (elt_mode != elt_int_mode)
13231 elt = gen_lowpart (elt_int_mode, elt);
13233 if (!CONST_INT_P (elt))
13234 return false;
13236 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13237 for (unsigned int byte = 0; byte < elt_size; byte++)
13239 bytes.quick_push (elt_val & 0xff);
13240 elt_val >>= BITS_PER_UNIT;
13244 /* The immediate must repeat every eight bytes. */
13245 unsigned int nbytes = bytes.length ();
13246 for (unsigned i = 8; i < nbytes; ++i)
13247 if (bytes[i] != bytes[i - 8])
13248 return false;
13250 /* Get the repeating 8-byte value as an integer. No endian correction
13251 is needed here because bytes is already in lsb-first order. */
13252 unsigned HOST_WIDE_INT val64 = 0;
13253 for (unsigned int i = 0; i < 8; i++)
13254 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13255 << (i * BITS_PER_UNIT));
13257 if (vec_flags & VEC_SVE_DATA)
13258 return aarch64_sve_valid_immediate (val64, info);
13259 else
13260 return aarch64_advsimd_valid_immediate (val64, info, which);
13263 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13264 has a step in the range of an SVE INDEX immediate. Return the step if so,
13265 otherwise return null. */
13267 aarch64_check_zero_based_sve_index_immediate (rtx x)
13269 rtx base, step;
13270 if (const_vec_series_p (x, &base, &step)
13271 && base == const0_rtx
13272 && aarch64_sve_index_immediate_p (step))
13273 return step;
13274 return NULL_RTX;
13277 /* Check if immediate shift constants are within range. */
13278 bool
13279 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13281 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13282 if (left)
13283 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13284 else
13285 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13288 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13289 operation of width WIDTH at bit position POS. */
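/* For example, WIDTH == 8 and POS == 16 give the mask 0x00ff0000. */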
13292 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13294 gcc_assert (CONST_INT_P (width));
13295 gcc_assert (CONST_INT_P (pos));
13297 unsigned HOST_WIDE_INT mask
13298 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13299 return GEN_INT (mask << UINTVAL (pos));
13302 bool
13303 aarch64_mov_operand_p (rtx x, machine_mode mode)
13305 if (GET_CODE (x) == HIGH
13306 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13307 return true;
13309 if (CONST_INT_P (x))
13310 return true;
13312 if (VECTOR_MODE_P (GET_MODE (x)))
13313 return aarch64_simd_valid_immediate (x, NULL);
13315 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13316 return true;
13318 if (aarch64_sve_cnt_immediate_p (x))
13319 return true;
13321 return aarch64_classify_symbolic_expression (x)
13322 == SYMBOL_TINY_ABSOLUTE;
13325 /* Return a const_int vector of VAL. */
13327 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13329 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13330 return gen_const_vec_duplicate (mode, c);
13333 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13335 bool
13336 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13338 machine_mode vmode;
13340 vmode = aarch64_simd_container_mode (mode, 64);
13341 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13342 return aarch64_simd_valid_immediate (op_v, NULL);
13345 /* Construct and return a PARALLEL RTX vector with elements numbering the
13346 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13347 the vector - from the perspective of the architecture. This does not
13348 line up with GCC's perspective on lane numbers, so we end up with
13349 different masks depending on our target endian-ness. The diagram
13350 below may help. We must draw the distinction when building masks
13351 which select one half of the vector. An instruction selecting
13352 architectural low-lanes for a big-endian target, must be described using
13353 a mask selecting GCC high-lanes.
13355 Big-Endian Little-Endian
13357 GCC 0 1 2 3 3 2 1 0
13358 | x | x | x | x | | x | x | x | x |
13359 Architecture 3 2 1 0 3 2 1 0
13361 Low Mask: { 2, 3 } { 0, 1 }
13362 High Mask: { 0, 1 } { 2, 3 }
13364 MODE is the mode of the vector and NUNITS is the number of units in it. */
13367 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13369 rtvec v = rtvec_alloc (nunits / 2);
13370 int high_base = nunits / 2;
13371 int low_base = 0;
13372 int base;
13373 rtx t1;
13374 int i;
13376 if (BYTES_BIG_ENDIAN)
13377 base = high ? low_base : high_base;
13378 else
13379 base = high ? high_base : low_base;
13381 for (i = 0; i < nunits / 2; i++)
13382 RTVEC_ELT (v, i) = GEN_INT (base + i);
13384 t1 = gen_rtx_PARALLEL (mode, v);
13385 return t1;
13388 /* Check OP for validity as a PARALLEL RTX vector with elements
13389 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13390 from the perspective of the architecture. See the diagram above
13391 aarch64_simd_vect_par_cnst_half for more details. */
13393 bool
13394 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13395 bool high)
13397 int nelts;
13398 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13399 return false;
13401 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13402 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13403 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13404 int i = 0;
13406 if (count_op != count_ideal)
13407 return false;
13409 for (i = 0; i < count_ideal; i++)
13411 rtx elt_op = XVECEXP (op, 0, i);
13412 rtx elt_ideal = XVECEXP (ideal, 0, i);
13414 if (!CONST_INT_P (elt_op)
13415 || INTVAL (elt_ideal) != INTVAL (elt_op))
13416 return false;
13418 return true;
13421 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13422 HIGH (exclusive). */
13423 void
13424 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13425 const_tree exp)
13427 HOST_WIDE_INT lane;
13428 gcc_assert (CONST_INT_P (operand));
13429 lane = INTVAL (operand);
13431 if (lane < low || lane >= high)
13433 if (exp)
13434 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13435 else
13436 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13440 /* Perform endian correction on lane number N, which indexes a vector
13441 of mode MODE, and return the result as an SImode rtx. */
13444 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13446 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13449 /* Return TRUE if OP is a valid vector addressing mode. */
13451 bool
13452 aarch64_simd_mem_operand_p (rtx op)
13454 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13455 || REG_P (XEXP (op, 0)));
13458 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
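/* For example, for a DImode element this accepts [Xn, #imm] addresses in
   which the immediate is a multiple of 8 in the range [0, 504], matching
   the LD1RD addressing mode. */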
13460 bool
13461 aarch64_sve_ld1r_operand_p (rtx op)
13463 struct aarch64_address_info addr;
13464 scalar_mode mode;
13466 return (MEM_P (op)
13467 && is_a <scalar_mode> (GET_MODE (op), &mode)
13468 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13469 && addr.type == ADDRESS_REG_IMM
13470 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13473 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13474 The conditions for STR are the same. */
13475 bool
13476 aarch64_sve_ldr_operand_p (rtx op)
13478 struct aarch64_address_info addr;
13480 return (MEM_P (op)
13481 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13482 false, ADDR_QUERY_ANY)
13483 && addr.type == ADDRESS_REG_IMM);
13486 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13487 We need to be able to access the individual pieces, so the range
13488 is different from LD[234] and ST[234]. */
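/* For example, for a three-vector tuple both the offset of the first
   vector and the offset of the last vector (first + 2 vectors) must be
   in the range [-8, 7] vectors, so the tuple must start between -8 and
   5 vectors from the base register. */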
13489 bool
13490 aarch64_sve_struct_memory_operand_p (rtx op)
13492 if (!MEM_P (op))
13493 return false;
13495 machine_mode mode = GET_MODE (op);
13496 struct aarch64_address_info addr;
13497 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13498 ADDR_QUERY_ANY)
13499 || addr.type != ADDRESS_REG_IMM)
13500 return false;
13502 poly_int64 first = addr.const_offset;
13503 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13504 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13505 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13508 /* Emit a register copy from operand to operand, taking care not to
13509 early-clobber source registers in the process.
13511 COUNT is the number of components into which the copy needs to be
13512 decomposed. */
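/* For example, copying a two-register value from {V1, V2} to {V2, V3}
   overlaps, and because the destination starts above the source the
   copy is done from the top down: V3 is written from V2 before V2 is
   overwritten with V1. */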
13513 void
13514 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13515 unsigned int count)
13517 unsigned int i;
13518 int rdest = REGNO (operands[0]);
13519 int rsrc = REGNO (operands[1]);
13521 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13522 || rdest < rsrc)
13523 for (i = 0; i < count; i++)
13524 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13525 gen_rtx_REG (mode, rsrc + i));
13526 else
13527 for (i = 0; i < count; i++)
13528 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13529 gen_rtx_REG (mode, rsrc + count - i - 1));
13532 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13533 one of VSTRUCT modes: OI, CI, or XI. */
13535 aarch64_simd_attr_length_rglist (machine_mode mode)
13537 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13538 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13541 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13542 alignment of a vector to 128 bits. SVE predicates have an alignment of
13543 16 bits. */
13544 static HOST_WIDE_INT
13545 aarch64_simd_vector_alignment (const_tree type)
13547 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13548 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13549 be set for non-predicate vectors of booleans. Modes are the most
13550 direct way we have of identifying real SVE predicate types. */
13551 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13552 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13553 return MIN (align, 128);
13556 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13557 static HOST_WIDE_INT
13558 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13560 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13562 /* If the length of the vector is fixed, try to align to that length,
13563 otherwise don't try to align at all. */
13564 HOST_WIDE_INT result;
13565 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13566 result = TYPE_ALIGN (TREE_TYPE (type));
13567 return result;
13569 return TYPE_ALIGN (type);
13572 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13573 static bool
13574 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13576 if (is_packed)
13577 return false;
13579 /* For fixed-length vectors, check that the vectorizer will aim for
13580 full-vector alignment. This isn't true for generic GCC vectors
13581 that are wider than the ABI maximum of 128 bits. */
13582 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13583 && (wi::to_widest (TYPE_SIZE (type))
13584 != aarch64_vectorize_preferred_vector_alignment (type)))
13585 return false;
13587 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13588 return true;
13591 /* Return true if the vector misalignment factor is supported by the
13592 target. */
13593 static bool
13594 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13595 const_tree type, int misalignment,
13596 bool is_packed)
13598 if (TARGET_SIMD && STRICT_ALIGNMENT)
13600 /* Return false if the movmisalign pattern is not supported for this mode. */
13601 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13602 return false;
13604 /* Misalignment factor is unknown at compile time. */
13605 if (misalignment == -1)
13606 return false;
13608 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13609 is_packed);
13612 /* If VALS is a vector constant that can be loaded into a register
13613 using DUP, generate instructions to do so and return an RTX to
13614 assign to the register. Otherwise return NULL_RTX. */
13615 static rtx
13616 aarch64_simd_dup_constant (rtx vals)
13618 machine_mode mode = GET_MODE (vals);
13619 machine_mode inner_mode = GET_MODE_INNER (mode);
13620 rtx x;
13622 if (!const_vec_duplicate_p (vals, &x))
13623 return NULL_RTX;
13625 /* We can load this constant by using DUP and a constant in a
13626 single ARM register. This will be cheaper than a vector
13627 load. */
13628 x = copy_to_mode_reg (inner_mode, x);
13629 return gen_vec_duplicate (mode, x);
13633 /* Generate code to load VALS, which is a PARALLEL containing only
13634 constants (for vec_init) or CONST_VECTOR, efficiently into a
13635 register. Returns an RTX to copy into the register, or NULL_RTX
13636 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13637 static rtx
13638 aarch64_simd_make_constant (rtx vals)
13640 machine_mode mode = GET_MODE (vals);
13641 rtx const_dup;
13642 rtx const_vec = NULL_RTX;
13643 int n_const = 0;
13644 int i;
13646 if (GET_CODE (vals) == CONST_VECTOR)
13647 const_vec = vals;
13648 else if (GET_CODE (vals) == PARALLEL)
13650 /* A CONST_VECTOR must contain only CONST_INTs and
13651 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13652 Only store valid constants in a CONST_VECTOR. */
13653 int n_elts = XVECLEN (vals, 0);
13654 for (i = 0; i < n_elts; ++i)
13656 rtx x = XVECEXP (vals, 0, i);
13657 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13658 n_const++;
13660 if (n_const == n_elts)
13661 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13663 else
13664 gcc_unreachable ();
13666 if (const_vec != NULL_RTX
13667 && aarch64_simd_valid_immediate (const_vec, NULL))
13668 /* Load using MOVI/MVNI. */
13669 return const_vec;
13670 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13671 /* Loaded using DUP. */
13672 return const_dup;
13673 else if (const_vec != NULL_RTX)
13674 /* Load from constant pool. We cannot take advantage of single-cycle
13675 LD1 because we need a PC-relative addressing mode. */
13676 return const_vec;
13677 else
13678 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13679 We cannot construct an initializer. */
13680 return NULL_RTX;
13683 /* Expand a vector initialisation sequence, such that TARGET is
13684 initialised to contain VALS. */
13686 void
13687 aarch64_expand_vector_init (rtx target, rtx vals)
13689 machine_mode mode = GET_MODE (target);
13690 scalar_mode inner_mode = GET_MODE_INNER (mode);
13691 /* The number of vector elements. */
13692 int n_elts = XVECLEN (vals, 0);
13693 /* The number of vector elements which are not constant. */
13694 int n_var = 0;
13695 rtx any_const = NULL_RTX;
13696 /* The first element of vals. */
13697 rtx v0 = XVECEXP (vals, 0, 0);
13698 bool all_same = true;
13700 /* Count the number of variable elements to initialise. */
13701 for (int i = 0; i < n_elts; ++i)
13703 rtx x = XVECEXP (vals, 0, i);
13704 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13705 ++n_var;
13706 else
13707 any_const = x;
13709 all_same &= rtx_equal_p (x, v0);
13712 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13713 how best to handle this. */
13714 if (n_var == 0)
13716 rtx constant = aarch64_simd_make_constant (vals);
13717 if (constant != NULL_RTX)
13719 emit_move_insn (target, constant);
13720 return;
13724 /* Splat a single non-constant element if we can. */
13725 if (all_same)
13727 rtx x = copy_to_mode_reg (inner_mode, v0);
13728 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13729 return;
13732 enum insn_code icode = optab_handler (vec_set_optab, mode);
13733 gcc_assert (icode != CODE_FOR_nothing);
13735 /* If there are only variable elements, try to optimize
13736 the insertion using dup for the most common element
13737 followed by insertions. */
13739 /* The algorithm will fill matches[*][0] with the earliest matching element,
13740 and matches[X][1] with the count of duplicate elements (if X is the
13741 earliest element which has duplicates). */
13743 if (n_var == n_elts && n_elts <= 16)
13745 int matches[16][2] = {0};
13746 for (int i = 0; i < n_elts; i++)
13748 for (int j = 0; j <= i; j++)
13750 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13752 matches[i][0] = j;
13753 matches[j][1]++;
13754 break;
13758 int maxelement = 0;
13759 int maxv = 0;
13760 for (int i = 0; i < n_elts; i++)
13761 if (matches[i][1] > maxv)
13763 maxelement = i;
13764 maxv = matches[i][1];
13767 /* Create a duplicate of the most common element. */
13768 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13769 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13771 /* Insert the rest. */
13772 for (int i = 0; i < n_elts; i++)
13774 rtx x = XVECEXP (vals, 0, i);
13775 if (matches[i][0] == maxelement)
13776 continue;
13777 x = copy_to_mode_reg (inner_mode, x);
13778 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13780 return;
13783 /* Initialise a vector which is part-variable. We want to first try
13784 to build those lanes which are constant in the most efficient way we
13785 can. */
13786 if (n_var != n_elts)
13788 rtx copy = copy_rtx (vals);
13790 /* Load constant part of vector. We really don't care what goes into the
13791 parts we will overwrite, but we're more likely to be able to load the
13792 constant efficiently if it has fewer, larger, repeating parts
13793 (see aarch64_simd_valid_immediate). */
13794 for (int i = 0; i < n_elts; i++)
13796 rtx x = XVECEXP (vals, 0, i);
13797 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13798 continue;
13799 rtx subst = any_const;
13800 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13802 /* Look in the copied vector, as more elements are const. */
13803 rtx test = XVECEXP (copy, 0, i ^ bit);
13804 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13806 subst = test;
13807 break;
13810 XVECEXP (copy, 0, i) = subst;
13812 aarch64_expand_vector_init (target, copy);
13815 /* Insert the variable lanes directly. */
13816 for (int i = 0; i < n_elts; i++)
13818 rtx x = XVECEXP (vals, 0, i);
13819 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13820 continue;
13821 x = copy_to_mode_reg (inner_mode, x);
13822 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
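/* Return a mask of the bits that are significant in a shift count for MODE,
   or zero when shift counts are not truncated (SHIFT_COUNT_TRUNCATED is false
   or MODE is a vector data mode); presumably this backs the
   shift-truncation-mask target hook.  */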
13826 static unsigned HOST_WIDE_INT
13827 aarch64_shift_truncation_mask (machine_mode mode)
13829 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13830 return 0;
13831 return GET_MODE_UNIT_BITSIZE (mode) - 1;
13834 /* Select a format to encode pointers in exception handling data. */
13836 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13838 int type;
13839 switch (aarch64_cmodel)
13841 case AARCH64_CMODEL_TINY:
13842 case AARCH64_CMODEL_TINY_PIC:
13843 case AARCH64_CMODEL_SMALL:
13844 case AARCH64_CMODEL_SMALL_PIC:
13845 case AARCH64_CMODEL_SMALL_SPIC:
13846 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13847 for everything. */
13848 type = DW_EH_PE_sdata4;
13849 break;
13850 default:
13851 /* No assumptions here. 8-byte relocs required. */
13852 type = DW_EH_PE_sdata8;
13853 break;
13855 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13858 /* The last .arch and .tune assembly strings that we printed. */
13859 static std::string aarch64_last_printed_arch_string;
13860 static std::string aarch64_last_printed_tune_string;
13862 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
13863 by the function fndecl. */
13865 void
13866 aarch64_declare_function_name (FILE *stream, const char* name,
13867 tree fndecl)
13869 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13871 struct cl_target_option *targ_options;
13872 if (target_parts)
13873 targ_options = TREE_TARGET_OPTION (target_parts);
13874 else
13875 targ_options = TREE_TARGET_OPTION (target_option_current_node);
13876 gcc_assert (targ_options);
13878 const struct processor *this_arch
13879 = aarch64_get_arch (targ_options->x_explicit_arch);
13881 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
13882 std::string extension
13883 = aarch64_get_extension_string_for_isa_flags (isa_flags,
13884 this_arch->flags);
13885 /* Only update the assembler .arch string if it is distinct from the last
13886 such string we printed. */
13887 std::string to_print = this_arch->name + extension;
13888 if (to_print != aarch64_last_printed_arch_string)
13890 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
13891 aarch64_last_printed_arch_string = to_print;
13894 /* Print the cpu name we're tuning for in the comments, might be
13895 useful to readers of the generated asm. Do it only when it changes
13896 from function to function and verbose assembly is requested. */
13897 const struct processor *this_tune
13898 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
13900 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
13902 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
13903 this_tune->name);
13904 aarch64_last_printed_tune_string = this_tune->name;
13907 /* Don't forget the type directive for ELF. */
13908 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
13909 ASM_OUTPUT_LABEL (stream, name);
13912 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
13914 static void
13915 aarch64_start_file (void)
13917 struct cl_target_option *default_options
13918 = TREE_TARGET_OPTION (target_option_default_node);
13920 const struct processor *default_arch
13921 = aarch64_get_arch (default_options->x_explicit_arch);
13922 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
13923 std::string extension
13924 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
13925 default_arch->flags);
13927 aarch64_last_printed_arch_string = default_arch->name + extension;
13928 aarch64_last_printed_tune_string = "";
13929 asm_fprintf (asm_out_file, "\t.arch %s\n",
13930 aarch64_last_printed_arch_string.c_str ());
13932 default_file_start ();
13935 /* Emit load exclusive. */
13937 static void
13938 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
13939 rtx mem, rtx model_rtx)
13941 rtx (*gen) (rtx, rtx, rtx);
13943 switch (mode)
13945 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
13946 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
13947 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
13948 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
13949 default:
13950 gcc_unreachable ();
13953 emit_insn (gen (rval, mem, model_rtx));
13956 /* Emit store exclusive. */
13958 static void
13959 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
13960 rtx rval, rtx mem, rtx model_rtx)
13962 rtx (*gen) (rtx, rtx, rtx, rtx);
13964 switch (mode)
13966 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
13967 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
13968 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
13969 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
13970 default:
13971 gcc_unreachable ();
13974 emit_insn (gen (bval, rval, mem, model_rtx));
13977 /* Mark the previous jump instruction as unlikely. */
13979 static void
13980 aarch64_emit_unlikely_jump (rtx insn)
13982 rtx_insn *jump = emit_jump_insn (insn);
13983 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
13986 /* Expand a compare and swap pattern. */
13988 void
13989 aarch64_expand_compare_and_swap (rtx operands[])
13991 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
13992 machine_mode mode, cmp_mode;
13993 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
13994 int idx;
13995 gen_cas_fn gen;
13996 const gen_cas_fn split_cas[] =
13998 gen_aarch64_compare_and_swapqi,
13999 gen_aarch64_compare_and_swaphi,
14000 gen_aarch64_compare_and_swapsi,
14001 gen_aarch64_compare_and_swapdi
14003 const gen_cas_fn atomic_cas[] =
14005 gen_aarch64_compare_and_swapqi_lse,
14006 gen_aarch64_compare_and_swaphi_lse,
14007 gen_aarch64_compare_and_swapsi_lse,
14008 gen_aarch64_compare_and_swapdi_lse
14011 bval = operands[0];
14012 rval = operands[1];
14013 mem = operands[2];
14014 oldval = operands[3];
14015 newval = operands[4];
14016 is_weak = operands[5];
14017 mod_s = operands[6];
14018 mod_f = operands[7];
14019 mode = GET_MODE (mem);
14020 cmp_mode = mode;
14022 /* Normally the succ memory model must be stronger than fail, but in the
14023 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14024 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14026 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14027 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14028 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14030 switch (mode)
14032 case E_QImode:
14033 case E_HImode:
14034 /* For short modes, we're going to perform the comparison in SImode,
14035 so do the zero-extension now. */
14036 cmp_mode = SImode;
14037 rval = gen_reg_rtx (SImode);
14038 oldval = convert_modes (SImode, mode, oldval, true);
14039 /* Fall through. */
14041 case E_SImode:
14042 case E_DImode:
14043 /* Force the value into a register if needed. */
14044 if (!aarch64_plus_operand (oldval, mode))
14045 oldval = force_reg (cmp_mode, oldval);
14046 break;
14048 default:
14049 gcc_unreachable ();
14052 switch (mode)
14054 case E_QImode: idx = 0; break;
14055 case E_HImode: idx = 1; break;
14056 case E_SImode: idx = 2; break;
14057 case E_DImode: idx = 3; break;
14058 default:
14059 gcc_unreachable ();
14061 if (TARGET_LSE)
14062 gen = atomic_cas[idx];
14063 else
14064 gen = split_cas[idx];
14066 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14068 if (mode == QImode || mode == HImode)
14069 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14071 x = gen_rtx_REG (CCmode, CC_REGNUM);
14072 x = gen_rtx_EQ (SImode, x, const0_rtx);
14073 emit_insn (gen_rtx_SET (bval, x));
14076 /* Test whether the target supports using an atomic load-operate instruction
14077 for operation CODE. Returns FALSE if the operation isn't supported by the
14078 architecture. */
14082 bool
14083 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14085 if (!TARGET_LSE)
14086 return false;
14088 switch (code)
14090 case SET:
14091 case AND:
14092 case IOR:
14093 case XOR:
14094 case MINUS:
14095 case PLUS:
14096 return true;
14097 default:
14098 return false;
14102 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14103 sequence implementing an atomic operation. */
14105 static void
14106 aarch64_emit_post_barrier (enum memmodel model)
14108 const enum memmodel base_model = memmodel_base (model);
14110 if (is_mm_sync (model)
14111 && (base_model == MEMMODEL_ACQUIRE
14112 || base_model == MEMMODEL_ACQ_REL
14113 || base_model == MEMMODEL_SEQ_CST))
14115 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14119 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14120 for the data in memory. EXPECTED is the value expected to be in memory.
14121 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14122 is the memory ordering to use. */
14124 void
14125 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14126 rtx expected, rtx desired,
14127 rtx model)
14129 rtx (*gen) (rtx, rtx, rtx, rtx);
14130 machine_mode mode;
14132 mode = GET_MODE (mem);
14134 switch (mode)
14136 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14137 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14138 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14139 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14140 default:
14141 gcc_unreachable ();
14144 /* Move the expected value into the CAS destination register. */
14145 emit_insn (gen_rtx_SET (rval, expected));
14147 /* Emit the CAS. */
14148 emit_insn (gen (rval, mem, desired, model));
14150 /* Compare the expected value with the value loaded by the CAS, to establish
14151 whether the swap was made. */
14152 aarch64_gen_compare_reg (EQ, rval, expected);
14155 /* Split a compare and swap pattern. */
14157 void
14158 aarch64_split_compare_and_swap (rtx operands[])
14160 rtx rval, mem, oldval, newval, scratch;
14161 machine_mode mode;
14162 bool is_weak;
14163 rtx_code_label *label1, *label2;
14164 rtx x, cond;
14165 enum memmodel model;
14166 rtx model_rtx;
14168 rval = operands[0];
14169 mem = operands[1];
14170 oldval = operands[2];
14171 newval = operands[3];
14172 is_weak = (operands[4] != const0_rtx);
14173 model_rtx = operands[5];
14174 scratch = operands[7];
14175 mode = GET_MODE (mem);
14176 model = memmodel_from_int (INTVAL (model_rtx));
14178 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14179 loop:
14180 .label1:
14181 LD[A]XR rval, [mem]
14182 CBNZ rval, .label2
14183 ST[L]XR scratch, newval, [mem]
14184 CBNZ scratch, .label1
14185 .label2:
14186 CMP rval, 0. */
14187 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14189 label1 = NULL;
14190 if (!is_weak)
14192 label1 = gen_label_rtx ();
14193 emit_label (label1);
14195 label2 = gen_label_rtx ();
14197 /* The initial load can be relaxed for a __sync operation since a final
14198 barrier will be emitted to stop code hoisting. */
14199 if (is_mm_sync (model))
14200 aarch64_emit_load_exclusive (mode, rval, mem,
14201 GEN_INT (MEMMODEL_RELAXED));
14202 else
14203 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14205 if (strong_zero_p)
14207 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14208 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14209 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14210 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14212 else
14214 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14215 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14216 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14217 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14218 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14221 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14223 if (!is_weak)
14225 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14226 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14227 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14228 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14230 else
14232 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14233 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14234 emit_insn (gen_rtx_SET (cond, x));
14237 emit_label (label2);
14238 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14239 to set the condition flags. If this is not used it will be removed by
14240 later passes. */
14241 if (strong_zero_p)
14243 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14244 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14245 emit_insn (gen_rtx_SET (cond, x));
14247 /* Emit any final barrier needed for a __sync operation. */
14248 if (is_mm_sync (model))
14249 aarch64_emit_post_barrier (model);
14252 /* Emit a BIC instruction. */
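/* (Informally, given the operand order used below, this computes
   DST = S1 & ~(S2 >> SHIFT); the atomic AND expansion further down passes
   SHIFT == 0 to get a plain BIC.)  */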
14254 static void
14255 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14257 rtx shift_rtx = GEN_INT (shift);
14258 rtx (*gen) (rtx, rtx, rtx, rtx);
14260 switch (mode)
14262 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14263 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14264 default:
14265 gcc_unreachable ();
14268 emit_insn (gen (dst, s2, shift_rtx, s1));
14271 /* Emit an atomic swap. */
14273 static void
14274 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14275 rtx mem, rtx model)
14277 rtx (*gen) (rtx, rtx, rtx, rtx);
14279 switch (mode)
14281 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14282 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14283 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14284 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14285 default:
14286 gcc_unreachable ();
14289 emit_insn (gen (dst, mem, value, model));
14292 /* Operations supported by aarch64_emit_atomic_load_op. */
14294 enum aarch64_atomic_load_op_code
14296 AARCH64_LDOP_PLUS, /* A + B */
14297 AARCH64_LDOP_XOR, /* A ^ B */
14298 AARCH64_LDOP_OR, /* A | B */
14299 AARCH64_LDOP_BIC /* A & ~B */
14302 /* Emit an atomic load-operate. */
14304 static void
14305 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14306 machine_mode mode, rtx dst, rtx src,
14307 rtx mem, rtx model)
14309 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14310 const aarch64_atomic_load_op_fn plus[] =
14312 gen_aarch64_atomic_loadaddqi,
14313 gen_aarch64_atomic_loadaddhi,
14314 gen_aarch64_atomic_loadaddsi,
14315 gen_aarch64_atomic_loadadddi
14317 const aarch64_atomic_load_op_fn eor[] =
14319 gen_aarch64_atomic_loadeorqi,
14320 gen_aarch64_atomic_loadeorhi,
14321 gen_aarch64_atomic_loadeorsi,
14322 gen_aarch64_atomic_loadeordi
14324 const aarch64_atomic_load_op_fn ior[] =
14326 gen_aarch64_atomic_loadsetqi,
14327 gen_aarch64_atomic_loadsethi,
14328 gen_aarch64_atomic_loadsetsi,
14329 gen_aarch64_atomic_loadsetdi
14331 const aarch64_atomic_load_op_fn bic[] =
14333 gen_aarch64_atomic_loadclrqi,
14334 gen_aarch64_atomic_loadclrhi,
14335 gen_aarch64_atomic_loadclrsi,
14336 gen_aarch64_atomic_loadclrdi
14338 aarch64_atomic_load_op_fn gen;
14339 int idx = 0;
14341 switch (mode)
14343 case E_QImode: idx = 0; break;
14344 case E_HImode: idx = 1; break;
14345 case E_SImode: idx = 2; break;
14346 case E_DImode: idx = 3; break;
14347 default:
14348 gcc_unreachable ();
14351 switch (code)
14353 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14354 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14355 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14356 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14357 default:
14358 gcc_unreachable ();
14361 emit_insn (gen (dst, mem, src, model));
14364 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14365 location to store the data read from memory. OUT_RESULT is the location to
14366 store the result of the operation. MEM is the memory location to read and
14367 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14368 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14369 be NULL. */
14371 void
14372 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14373 rtx mem, rtx value, rtx model_rtx)
14375 machine_mode mode = GET_MODE (mem);
14376 machine_mode wmode = (mode == DImode ? DImode : SImode);
14377 const bool short_mode = (mode < SImode);
14378 aarch64_atomic_load_op_code ldop_code;
14379 rtx src;
14380 rtx x;
14382 if (out_data)
14383 out_data = gen_lowpart (mode, out_data);
14385 if (out_result)
14386 out_result = gen_lowpart (mode, out_result);
14388 /* Make sure the value is in a register, putting it into a destination
14389 register if it needs to be manipulated. */
14390 if (!register_operand (value, mode)
14391 || code == AND || code == MINUS)
14393 src = out_result ? out_result : out_data;
14394 emit_move_insn (src, gen_lowpart (mode, value));
14396 else
14397 src = value;
14398 gcc_assert (register_operand (src, mode));
14400 /* Preprocess the data for the operation as necessary. If the operation is
14401 a SET then emit a swap instruction and finish. */
14402 switch (code)
14404 case SET:
14405 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14406 return;
14408 case MINUS:
14409 /* Negate the value and treat it as a PLUS. */
14411 rtx neg_src;
14413 /* Resize the value if necessary. */
14414 if (short_mode)
14415 src = gen_lowpart (wmode, src);
14417 neg_src = gen_rtx_NEG (wmode, src);
14418 emit_insn (gen_rtx_SET (src, neg_src));
14420 if (short_mode)
14421 src = gen_lowpart (mode, src);
14423 /* Fall-through. */
14424 case PLUS:
14425 ldop_code = AARCH64_LDOP_PLUS;
14426 break;
14428 case IOR:
14429 ldop_code = AARCH64_LDOP_OR;
14430 break;
14432 case XOR:
14433 ldop_code = AARCH64_LDOP_XOR;
14434 break;
14436 case AND:
14438 rtx not_src;
14440 /* Resize the value if necessary. */
14441 if (short_mode)
14442 src = gen_lowpart (wmode, src);
14444 not_src = gen_rtx_NOT (wmode, src);
14445 emit_insn (gen_rtx_SET (src, not_src));
14447 if (short_mode)
14448 src = gen_lowpart (mode, src);
14450 ldop_code = AARCH64_LDOP_BIC;
14451 break;
14453 default:
14454 /* The operation can't be done with atomic instructions. */
14455 gcc_unreachable ();
14458 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14460 /* If necessary, calculate the data in memory after the update by redoing the
14461 operation from values in registers. */
14462 if (!out_result)
14463 return;
14465 if (short_mode)
14467 src = gen_lowpart (wmode, src);
14468 out_data = gen_lowpart (wmode, out_data);
14469 out_result = gen_lowpart (wmode, out_result);
14472 x = NULL_RTX;
14474 switch (code)
14476 case MINUS:
14477 case PLUS:
14478 x = gen_rtx_PLUS (wmode, out_data, src);
14479 break;
14480 case IOR:
14481 x = gen_rtx_IOR (wmode, out_data, src);
14482 break;
14483 case XOR:
14484 x = gen_rtx_XOR (wmode, out_data, src);
14485 break;
14486 case AND:
14487 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14488 return;
14489 default:
14490 gcc_unreachable ();
14493 emit_set_insn (out_result, x);
14495 return;
14498 /* Split an atomic operation. */
14500 void
14501 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14502 rtx value, rtx model_rtx, rtx cond)
14504 machine_mode mode = GET_MODE (mem);
14505 machine_mode wmode = (mode == DImode ? DImode : SImode);
14506 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14507 const bool is_sync = is_mm_sync (model);
14508 rtx_code_label *label;
14509 rtx x;
14511 /* Split the atomic operation into a sequence. */
14512 label = gen_label_rtx ();
14513 emit_label (label);
14515 if (new_out)
14516 new_out = gen_lowpart (wmode, new_out);
14517 if (old_out)
14518 old_out = gen_lowpart (wmode, old_out);
14519 else
14520 old_out = new_out;
14521 value = simplify_gen_subreg (wmode, value, mode, 0);
14523 /* The initial load can be relaxed for a __sync operation since a final
14524 barrier will be emitted to stop code hoisting. */
14525 if (is_sync)
14526 aarch64_emit_load_exclusive (mode, old_out, mem,
14527 GEN_INT (MEMMODEL_RELAXED));
14528 else
14529 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14531 switch (code)
14533 case SET:
14534 new_out = value;
14535 break;
14537 case NOT:
14538 x = gen_rtx_AND (wmode, old_out, value);
14539 emit_insn (gen_rtx_SET (new_out, x));
14540 x = gen_rtx_NOT (wmode, new_out);
14541 emit_insn (gen_rtx_SET (new_out, x));
14542 break;
14544 case MINUS:
14545 if (CONST_INT_P (value))
14547 value = GEN_INT (-INTVAL (value));
14548 code = PLUS;
14550 /* Fall through. */
14552 default:
14553 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14554 emit_insn (gen_rtx_SET (new_out, x));
14555 break;
14558 aarch64_emit_store_exclusive (mode, cond, mem,
14559 gen_lowpart (mode, new_out), model_rtx);
14561 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14562 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14563 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14564 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14566 /* Emit any final barrier needed for a __sync operation. */
14567 if (is_sync)
14568 aarch64_emit_post_barrier (model);
14571 static void
14572 aarch64_init_libfuncs (void)
14574 /* Half-precision float operations. The compiler handles all operations
14575 with NULL libfuncs by converting to SFmode. */
14577 /* Conversions. */
14578 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14579 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14581 /* Arithmetic. */
14582 set_optab_libfunc (add_optab, HFmode, NULL);
14583 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14584 set_optab_libfunc (smul_optab, HFmode, NULL);
14585 set_optab_libfunc (neg_optab, HFmode, NULL);
14586 set_optab_libfunc (sub_optab, HFmode, NULL);
14588 /* Comparisons. */
14589 set_optab_libfunc (eq_optab, HFmode, NULL);
14590 set_optab_libfunc (ne_optab, HFmode, NULL);
14591 set_optab_libfunc (lt_optab, HFmode, NULL);
14592 set_optab_libfunc (le_optab, HFmode, NULL);
14593 set_optab_libfunc (ge_optab, HFmode, NULL);
14594 set_optab_libfunc (gt_optab, HFmode, NULL);
14595 set_optab_libfunc (unord_optab, HFmode, NULL);
14598 /* Target hook for c_mode_for_suffix. */
14599 static machine_mode
14600 aarch64_c_mode_for_suffix (char suffix)
14602 if (suffix == 'q')
14603 return TFmode;
14605 return VOIDmode;
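/* E.g. a literal written with the 'q' suffix, such as 1.0q, is given TFmode,
   AArch64's 128-bit IEEE quad format.  */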
14608 /* We can only represent floating point constants which will fit in
14609 "quarter-precision" values. These values are characterised by
14610 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14613 (-1)^s * (n/16) * 2^r
14615 Where:
14616 's' is the sign bit.
14617 'n' is an integer in the range 16 <= n <= 31.
14618 'r' is an integer in the range -3 <= r <= 4. */
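/* For example, 0.5 = (16/16) * 2^-1 and 1.25 = (20/16) * 2^0 are
   representable, whereas 33.0 would need n = 33 > 31 (or r > 4) and therefore
   is not.  */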
14620 /* Return true iff X can be represented by a quarter-precision
14621 floating-point immediate operand. Note, we cannot represent 0.0. */
14622 bool
14623 aarch64_float_const_representable_p (rtx x)
14625 /* This represents our current view of how many bits
14626 make up the mantissa. */
14627 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14628 int exponent;
14629 unsigned HOST_WIDE_INT mantissa, mask;
14630 REAL_VALUE_TYPE r, m;
14631 bool fail;
14633 if (!CONST_DOUBLE_P (x))
14634 return false;
14636 /* We don't support HFmode constants yet. */
14637 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14638 return false;
14640 r = *CONST_DOUBLE_REAL_VALUE (x);
14642 /* We cannot represent infinities, NaNs or +/-zero. We won't
14643 know if we have +zero until we analyse the mantissa, but we
14644 can reject the other invalid values. */
14645 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14646 || REAL_VALUE_MINUS_ZERO (r))
14647 return false;
14649 /* Extract exponent. */
14650 r = real_value_abs (&r);
14651 exponent = REAL_EXP (&r);
14653 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14654 highest (sign) bit, with a fixed binary point at bit point_pos.
14655 m1 holds the low part of the mantissa, m2 the high part.
14656 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14657 bits for the mantissa, this can fail (low bits will be lost). */
14658 real_ldexp (&m, &r, point_pos - exponent);
14659 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14661 /* If the low part of the mantissa has bits set we cannot represent
14662 the value. */
14663 if (w.ulow () != 0)
14664 return false;
14665 /* We have rejected the lower HOST_WIDE_INT, so update our
14666 understanding of how many bits lie in the mantissa and
14667 look only at the high HOST_WIDE_INT. */
14668 mantissa = w.elt (1);
14669 point_pos -= HOST_BITS_PER_WIDE_INT;
14671 /* We can only represent values with a mantissa of the form 1.xxxx. */
14672 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14673 if ((mantissa & mask) != 0)
14674 return false;
14676 /* Having filtered unrepresentable values, we may now remove all
14677 but the highest 5 bits. */
14678 mantissa >>= point_pos - 5;
14680 /* We cannot represent the value 0.0, so reject it. This is handled
14681 elsewhere. */
14682 if (mantissa == 0)
14683 return false;
14685 /* Then, as bit 4 is always set, we can mask it off, leaving
14686 the mantissa in the range [0, 15]. */
14687 mantissa &= ~(1 << 4);
14688 gcc_assert (mantissa <= 15);
14690 /* GCC internally does not use IEEE754-like encoding (where normalized
14691 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14692 Our mantissa values are shifted 4 places to the left relative to
14693 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14694 by 5 places to correct for GCC's representation. */
14695 exponent = 5 - exponent;
14697 return (exponent >= 0 && exponent <= 7);
14700 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14701 immediate for CONST_VECTOR, a vector constant of WIDTH bits. WHICH selects whether to
14702 output MOVI/MVNI, ORR or BIC immediate. */
14703 char*
14704 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14705 enum simd_immediate_check which)
14707 bool is_valid;
14708 static char templ[40];
14709 const char *mnemonic;
14710 const char *shift_op;
14711 unsigned int lane_count = 0;
14712 char element_char;
14714 struct simd_immediate_info info;
14716 /* This will return true to show const_vector is legal for use as either
14717 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14718 It will also update INFO to show how the immediate should be generated.
14719 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14720 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14721 gcc_assert (is_valid);
14723 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14724 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14726 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14728 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14729 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14730 move immediate path. */
14731 if (aarch64_float_const_zero_rtx_p (info.value))
14732 info.value = GEN_INT (0);
14733 else
14735 const unsigned int buf_size = 20;
14736 char float_buf[buf_size] = {'\0'};
14737 real_to_decimal_for_mode (float_buf,
14738 CONST_DOUBLE_REAL_VALUE (info.value),
14739 buf_size, buf_size, 1, info.elt_mode);
14741 if (lane_count == 1)
14742 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14743 else
14744 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14745 lane_count, element_char, float_buf);
14746 return templ;
14750 gcc_assert (CONST_INT_P (info.value));
14752 if (which == AARCH64_CHECK_MOV)
14754 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14755 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14756 if (lane_count == 1)
14757 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14758 mnemonic, UINTVAL (info.value));
14759 else if (info.shift)
14760 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14761 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14762 element_char, UINTVAL (info.value), shift_op, info.shift);
14763 else
14764 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14765 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14766 element_char, UINTVAL (info.value));
14768 else
14770 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14771 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14772 if (info.shift)
14773 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14774 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14775 element_char, UINTVAL (info.value), "lsl", info.shift);
14776 else
14777 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14778 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14779 element_char, UINTVAL (info.value));
14781 return templ;
14784 char*
14785 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14788 /* If a floating point number was passed and we desire to use it in an
14789 integer mode do the conversion to integer. */
14790 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14792 unsigned HOST_WIDE_INT ival;
14793 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14794 gcc_unreachable ();
14795 immediate = gen_int_mode (ival, mode);
14798 machine_mode vmode;
14799 /* Use a 64-bit mode for everything except DI/DF mode, where we use
14800 a 128-bit vector mode. */
14801 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14803 vmode = aarch64_simd_container_mode (mode, width);
14804 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14805 return aarch64_output_simd_mov_immediate (v_op, width);
14808 /* Return the output string to use for moving immediate CONST_VECTOR
14809 into an SVE register. */
14811 char *
14812 aarch64_output_sve_mov_immediate (rtx const_vector)
14814 static char templ[40];
14815 struct simd_immediate_info info;
14816 char element_char;
14818 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14819 gcc_assert (is_valid);
14821 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14823 if (info.step)
14825 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14826 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14827 element_char, INTVAL (info.value), INTVAL (info.step));
14828 return templ;
14831 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14833 if (aarch64_float_const_zero_rtx_p (info.value))
14834 info.value = GEN_INT (0);
14835 else
14837 const int buf_size = 20;
14838 char float_buf[buf_size] = {};
14839 real_to_decimal_for_mode (float_buf,
14840 CONST_DOUBLE_REAL_VALUE (info.value),
14841 buf_size, buf_size, 1, info.elt_mode);
14843 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14844 element_char, float_buf);
14845 return templ;
14849 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14850 element_char, INTVAL (info.value));
14851 return templ;
14854 /* Return the asm format for a PTRUE instruction whose destination has
14855 mode MODE. SUFFIX is the element size suffix. */
14857 char *
14858 aarch64_output_ptrue (machine_mode mode, char suffix)
14860 unsigned int nunits;
14861 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14862 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
14863 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
14864 else
14865 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
14866 return buf;
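/* E.g. this yields "ptrue\t%0.s, vl4" when the mode is known to have exactly
   four elements, and "ptrue\t%0.s, all" for a variable-length mode.  */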
14869 /* Split operands into moves from op[1] + op[2] into op[0]. */
14871 void
14872 aarch64_split_combinev16qi (rtx operands[3])
14874 unsigned int dest = REGNO (operands[0]);
14875 unsigned int src1 = REGNO (operands[1]);
14876 unsigned int src2 = REGNO (operands[2]);
14877 machine_mode halfmode = GET_MODE (operands[1]);
14878 unsigned int halfregs = REG_NREGS (operands[1]);
14879 rtx destlo, desthi;
14881 gcc_assert (halfmode == V16QImode);
14883 if (src1 == dest && src2 == dest + halfregs)
14885 /* No-op move. Can't split to nothing; emit something. */
14886 emit_note (NOTE_INSN_DELETED);
14887 return;
14890 /* Preserve register attributes for variable tracking. */
14891 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
14892 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
14893 GET_MODE_SIZE (halfmode));
14895 /* Special case of reversed high/low parts. */
14896 if (reg_overlap_mentioned_p (operands[2], destlo)
14897 && reg_overlap_mentioned_p (operands[1], desthi))
14899 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14900 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
14901 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14903 else if (!reg_overlap_mentioned_p (operands[2], destlo))
14905 /* Try to avoid unnecessary moves if part of the result
14906 is in the right place already. */
14907 if (src1 != dest)
14908 emit_move_insn (destlo, operands[1]);
14909 if (src2 != dest + halfregs)
14910 emit_move_insn (desthi, operands[2]);
14912 else
14914 if (src2 != dest + halfregs)
14915 emit_move_insn (desthi, operands[2]);
14916 if (src1 != dest)
14917 emit_move_insn (destlo, operands[1]);
14921 /* vec_perm support. */
14923 struct expand_vec_perm_d
14925 rtx target, op0, op1;
14926 vec_perm_indices perm;
14927 machine_mode vmode;
14928 unsigned int vec_flags;
14929 bool one_vector_p;
14930 bool testing_p;
14933 /* Generate a variable permutation. */
14935 static void
14936 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
14938 machine_mode vmode = GET_MODE (target);
14939 bool one_vector_p = rtx_equal_p (op0, op1);
14941 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
14942 gcc_checking_assert (GET_MODE (op0) == vmode);
14943 gcc_checking_assert (GET_MODE (op1) == vmode);
14944 gcc_checking_assert (GET_MODE (sel) == vmode);
14945 gcc_checking_assert (TARGET_SIMD);
14947 if (one_vector_p)
14949 if (vmode == V8QImode)
14951 /* Expand the argument to a V16QI mode by duplicating it. */
14952 rtx pair = gen_reg_rtx (V16QImode);
14953 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
14954 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14956 else
14958 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
14961 else
14963 rtx pair;
14965 if (vmode == V8QImode)
14967 pair = gen_reg_rtx (V16QImode);
14968 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
14969 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14971 else
14973 pair = gen_reg_rtx (OImode);
14974 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
14975 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
14980 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
14981 NELT is the number of elements in the vector. */
14983 void
14984 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
14985 unsigned int nelt)
14987 machine_mode vmode = GET_MODE (target);
14988 bool one_vector_p = rtx_equal_p (op0, op1);
14989 rtx mask;
14991 /* The TBL instruction does not use a modulo index, so we must take care
14992 of that ourselves. */
14993 mask = aarch64_simd_gen_const_vector_dup (vmode,
14994 one_vector_p ? nelt - 1 : 2 * nelt - 1);
14995 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
14997 /* For big-endian, we also need to reverse the index within the vector
14998 (but not which vector). */
14999 if (BYTES_BIG_ENDIAN)
15001 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15002 if (!one_vector_p)
15003 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15004 sel = expand_simple_binop (vmode, XOR, sel, mask,
15005 NULL, 0, OPTAB_LIB_WIDEN);
15007 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15010 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15012 static void
15013 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15015 emit_insn (gen_rtx_SET (target,
15016 gen_rtx_UNSPEC (GET_MODE (target),
15017 gen_rtvec (2, op0, op1), code)));
15020 /* Expand an SVE vec_perm with the given operands. */
15022 void
15023 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15025 machine_mode data_mode = GET_MODE (target);
15026 machine_mode sel_mode = GET_MODE (sel);
15027 /* Enforced by the pattern condition. */
15028 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15030 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15031 size of the two value vectors, i.e. the upper bits of the indices
15032 are effectively ignored. SVE TBL instead produces 0 for any
15033 out-of-range indices, so we need to modulo all the vec_perm indices
15034 to ensure they are all in range. */
15035 rtx sel_reg = force_reg (sel_mode, sel);
15037 /* Check if the sel only references the first values vector. */
15038 if (GET_CODE (sel) == CONST_VECTOR
15039 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15041 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15042 return;
15045 /* Check if the two values vectors are the same. */
15046 if (rtx_equal_p (op0, op1))
15048 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15049 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15050 NULL, 0, OPTAB_DIRECT);
15051 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15052 return;
15055 /* Run TBL on each value vector and combine the results. */
15057 rtx res0 = gen_reg_rtx (data_mode);
15058 rtx res1 = gen_reg_rtx (data_mode);
15059 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15060 if (GET_CODE (sel) != CONST_VECTOR
15061 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15063 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15064 2 * nunits - 1);
15065 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15066 NULL, 0, OPTAB_DIRECT);
15068 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15069 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15070 NULL, 0, OPTAB_DIRECT);
15071 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15072 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15073 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15074 else
15075 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15078 /* Recognize patterns suitable for the TRN instructions. */
15079 static bool
15080 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15082 HOST_WIDE_INT odd;
15083 poly_uint64 nelt = d->perm.length ();
15084 rtx out, in0, in1, x;
15085 machine_mode vmode = d->vmode;
15087 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15088 return false;
15090 /* Note that these are little-endian tests.
15091 We correct for big-endian later. */
15092 if (!d->perm[0].is_constant (&odd)
15093 || (odd != 0 && odd != 1)
15094 || !d->perm.series_p (0, 2, odd, 2)
15095 || !d->perm.series_p (1, 2, nelt + odd, 2))
15096 return false;
15098 /* Success! */
15099 if (d->testing_p)
15100 return true;
15102 in0 = d->op0;
15103 in1 = d->op1;
15104 /* We don't need a big-endian lane correction for SVE; see the comment
15105 at the head of aarch64-sve.md for details. */
15106 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15108 x = in0, in0 = in1, in1 = x;
15109 odd = !odd;
15111 out = d->target;
15113 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15114 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15115 return true;
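/* E.g. for V4SI (before any big-endian correction) TRN1 corresponds to the
   permutation { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 }.  */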
15118 /* Recognize patterns suitable for the UZP instructions. */
15119 static bool
15120 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15122 HOST_WIDE_INT odd;
15123 rtx out, in0, in1, x;
15124 machine_mode vmode = d->vmode;
15126 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15127 return false;
15129 /* Note that these are little-endian tests.
15130 We correct for big-endian later. */
15131 if (!d->perm[0].is_constant (&odd)
15132 || (odd != 0 && odd != 1)
15133 || !d->perm.series_p (0, 1, odd, 2))
15134 return false;
15136 /* Success! */
15137 if (d->testing_p)
15138 return true;
15140 in0 = d->op0;
15141 in1 = d->op1;
15142 /* We don't need a big-endian lane correction for SVE; see the comment
15143 at the head of aarch64-sve.md for details. */
15144 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15146 x = in0, in0 = in1, in1 = x;
15147 odd = !odd;
15149 out = d->target;
15151 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15152 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15153 return true;
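/* E.g. for V4SI, UZP1 corresponds to the permutation { 0, 2, 4, 6 } and UZP2
   to { 1, 3, 5, 7 }.  */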
15156 /* Recognize patterns suitable for the ZIP instructions. */
15157 static bool
15158 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15160 unsigned int high;
15161 poly_uint64 nelt = d->perm.length ();
15162 rtx out, in0, in1, x;
15163 machine_mode vmode = d->vmode;
15165 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15166 return false;
15168 /* Note that these are little-endian tests.
15169 We correct for big-endian later. */
15170 poly_uint64 first = d->perm[0];
15171 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15172 || !d->perm.series_p (0, 2, first, 1)
15173 || !d->perm.series_p (1, 2, first + nelt, 1))
15174 return false;
15175 high = maybe_ne (first, 0U);
15177 /* Success! */
15178 if (d->testing_p)
15179 return true;
15181 in0 = d->op0;
15182 in1 = d->op1;
15183 /* We don't need a big-endian lane correction for SVE; see the comment
15184 at the head of aarch64-sve.md for details. */
15185 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15187 x = in0, in0 = in1, in1 = x;
15188 high = !high;
15190 out = d->target;
15192 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15193 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15194 return true;
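/* E.g. for V4SI, ZIP1 corresponds to the permutation { 0, 4, 1, 5 } and ZIP2
   to { 2, 6, 3, 7 }.  */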
15197 /* Recognize patterns for the EXT insn. */
15199 static bool
15200 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15202 HOST_WIDE_INT location;
15203 rtx offset;
15205 /* The first element always refers to the first vector.
15206 Check if the extracted indices are increasing by one. */
15207 if (d->vec_flags == VEC_SVE_PRED
15208 || !d->perm[0].is_constant (&location)
15209 || !d->perm.series_p (0, 1, location, 1))
15210 return false;
15212 /* Success! */
15213 if (d->testing_p)
15214 return true;
15216 /* The case where (location == 0) is a no-op for both big- and little-endian,
15217 and is removed by the mid-end at optimization levels -O1 and higher.
15219 We don't need a big-endian lane correction for SVE; see the comment
15220 at the head of aarch64-sve.md for details. */
15221 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15223 /* After setup, we want the high elements of the first vector (stored
15224 at the LSB end of the register), and the low elements of the second
15225 vector (stored at the MSB end of the register). So swap. */
15226 std::swap (d->op0, d->op1);
15227 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15228 to_constant () is safe since this is restricted to Advanced SIMD
15229 vectors. */
15230 location = d->perm.length ().to_constant () - location;
15233 offset = GEN_INT (location);
15234 emit_set_insn (d->target,
15235 gen_rtx_UNSPEC (d->vmode,
15236 gen_rtvec (3, d->op0, d->op1, offset),
15237 UNSPEC_EXT));
15238 return true;
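/* E.g. for V4SI the permutation { 1, 2, 3, 4 } is matched here as an EXT of
   the two input vectors with LOCATION == 1.  */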
15241 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15242 within each 64-bit, 32-bit or 16-bit granule. */
15244 static bool
15245 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15247 HOST_WIDE_INT diff;
15248 unsigned int i, size, unspec;
15249 machine_mode pred_mode;
15251 if (d->vec_flags == VEC_SVE_PRED
15252 || !d->one_vector_p
15253 || !d->perm[0].is_constant (&diff))
15254 return false;
15256 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15257 if (size == 8)
15259 unspec = UNSPEC_REV64;
15260 pred_mode = VNx2BImode;
15262 else if (size == 4)
15264 unspec = UNSPEC_REV32;
15265 pred_mode = VNx4BImode;
15267 else if (size == 2)
15269 unspec = UNSPEC_REV16;
15270 pred_mode = VNx8BImode;
15272 else
15273 return false;
15275 unsigned int step = diff + 1;
15276 for (i = 0; i < step; ++i)
15277 if (!d->perm.series_p (i, step, diff - i, step))
15278 return false;
15280 /* Success! */
15281 if (d->testing_p)
15282 return true;
15284 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15285 if (d->vec_flags == VEC_SVE_DATA)
15287 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15288 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15289 UNSPEC_MERGE_PTRUE);
15291 emit_set_insn (d->target, src);
15292 return true;
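/* E.g. the V8HI permutation { 1, 0, 3, 2, 5, 4, 7, 6 } has diff == 1 and a
   2-byte unit size, giving SIZE == 4, and is therefore matched as REV32.  */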
15295 /* Recognize patterns for the REV insn, which reverses elements within
15296 a full vector. */
15298 static bool
15299 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15301 poly_uint64 nelt = d->perm.length ();
15303 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15304 return false;
15306 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15307 return false;
15309 /* Success! */
15310 if (d->testing_p)
15311 return true;
15313 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15314 emit_set_insn (d->target, src);
15315 return true;
15318 static bool
15319 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15321 rtx out = d->target;
15322 rtx in0;
15323 HOST_WIDE_INT elt;
15324 machine_mode vmode = d->vmode;
15325 rtx lane;
15327 if (d->vec_flags == VEC_SVE_PRED
15328 || d->perm.encoding ().encoded_nelts () != 1
15329 || !d->perm[0].is_constant (&elt))
15330 return false;
15332 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15333 return false;
15335 /* Success! */
15336 if (d->testing_p)
15337 return true;
15339 /* The generic preparation in aarch64_expand_vec_perm_const_1
15340 swaps the operand order and the permute indices if it finds
15341 d->perm[0] to be in the second operand. Thus, we can always
15342 use d->op0 and need not do any extra arithmetic to get the
15343 correct lane number. */
15344 in0 = d->op0;
15345 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15347 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15348 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15349 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15350 return true;
15353 static bool
15354 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15356 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15357 machine_mode vmode = d->vmode;
15359 /* Make sure that the indices are constant. */
15360 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15361 for (unsigned int i = 0; i < encoded_nelts; ++i)
15362 if (!d->perm[i].is_constant ())
15363 return false;
15365 if (d->testing_p)
15366 return true;
15368 /* Generic code will try constant permutation twice. Once with the
15369 original mode and again with the elements lowered to QImode.
15370 So wait and don't do the selector expansion ourselves. */
15371 if (vmode != V8QImode && vmode != V16QImode)
15372 return false;
15374 /* to_constant is safe since this routine is specific to Advanced SIMD
15375 vectors. */
15376 unsigned int nelt = d->perm.length ().to_constant ();
15377 for (unsigned int i = 0; i < nelt; ++i)
15378 /* If big-endian and two vectors we end up with a weird mixed-endian
15379 mode on NEON. Reverse the index within each word but not the word
15380 itself. to_constant is safe because we checked is_constant above. */
15381 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15382 ? d->perm[i].to_constant () ^ (nelt - 1)
15383 : d->perm[i].to_constant ());
15385 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15386 sel = force_reg (vmode, sel);
15388 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15389 return true;
15392 /* Try to implement D using an SVE TBL instruction. */
15394 static bool
15395 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15397 unsigned HOST_WIDE_INT nelt;
15399 /* Permuting two variable-length vectors could overflow the
15400 index range. */
15401 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15402 return false;
15404 if (d->testing_p)
15405 return true;
15407 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15408 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15409 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15410 return true;
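/* Try to expand the constant permute described by D.  First normalize the
   operand order, then try each of the special-purpose expanders above
   before falling back to a TBL-based sequence.  Return true on success.  */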
15413 static bool
15414 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15416 /* The pattern matching functions above are written to look for a small
15417 number to begin the sequence (0, 1, N/2). If we begin with an index
15418 from the second operand, we can swap the operands. */
15419 poly_int64 nelt = d->perm.length ();
15420 if (known_ge (d->perm[0], nelt))
15422 d->perm.rotate_inputs (1);
15423 std::swap (d->op0, d->op1);
15426 if ((d->vec_flags == VEC_ADVSIMD
15427 || d->vec_flags == VEC_SVE_DATA
15428 || d->vec_flags == VEC_SVE_PRED)
15429 && known_gt (nelt, 1))
15431 if (aarch64_evpc_rev_local (d))
15432 return true;
15433 else if (aarch64_evpc_rev_global (d))
15434 return true;
15435 else if (aarch64_evpc_ext (d))
15436 return true;
15437 else if (aarch64_evpc_dup (d))
15438 return true;
15439 else if (aarch64_evpc_zip (d))
15440 return true;
15441 else if (aarch64_evpc_uzp (d))
15442 return true;
15443 else if (aarch64_evpc_trn (d))
15444 return true;
15445 if (d->vec_flags == VEC_SVE_DATA)
15446 return aarch64_evpc_sve_tbl (d);
15447 else if (d->vec_flags == VEC_ADVSIMD)
15448 return aarch64_evpc_tbl (d);
15450 return false;
15453 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15455 static bool
15456 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15457 rtx op1, const vec_perm_indices &sel)
15459 struct expand_vec_perm_d d;
15461 /* Check whether the mask can be applied to a single vector. */
15462 if (op0 && rtx_equal_p (op0, op1))
15463 d.one_vector_p = true;
15464 else if (sel.all_from_input_p (0))
15466 d.one_vector_p = true;
15467 op1 = op0;
15469 else if (sel.all_from_input_p (1))
15471 d.one_vector_p = true;
15472 op0 = op1;
15474 else
15475 d.one_vector_p = false;
15477 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15478 sel.nelts_per_input ());
15479 d.vmode = vmode;
15480 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15481 d.target = target;
15482 d.op0 = op0;
15483 d.op1 = op1;
15484 d.testing_p = !target;
15486 if (!d.testing_p)
15487 return aarch64_expand_vec_perm_const_1 (&d);
15489 rtx_insn *last = get_last_insn ();
15490 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15491 gcc_assert (last == get_last_insn ());
15493 return ret;
15496 /* Generate a byte permute mask for a register of mode MODE,
15497 which has NUNITS units. */
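/* For example, for V4SImode (NUNITS == 4, unit size 4) the mask built below
   is { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. it
   reverses the bytes within each element but keeps the elements in place.  */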
15500 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15502 /* We have to reverse each vector because we don't have
15503 a permuted load that can reverse-load according to ABI rules. */
15504 rtx mask;
15505 rtvec v = rtvec_alloc (16);
15506 unsigned int i, j;
15507 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15509 gcc_assert (BYTES_BIG_ENDIAN);
15510 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15512 for (i = 0; i < nunits; i++)
15513 for (j = 0; j < usize; j++)
15514 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15515 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15516 return force_reg (V16QImode, mask);
15519 /* Return true if X is a valid second operand for the SVE instruction
15520 that implements integer comparison OP_CODE. */
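/* Note: the unsigned comparisons accept the SVE unsigned compare-immediate
   range and the signed ones the signed range (roughly a 7-bit unsigned and
   a 5-bit signed immediate respectively); anything outside those ranges is
   forced into a register by the callers.  */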
15522 static bool
15523 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15525 if (register_operand (x, VOIDmode))
15526 return true;
15528 switch (op_code)
15530 case LTU:
15531 case LEU:
15532 case GEU:
15533 case GTU:
15534 return aarch64_sve_cmp_immediate_p (x, false);
15535 case LT:
15536 case LE:
15537 case GE:
15538 case GT:
15539 case NE:
15540 case EQ:
15541 return aarch64_sve_cmp_immediate_p (x, true);
15542 default:
15543 gcc_unreachable ();
15547 /* Return the UNSPEC_COND_* code for comparison CODE. */
15549 static unsigned int
15550 aarch64_unspec_cond_code (rtx_code code)
15552 switch (code)
15554 case NE:
15555 return UNSPEC_COND_NE;
15556 case EQ:
15557 return UNSPEC_COND_EQ;
15558 case LT:
15559 return UNSPEC_COND_LT;
15560 case GT:
15561 return UNSPEC_COND_GT;
15562 case LE:
15563 return UNSPEC_COND_LE;
15564 case GE:
15565 return UNSPEC_COND_GE;
15566 case LTU:
15567 return UNSPEC_COND_LO;
15568 case GTU:
15569 return UNSPEC_COND_HI;
15570 case LEU:
15571 return UNSPEC_COND_LS;
15572 case GEU:
15573 return UNSPEC_COND_HS;
15574 case UNORDERED:
15575 return UNSPEC_COND_UO;
15576 default:
15577 gcc_unreachable ();
15581 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15582 where <X> is the operation associated with comparison CODE. */
15584 static rtx
15585 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15586 rtx pred, rtx op0, rtx op1)
15588 rtvec vec = gen_rtvec (3, pred, op0, op1);
15589 return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15592 /* Expand an SVE integer comparison:
15594 TARGET = CODE (OP0, OP1). */
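/* A sketch of the expansion: for CODE == EQ this emits a predicated compare
   such as "cmpeq p0.s, p1/z, z0.s, z1.s" governed by an all-true predicate,
   with the condition flags clobbered rather than usefully set.  */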
15596 void
15597 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15599 machine_mode pred_mode = GET_MODE (target);
15600 machine_mode data_mode = GET_MODE (op0);
15602 if (!aarch64_sve_cmp_operand_p (code, op1))
15603 op1 = force_reg (data_mode, op1);
15605 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15606 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15607 emit_insn (gen_set_clobber_cc (target, unspec));
15610 /* Emit an instruction:
15612 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15614 where <X> is the operation associated with comparison CODE. */
15616 static void
15617 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15618 rtx pred, rtx op0, rtx op1)
15620 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15621 emit_set_insn (target, unspec);
15624 /* Emit:
15626 (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15627 (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15628 (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15630 where <Xi> is the operation associated with comparison CODEi. */
15632 static void
15633 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15634 machine_mode pred_mode, rtx ptrue,
15635 rtx op0, rtx op1)
15637 rtx tmp1 = gen_reg_rtx (pred_mode);
15638 aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15639 rtx tmp2 = gen_reg_rtx (pred_mode);
15640 aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15641 emit_set_insn (target, gen_rtx_AND (pred_mode,
15642 gen_rtx_IOR (pred_mode, tmp1, tmp2),
15643 ptrue));
15646 /* If CAN_INVERT_P, emit an instruction:
15648 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15650 where <X> is the operation associated with comparison CODE. Otherwise
15651 emit:
15653 (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15654 (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15656 where the second instruction sets TARGET to the inverse of TMP. */
15658 static void
15659 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15660 machine_mode pred_mode, rtx ptrue, rtx pred,
15661 rtx op0, rtx op1, bool can_invert_p)
15663 if (can_invert_p)
15664 aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15665 else
15667 rtx tmp = gen_reg_rtx (pred_mode);
15668 aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15669 emit_set_insn (target, gen_rtx_AND (pred_mode,
15670 gen_rtx_NOT (pred_mode, tmp),
15671 ptrue));
15675 /* Expand an SVE floating-point comparison:
15677 TARGET = CODE (OP0, OP1)
15679 If CAN_INVERT_P is true, the caller can also handle inverted results;
15680 return true if the result is in fact inverted. */
15682 bool
15683 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15684 rtx op0, rtx op1, bool can_invert_p)
15686 machine_mode pred_mode = GET_MODE (target);
15687 machine_mode data_mode = GET_MODE (op0);
15689 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15690 switch (code)
15692 case UNORDERED:
15693 /* UNORDERED has no immediate form. */
15694 op1 = force_reg (data_mode, op1);
15695 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15696 return false;
15698 case LT:
15699 case LE:
15700 case GT:
15701 case GE:
15702 case EQ:
15703 case NE:
15704 /* There is native support for the comparison. */
15705 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15706 return false;
15708 case ORDERED:
15709 /* There is native support for the inverse comparison. */
15710 op1 = force_reg (data_mode, op1);
15711 aarch64_emit_inverted_unspec_cond (target, UNORDERED,
15712 pred_mode, ptrue, ptrue, op0, op1,
15713 can_invert_p);
15714 return can_invert_p;
15716 case LTGT:
15717 /* This is a trapping operation (LT or GT). */
15718 aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
15719 return false;
15721 case UNEQ:
15722 if (!flag_trapping_math)
15724 /* This would trap for signaling NaNs. */
15725 op1 = force_reg (data_mode, op1);
15726 aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
15727 pred_mode, ptrue, op0, op1);
15728 return false;
15730 /* fall through */
15732 case UNLT:
15733 case UNLE:
15734 case UNGT:
15735 case UNGE:
15737 rtx ordered = ptrue;
15738 if (flag_trapping_math)
15740 /* Only compare the elements that are known to be ordered. */
15741 ordered = gen_reg_rtx (pred_mode);
15742 op1 = force_reg (data_mode, op1);
15743 aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
15744 ptrue, ptrue, op0, op1, false);
15746 if (code == UNEQ)
15747 code = NE;
15748 else
15749 code = reverse_condition_maybe_unordered (code);
15750 aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
15751 ordered, op0, op1, can_invert_p);
15752 return can_invert_p;
15755 default:
15756 gcc_unreachable ();
15760 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15761 of the data being selected and CMP_MODE is the mode of the values being
15762 compared. */
15764 void
15765 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15766 rtx *ops)
15768 machine_mode pred_mode
15769 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15770 GET_MODE_SIZE (cmp_mode)).require ();
15771 rtx pred = gen_reg_rtx (pred_mode);
15772 if (FLOAT_MODE_P (cmp_mode))
15774 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15775 ops[4], ops[5], true))
15776 std::swap (ops[1], ops[2]);
15778 else
15779 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15781 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15782 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15785 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15786 true. However, due to issues with register allocation, it is preferable
15787 to avoid tying integer scalar and FP scalar modes. Executing integer
15788 operations in general registers is better than treating them as scalar
15789 vector operations. This reduces latency and avoids redundant int<->FP
15790 moves. So tie modes if they are either the same class, or vector modes
15791 with other vector modes, vector structs or any scalar mode. */
15793 static bool
15794 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15796 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15797 return true;
15799 /* We specifically want to allow elements of "structure" modes to
15800 be tieable to the structure. This more general condition allows
15801 other rarer situations too. The reason we don't extend this to
15802 predicate modes is that there are no predicate structure modes
15803 nor any specific instructions for extracting part of a predicate
15804 register. */
15805 if (aarch64_vector_data_mode_p (mode1)
15806 && aarch64_vector_data_mode_p (mode2))
15807 return true;
15809 /* Also allow any scalar modes with vectors. */
15810 if (aarch64_vector_mode_supported_p (mode1)
15811 || aarch64_vector_mode_supported_p (mode2))
15812 return true;
15814 return false;
15817 /* Return a new RTX holding the result of moving POINTER forward by
15818 AMOUNT bytes. */
15820 static rtx
15821 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15823 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15825 return adjust_automodify_address (pointer, GET_MODE (pointer),
15826 next, amount);
15829 /* Return a new RTX holding the result of moving POINTER forward by the
15830 size of the mode it points to. */
15832 static rtx
15833 aarch64_progress_pointer (rtx pointer)
15835 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15838 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15839 MODE bytes. */
15841 static void
15842 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15843 machine_mode mode)
15845 rtx reg = gen_reg_rtx (mode);
15847 /* "Cast" the pointers to the correct mode. */
15848 *src = adjust_address (*src, mode, 0);
15849 *dst = adjust_address (*dst, mode, 0);
15850 /* Emit the memcpy. */
15851 emit_move_insn (reg, *src);
15852 emit_move_insn (*dst, reg);
15853 /* Move the pointers forward. */
15854 *src = aarch64_progress_pointer (*src);
15855 *dst = aarch64_progress_pointer (*dst);
15858 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15859 we succeed, otherwise return false. */
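/* For example, with a constant size of 23 bytes this emits one 16-byte
   (TImode) copy followed by an 8-byte (DImode) copy whose start is moved
   back by one byte so that it overlaps the first chunk, giving just two
   loads and two stores in total.  */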
15861 bool
15862 aarch64_expand_movmem (rtx *operands)
15864 unsigned int n;
15865 rtx dst = operands[0];
15866 rtx src = operands[1];
15867 rtx base;
15868 bool speed_p = !optimize_function_for_size_p (cfun);
15870 /* When optimizing for size, give a better estimate of the length of a
15871 memcpy call, but use the default otherwise. */
15872 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
15874 /* We can't do anything smart if the amount to copy is not constant. */
15875 if (!CONST_INT_P (operands[2]))
15876 return false;
15878 n = UINTVAL (operands[2]);
15880 /* Try to keep the number of instructions low. For cases below 16 bytes we
15881 need to make at most two moves. For cases above 16 bytes it will be one
15882 move for each 16 byte chunk, then at most two additional moves. */
15883 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
15884 return false;
15886 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
15887 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
15889 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
15890 src = adjust_automodify_address (src, VOIDmode, base, 0);
15892 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
15893 1-byte chunk. */
15894 if (n < 4)
15896 if (n >= 2)
15898 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
15899 n -= 2;
15902 if (n == 1)
15903 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
15905 return true;
15908 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
15909 4-byte chunk, partially overlapping with the previously copied chunk. */
15910 if (n < 8)
15912 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15913 n -= 4;
15914 if (n > 0)
15916 int move = n - 4;
15918 src = aarch64_move_pointer (src, move);
15919 dst = aarch64_move_pointer (dst, move);
15920 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15922 return true;
15925 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
15926 them, then (if applicable) an 8-byte chunk. */
15927 while (n >= 8)
15929 if (n / 16)
15931 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
15932 n -= 16;
15934 else
15936 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
15937 n -= 8;
15941 /* Finish the final bytes of the copy. We can always do this in one
15942 instruction. We either copy the exact amount we need, or partially
15943 overlap with the previous chunk we copied and copy 8 bytes. */
15944 if (n == 0)
15945 return true;
15946 else if (n == 1)
15947 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
15948 else if (n == 2)
15949 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
15950 else if (n == 4)
15951 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15952 else
15954 if (n == 3)
15956 src = aarch64_move_pointer (src, -1);
15957 dst = aarch64_move_pointer (dst, -1);
15958 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15960 else
15962 int move = n - 8;
15964 src = aarch64_move_pointer (src, move);
15965 dst = aarch64_move_pointer (dst, move);
15966 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
15970 return true;
15973 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
15974 SImode stores. Handle the case when the constant has identical
15975 bottom and top halves. This is beneficial when the two stores can be
15976 merged into an STP and we avoid synthesising potentially expensive
15977 immediates twice. Return true if such a split is possible. */
15979 bool
15980 aarch64_split_dimode_const_store (rtx dst, rtx src)
15982 rtx lo = gen_lowpart (SImode, src);
15983 rtx hi = gen_highpart_mode (SImode, DImode, src);
15985 bool size_p = optimize_function_for_size_p (cfun);
15987 if (!rtx_equal_p (lo, hi))
15988 return false;
15990 unsigned int orig_cost
15991 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
15992 unsigned int lo_cost
15993 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
15995 /* We want to transform:
15996 MOV x1, 49370
15997 MOVK x1, 0x140, lsl 16
15998 MOVK x1, 0xc0da, lsl 32
15999 MOVK x1, 0x140, lsl 48
16000 STR x1, [x0]
16001 into:
16002 MOV w1, 49370
16003 MOVK w1, 0x140, lsl 16
16004 STP w1, w1, [x0]
16005 So we want to perform this only when we save two instructions
16006 or more. When optimizing for size, however, accept any code size
16007 savings we can. */
16008 if (size_p && orig_cost <= lo_cost)
16009 return false;
16011 if (!size_p
16012 && (orig_cost <= lo_cost + 1))
16013 return false;
16015 rtx mem_lo = adjust_address (dst, SImode, 0);
16016 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16017 return false;
16019 rtx tmp_reg = gen_reg_rtx (SImode);
16020 aarch64_expand_mov_immediate (tmp_reg, lo);
16021 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16022 /* Don't emit an explicit store pair as this may not always be profitable.
16023 Let the sched-fusion logic decide whether to merge them. */
16024 emit_move_insn (mem_lo, tmp_reg);
16025 emit_move_insn (mem_hi, tmp_reg);
16027 return true;
16030 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16032 static unsigned HOST_WIDE_INT
16033 aarch64_asan_shadow_offset (void)
16035 return (HOST_WIDE_INT_1 << 36);
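/* Implement TARGET_GEN_CCMP_FIRST.  Generate the first compare of a
   conditional-compare (CCMP) sequence for the comparison CODE (TREEOP0,
   TREEOP1), recording the preparation and compare instructions in *PREP_SEQ
   and *GEN_SEQ.  Return the comparison of the CC register against zero that
   later elements of the sequence should test, or NULL_RTX on failure.  */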
16038 static rtx
16039 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16040 int code, tree treeop0, tree treeop1)
16042 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16043 rtx op0, op1;
16044 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16045 insn_code icode;
16046 struct expand_operand ops[4];
16048 start_sequence ();
16049 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16051 op_mode = GET_MODE (op0);
16052 if (op_mode == VOIDmode)
16053 op_mode = GET_MODE (op1);
16055 switch (op_mode)
16057 case E_QImode:
16058 case E_HImode:
16059 case E_SImode:
16060 cmp_mode = SImode;
16061 icode = CODE_FOR_cmpsi;
16062 break;
16064 case E_DImode:
16065 cmp_mode = DImode;
16066 icode = CODE_FOR_cmpdi;
16067 break;
16069 case E_SFmode:
16070 cmp_mode = SFmode;
16071 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16072 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16073 break;
16075 case E_DFmode:
16076 cmp_mode = DFmode;
16077 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16078 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16079 break;
16081 default:
16082 end_sequence ();
16083 return NULL_RTX;
16086 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16087 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16088 if (!op0 || !op1)
16090 end_sequence ();
16091 return NULL_RTX;
16093 *prep_seq = get_insns ();
16094 end_sequence ();
16096 create_fixed_operand (&ops[0], op0);
16097 create_fixed_operand (&ops[1], op1);
16099 start_sequence ();
16100 if (!maybe_expand_insn (icode, 2, ops))
16102 end_sequence ();
16103 return NULL_RTX;
16105 *gen_seq = get_insns ();
16106 end_sequence ();
16108 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16109 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
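/* Implement TARGET_GEN_CCMP_NEXT.  Generate the next conditional compare in
   a CCMP sequence: PREV is the result of the previous element, CMP_CODE
   (TREEOP0, TREEOP1) is the new comparison, and BIT_CODE says how the two
   are combined (the code below checks for AND and inverts the condition
   otherwise).  Return the new CC comparison, or NULL_RTX on failure.  */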
16112 static rtx
16113 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16114 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16116 rtx op0, op1, target;
16117 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16118 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16119 insn_code icode;
16120 struct expand_operand ops[6];
16121 int aarch64_cond;
16123 push_to_sequence (*prep_seq);
16124 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16126 op_mode = GET_MODE (op0);
16127 if (op_mode == VOIDmode)
16128 op_mode = GET_MODE (op1);
16130 switch (op_mode)
16132 case E_QImode:
16133 case E_HImode:
16134 case E_SImode:
16135 cmp_mode = SImode;
16136 icode = CODE_FOR_ccmpsi;
16137 break;
16139 case E_DImode:
16140 cmp_mode = DImode;
16141 icode = CODE_FOR_ccmpdi;
16142 break;
16144 case E_SFmode:
16145 cmp_mode = SFmode;
16146 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16147 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16148 break;
16150 case E_DFmode:
16151 cmp_mode = DFmode;
16152 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16153 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16154 break;
16156 default:
16157 end_sequence ();
16158 return NULL_RTX;
16161 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16162 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16163 if (!op0 || !op1)
16165 end_sequence ();
16166 return NULL_RTX;
16168 *prep_seq = get_insns ();
16169 end_sequence ();
16171 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16172 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16174 if (bit_code != AND)
16176 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16177 GET_MODE (XEXP (prev, 0))),
16178 VOIDmode, XEXP (prev, 0), const0_rtx);
16179 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16182 create_fixed_operand (&ops[0], XEXP (prev, 0));
16183 create_fixed_operand (&ops[1], target);
16184 create_fixed_operand (&ops[2], op0);
16185 create_fixed_operand (&ops[3], op1);
16186 create_fixed_operand (&ops[4], prev);
16187 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16189 push_to_sequence (*gen_seq);
16190 if (!maybe_expand_insn (icode, 6, ops))
16192 end_sequence ();
16193 return NULL_RTX;
16196 *gen_seq = get_insns ();
16197 end_sequence ();
16199 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16202 #undef TARGET_GEN_CCMP_FIRST
16203 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16205 #undef TARGET_GEN_CCMP_NEXT
16206 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16208 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16209 instruction fusion of some sort. */
16211 static bool
16212 aarch64_macro_fusion_p (void)
16214 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16218 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16219 should be kept together during scheduling. */
16221 static bool
16222 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16224 rtx set_dest;
16225 rtx prev_set = single_set (prev);
16226 rtx curr_set = single_set (curr);
16227 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
16228 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16230 if (!aarch64_macro_fusion_p ())
16231 return false;
16233 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16235 /* We are trying to match:
16236 prev (mov) == (set (reg r0) (const_int imm16))
16237 curr (movk) == (set (zero_extract (reg r0)
16238 (const_int 16)
16239 (const_int 16))
16240 (const_int imm16_1)) */
16242 set_dest = SET_DEST (curr_set);
16244 if (GET_CODE (set_dest) == ZERO_EXTRACT
16245 && CONST_INT_P (SET_SRC (curr_set))
16246 && CONST_INT_P (SET_SRC (prev_set))
16247 && CONST_INT_P (XEXP (set_dest, 2))
16248 && INTVAL (XEXP (set_dest, 2)) == 16
16249 && REG_P (XEXP (set_dest, 0))
16250 && REG_P (SET_DEST (prev_set))
16251 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16253 return true;
16257 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16260 /* We're trying to match:
16261 prev (adrp) == (set (reg r1)
16262 (high (symbol_ref ("SYM"))))
16263 curr (add) == (set (reg r0)
16264 (lo_sum (reg r1)
16265 (symbol_ref ("SYM"))))
16266 Note that r0 need not necessarily be the same as r1, especially
16267 during pre-regalloc scheduling. */
16269 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16270 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16272 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16273 && REG_P (XEXP (SET_SRC (curr_set), 0))
16274 && REGNO (XEXP (SET_SRC (curr_set), 0))
16275 == REGNO (SET_DEST (prev_set))
16276 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16277 XEXP (SET_SRC (curr_set), 1)))
16278 return true;
16282 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16285 /* We're trying to match:
16286 prev (movk) == (set (zero_extract (reg r0)
16287 (const_int 16)
16288 (const_int 32))
16289 (const_int imm16_1))
16290 curr (movk) == (set (zero_extract (reg r0)
16291 (const_int 16)
16292 (const_int 48))
16293 (const_int imm16_2)) */
16295 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16296 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16297 && REG_P (XEXP (SET_DEST (prev_set), 0))
16298 && REG_P (XEXP (SET_DEST (curr_set), 0))
16299 && REGNO (XEXP (SET_DEST (prev_set), 0))
16300 == REGNO (XEXP (SET_DEST (curr_set), 0))
16301 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16302 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16303 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16304 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16305 && CONST_INT_P (SET_SRC (prev_set))
16306 && CONST_INT_P (SET_SRC (curr_set)))
16307 return true;
16310 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16312 /* We're trying to match:
16313 prev (adrp) == (set (reg r0)
16314 (high (symbol_ref ("SYM"))))
16315 curr (ldr) == (set (reg r1)
16316 (mem (lo_sum (reg r0)
16317 (symbol_ref ("SYM")))))
16319 curr (ldr) == (set (reg r1)
16320 (zero_extend (mem
16321 (lo_sum (reg r0)
16322 (symbol_ref ("SYM")))))) */
16323 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16324 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16326 rtx curr_src = SET_SRC (curr_set);
16328 if (GET_CODE (curr_src) == ZERO_EXTEND)
16329 curr_src = XEXP (curr_src, 0);
16331 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16332 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16333 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16334 == REGNO (SET_DEST (prev_set))
16335 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16336 XEXP (SET_SRC (prev_set), 0)))
16337 return true;
16341 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16342 && aarch_crypto_can_dual_issue (prev, curr))
16343 return true;
16345 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16346 && any_condjump_p (curr))
16348 enum attr_type prev_type = get_attr_type (prev);
16350 unsigned int condreg1, condreg2;
16351 rtx cc_reg_1;
16352 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16353 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16355 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16356 && prev
16357 && modified_in_p (cc_reg_1, prev))
16359 /* FIXME: this misses some instructions that ThunderX considers simple
16360 arithmetic; simple shifts are missed here. */
16361 if (prev_type == TYPE_ALUS_SREG
16362 || prev_type == TYPE_ALUS_IMM
16363 || prev_type == TYPE_LOGICS_REG
16364 || prev_type == TYPE_LOGICS_IMM)
16365 return true;
16369 if (prev_set
16370 && curr_set
16371 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16372 && any_condjump_p (curr))
16374 /* We're trying to match:
16375 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16376 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16377 (const_int 0))
16378 (label_ref ("SYM"))
16379 (pc)) */
16380 if (SET_DEST (curr_set) == (pc_rtx)
16381 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16382 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16383 && REG_P (SET_DEST (prev_set))
16384 && REGNO (SET_DEST (prev_set))
16385 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16387 /* Fuse ALU operations followed by conditional branch instruction. */
16388 switch (get_attr_type (prev))
16390 case TYPE_ALU_IMM:
16391 case TYPE_ALU_SREG:
16392 case TYPE_ADC_REG:
16393 case TYPE_ADC_IMM:
16394 case TYPE_ADCS_REG:
16395 case TYPE_ADCS_IMM:
16396 case TYPE_LOGIC_REG:
16397 case TYPE_LOGIC_IMM:
16398 case TYPE_CSEL:
16399 case TYPE_ADR:
16400 case TYPE_MOV_IMM:
16401 case TYPE_SHIFT_REG:
16402 case TYPE_SHIFT_IMM:
16403 case TYPE_BFM:
16404 case TYPE_RBIT:
16405 case TYPE_REV:
16406 case TYPE_EXTEND:
16407 return true;
16409 default:;
16414 return false;
16417 /* Return true iff the instruction fusion described by OP is enabled. */
16419 bool
16420 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16422 return (aarch64_tune_params.fusible_ops & op) != 0;
16425 /* If MEM is in the form [base+offset], extract the two parts of the
16426 address and store them in BASE and OFFSET; otherwise clear BASE and
16427 OFFSET and return false. */
16429 bool
16430 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16432 rtx addr;
16434 gcc_assert (MEM_P (mem));
16436 addr = XEXP (mem, 0);
16438 if (REG_P (addr))
16440 *base = addr;
16441 *offset = const0_rtx;
16442 return true;
16445 if (GET_CODE (addr) == PLUS
16446 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16448 *base = XEXP (addr, 0);
16449 *offset = XEXP (addr, 1);
16450 return true;
16453 *base = NULL_RTX;
16454 *offset = NULL_RTX;
16456 return false;
16459 /* Types for scheduling fusion. */
16460 enum sched_fusion_type
16462 SCHED_FUSION_NONE = 0,
16463 SCHED_FUSION_LD_SIGN_EXTEND,
16464 SCHED_FUSION_LD_ZERO_EXTEND,
16465 SCHED_FUSION_LD,
16466 SCHED_FUSION_ST,
16467 SCHED_FUSION_NUM
16470 /* If INSN is a load or store whose address is in the form [base+offset],
16471 extract the two parts and store them in BASE and OFFSET. Return the
16472 scheduling fusion type of INSN. */
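/* For example, a DImode load whose address is (plus (reg x1) (const_int 8))
   is classified as SCHED_FUSION_LD, with *BASE set to the x1 register and
   *OFFSET to (const_int 8) (illustrative RTL sketch).  */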
16474 static enum sched_fusion_type
16475 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16477 rtx x, dest, src;
16478 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16480 gcc_assert (INSN_P (insn));
16481 x = PATTERN (insn);
16482 if (GET_CODE (x) != SET)
16483 return SCHED_FUSION_NONE;
16485 src = SET_SRC (x);
16486 dest = SET_DEST (x);
16488 machine_mode dest_mode = GET_MODE (dest);
16490 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16491 return SCHED_FUSION_NONE;
16493 if (GET_CODE (src) == SIGN_EXTEND)
16495 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16496 src = XEXP (src, 0);
16497 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16498 return SCHED_FUSION_NONE;
16500 else if (GET_CODE (src) == ZERO_EXTEND)
16502 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16503 src = XEXP (src, 0);
16504 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16505 return SCHED_FUSION_NONE;
16508 if (GET_CODE (src) == MEM && REG_P (dest))
16509 extract_base_offset_in_addr (src, base, offset);
16510 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16512 fusion = SCHED_FUSION_ST;
16513 extract_base_offset_in_addr (dest, base, offset);
16515 else
16516 return SCHED_FUSION_NONE;
16518 if (*base == NULL_RTX || *offset == NULL_RTX)
16519 fusion = SCHED_FUSION_NONE;
16521 return fusion;
16524 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16526 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16527 and PRI are only calculated for these instructions. For other instructions,
16528 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
16529 other instruction types can be added by returning different priorities.
16531 It's important that irrelevant instructions get the largest FUSION_PRI. */
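/* For example, two SCHED_FUSION_LD insns that use the same base register get
   the same FUSION_PRI, and the one with the smaller offset gets the larger
   PRI, so the scheduler tends to place them back to back where an ldp can
   later be formed.  */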
16533 static void
16534 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16535 int *fusion_pri, int *pri)
16537 int tmp, off_val;
16538 rtx base, offset;
16539 enum sched_fusion_type fusion;
16541 gcc_assert (INSN_P (insn));
16543 tmp = max_pri - 1;
16544 fusion = fusion_load_store (insn, &base, &offset);
16545 if (fusion == SCHED_FUSION_NONE)
16547 *pri = tmp;
16548 *fusion_pri = tmp;
16549 return;
16552 /* Set FUSION_PRI according to fusion type and base register. */
16553 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16555 /* Calculate PRI. */
16556 tmp /= 2;
16558 /* INSN with smaller offset goes first. */
16559 off_val = (int)(INTVAL (offset));
16560 if (off_val >= 0)
16561 tmp -= (off_val & 0xfffff);
16562 else
16563 tmp += ((- off_val) & 0xfffff);
16565 *pri = tmp;
16566 return;
16569 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16570 Adjust priority of sha1h instructions so they are scheduled before
16571 other SHA1 instructions. */
16573 static int
16574 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16576 rtx x = PATTERN (insn);
16578 if (GET_CODE (x) == SET)
16580 x = SET_SRC (x);
16582 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16583 return priority + 10;
16586 return priority;
16589 /* Given OPERANDS of consecutive load/store, check if we can merge
16590 them into ldp/stp. LOAD is true if they are load instructions.
16591 MODE is the mode of memory operands. */
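/* For example, the two SImode loads

     ldr w0, [x2]
     ldr w1, [x2, 4]

   satisfy these checks and can then be combined into an ldp such as
   "ldp w0, w1, [x2]" by the ldp/stp peepholes.  */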
16593 bool
16594 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16595 machine_mode mode)
16597 HOST_WIDE_INT offval_1, offval_2, msize;
16598 enum reg_class rclass_1, rclass_2;
16599 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16601 if (load)
16603 mem_1 = operands[1];
16604 mem_2 = operands[3];
16605 reg_1 = operands[0];
16606 reg_2 = operands[2];
16607 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16608 if (REGNO (reg_1) == REGNO (reg_2))
16609 return false;
16611 else
16613 mem_1 = operands[0];
16614 mem_2 = operands[2];
16615 reg_1 = operands[1];
16616 reg_2 = operands[3];
16619 /* The mems cannot be volatile. */
16620 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16621 return false;
16623 /* If we have SImode and a slow unaligned ldp,
16624 check that the alignment is at least 8 bytes. */
16625 if (mode == SImode
16626 && (aarch64_tune_params.extra_tuning_flags
16627 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16628 && !optimize_size
16629 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16630 return false;
16632 /* Check if the addresses are in the form of [base+offset]. */
16633 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16634 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16635 return false;
16636 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16637 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16638 return false;
16640 /* Check if the bases are same. */
16641 if (!rtx_equal_p (base_1, base_2))
16642 return false;
16644 offval_1 = INTVAL (offset_1);
16645 offval_2 = INTVAL (offset_2);
16646 /* We should only be trying this for fixed-sized modes. There is no
16647 SVE LDP/STP instruction. */
16648 msize = GET_MODE_SIZE (mode).to_constant ();
16649 /* Check if the offsets are consecutive. */
16650 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16651 return false;
16653 /* Check if the addresses are clobbered by load. */
16654 if (load)
16656 if (reg_mentioned_p (reg_1, mem_1))
16657 return false;
16659 /* In increasing order, the last load can clobber the address. */
16660 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16661 return false;
16664 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16665 rclass_1 = FP_REGS;
16666 else
16667 rclass_1 = GENERAL_REGS;
16669 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16670 rclass_2 = FP_REGS;
16671 else
16672 rclass_2 = GENERAL_REGS;
16674 /* Check if the registers are of same class. */
16675 if (rclass_1 != rclass_2)
16676 return false;
16678 return true;
16681 /* Given OPERANDS of consecutive load/store, check if we can merge
16682 them into ldp/stp by adjusting the offset. LOAD is true if they
16683 are load instructions. MODE is the mode of memory operands.
16685 Given the consecutive stores below:
16687 str w1, [xb, 0x100]
16688 str w1, [xb, 0x104]
16689 str w1, [xb, 0x108]
16690 str w1, [xb, 0x10c]
16692 Though the offsets are out of the range supported by stp, we can
16693 still pair them after adjusting the offset, like:
16695 add scratch, xb, 0x100
16696 stp w1, w1, [scratch]
16697 stp w1, w1, [scratch, 0x8]
16699 The peephole patterns detecting this opportunity should guarantee
16700 the scratch register is available. */
16702 bool
16703 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16704 scalar_mode mode)
16706 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16707 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16708 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16709 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16711 if (load)
16713 reg_1 = operands[0];
16714 mem_1 = operands[1];
16715 reg_2 = operands[2];
16716 mem_2 = operands[3];
16717 reg_3 = operands[4];
16718 mem_3 = operands[5];
16719 reg_4 = operands[6];
16720 mem_4 = operands[7];
16721 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16722 && REG_P (reg_3) && REG_P (reg_4));
16723 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16724 return false;
16726 else
16728 mem_1 = operands[0];
16729 reg_1 = operands[1];
16730 mem_2 = operands[2];
16731 reg_2 = operands[3];
16732 mem_3 = operands[4];
16733 reg_3 = operands[5];
16734 mem_4 = operands[6];
16735 reg_4 = operands[7];
16737 /* Skip if the memory operand is by itself valid for ldp/stp. */
16738 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16739 return false;
16741 /* The mems cannot be volatile. */
16742 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16743 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
16744 return false;
16746 /* Check if the addresses are in the form of [base+offset]. */
16747 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16748 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16749 return false;
16750 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16751 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16752 return false;
16753 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16754 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16755 return false;
16756 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16757 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16758 return false;
16760 /* Check if the bases are same. */
16761 if (!rtx_equal_p (base_1, base_2)
16762 || !rtx_equal_p (base_2, base_3)
16763 || !rtx_equal_p (base_3, base_4))
16764 return false;
16766 offval_1 = INTVAL (offset_1);
16767 offval_2 = INTVAL (offset_2);
16768 offval_3 = INTVAL (offset_3);
16769 offval_4 = INTVAL (offset_4);
16770 msize = GET_MODE_SIZE (mode);
16771 /* Check if the offsets are consecutive. */
16772 if ((offval_1 != (offval_2 + msize)
16773 || offval_1 != (offval_3 + msize * 2)
16774 || offval_1 != (offval_4 + msize * 3))
16775 && (offval_4 != (offval_3 + msize)
16776 || offval_4 != (offval_2 + msize * 2)
16777 || offval_4 != (offval_1 + msize * 3)))
16778 return false;
16780 /* Check if the addresses are clobbered by load. */
16781 if (load)
16783 if (reg_mentioned_p (reg_1, mem_1)
16784 || reg_mentioned_p (reg_2, mem_2)
16785 || reg_mentioned_p (reg_3, mem_3))
16786 return false;
16788 /* In increasing order, the last load can clobber the address. */
16789 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
16790 return false;
16793 /* If we have SImode and a slow unaligned ldp,
16794 check that the alignment is at least 8 bytes. */
16795 if (mode == SImode
16796 && (aarch64_tune_params.extra_tuning_flags
16797 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16798 && !optimize_size
16799 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16800 return false;
16802 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16803 rclass_1 = FP_REGS;
16804 else
16805 rclass_1 = GENERAL_REGS;
16807 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16808 rclass_2 = FP_REGS;
16809 else
16810 rclass_2 = GENERAL_REGS;
16812 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
16813 rclass_3 = FP_REGS;
16814 else
16815 rclass_3 = GENERAL_REGS;
16817 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
16818 rclass_4 = FP_REGS;
16819 else
16820 rclass_4 = GENERAL_REGS;
16822 /* Check if the registers are of same class. */
16823 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
16824 return false;
16826 return true;
16829 /* Given OPERANDS of consecutive load/store, this function pairs them
16830 into ldp/stp after adjusting the offset. It depends on the fact
16831 that addresses of load/store instructions are in increasing order.
16832 MODE is the mode of memory operands. CODE is the rtl operator
16833 which should be applied to all memory operands, it's SIGN_EXTEND,
16834 ZERO_EXTEND or UNKNOWN. */
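/* For example, for four SImode stores at offsets 0x100, 0x104, 0x108 and
   0x10c (as in the comment above aarch64_operands_adjust_ok_for_ldpstp),
   msize is 4 and stp_off_limit is 0x100, so adj_off becomes 0x100 and
   new_off becomes 0: the scratch register is set to base + 0x100 and the
   four accesses are rewritten to offsets 0, 4, 8 and 12 from it.  */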
16836 bool
16837 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
16838 scalar_mode mode, RTX_CODE code)
16840 rtx base, offset, t1, t2;
16841 rtx mem_1, mem_2, mem_3, mem_4;
16842 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
16844 if (load)
16846 mem_1 = operands[1];
16847 mem_2 = operands[3];
16848 mem_3 = operands[5];
16849 mem_4 = operands[7];
16851 else
16853 mem_1 = operands[0];
16854 mem_2 = operands[2];
16855 mem_3 = operands[4];
16856 mem_4 = operands[6];
16857 gcc_assert (code == UNKNOWN);
16860 extract_base_offset_in_addr (mem_1, &base, &offset);
16861 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
16863 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
16864 msize = GET_MODE_SIZE (mode);
16865 stp_off_limit = msize * 0x40;
16866 off_val = INTVAL (offset);
16867 abs_off = (off_val < 0) ? -off_val : off_val;
16868 new_off = abs_off % stp_off_limit;
16869 adj_off = abs_off - new_off;
16871 /* Further adjust to make sure all offsets are OK. */
16872 if ((new_off + msize * 2) >= stp_off_limit)
16874 adj_off += stp_off_limit;
16875 new_off -= stp_off_limit;
16878 /* Make sure the adjustment can be done with ADD/SUB instructions. */
16879 if (adj_off >= 0x1000)
16880 return false;
16882 if (off_val < 0)
16884 adj_off = -adj_off;
16885 new_off = -new_off;
16888 /* Create new memory references. */
16889 mem_1 = change_address (mem_1, VOIDmode,
16890 plus_constant (DImode, operands[8], new_off));
16892 /* Check if the adjusted address is OK for ldp/stp. */
16893 if (!aarch64_mem_pair_operand (mem_1, mode))
16894 return false;
16896 msize = GET_MODE_SIZE (mode);
16897 mem_2 = change_address (mem_2, VOIDmode,
16898 plus_constant (DImode,
16899 operands[8],
16900 new_off + msize));
16901 mem_3 = change_address (mem_3, VOIDmode,
16902 plus_constant (DImode,
16903 operands[8],
16904 new_off + msize * 2));
16905 mem_4 = change_address (mem_4, VOIDmode,
16906 plus_constant (DImode,
16907 operands[8],
16908 new_off + msize * 3));
16910 if (code == ZERO_EXTEND)
16912 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
16913 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
16914 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
16915 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
16917 else if (code == SIGN_EXTEND)
16919 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
16920 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
16921 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
16922 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
16925 if (load)
16927 operands[1] = mem_1;
16928 operands[3] = mem_2;
16929 operands[5] = mem_3;
16930 operands[7] = mem_4;
16932 else
16934 operands[0] = mem_1;
16935 operands[2] = mem_2;
16936 operands[4] = mem_3;
16937 operands[6] = mem_4;
16940 /* Emit adjusting instruction. */
16941 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
16942 /* Emit ldp/stp instructions. */
16943 t1 = gen_rtx_SET (operands[0], operands[1]);
16944 t2 = gen_rtx_SET (operands[2], operands[3]);
16945 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
16946 t1 = gen_rtx_SET (operands[4], operands[5]);
16947 t2 = gen_rtx_SET (operands[6], operands[7]);
16948 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
16949 return true;
16952 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
16953 it isn't worth branching around empty masked ops (including masked
16954 stores). */
16956 static bool
16957 aarch64_empty_mask_is_expensive (unsigned)
16959 return false;
16962 /* Return true if a pseudo register should be created and used to hold
16963 the GOT address for PIC code. */
16965 bool
16966 aarch64_use_pseudo_pic_reg (void)
16968 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
16971 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
16973 static int
16974 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
16976 switch (XINT (x, 1))
16978 case UNSPEC_GOTSMALLPIC:
16979 case UNSPEC_GOTSMALLPIC28K:
16980 case UNSPEC_GOTTINYPIC:
16981 return 0;
16982 default:
16983 break;
16986 return default_unspec_may_trap_p (x, flags);
16990 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
16991 return the log2 of that value. Otherwise return -1. */
16994 aarch64_fpconst_pow_of_2 (rtx x)
16996 const REAL_VALUE_TYPE *r;
16998 if (!CONST_DOUBLE_P (x))
16999 return -1;
17001 r = CONST_DOUBLE_REAL_VALUE (x);
17003 if (REAL_VALUE_NEGATIVE (*r)
17004 || REAL_VALUE_ISNAN (*r)
17005 || REAL_VALUE_ISINF (*r)
17006 || !real_isinteger (r, DFmode))
17007 return -1;
17009 return exact_log2 (real_to_integer (r));
17012 /* If X is a vector of equal CONST_DOUBLE values and that value is
17013 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17016 aarch64_vec_fpconst_pow_of_2 (rtx x)
17018 int nelts;
17019 if (GET_CODE (x) != CONST_VECTOR
17020 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17021 return -1;
17023 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17024 return -1;
17026 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17027 if (firstval <= 0)
17028 return -1;
17030 for (int i = 1; i < nelts; i++)
17031 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17032 return -1;
17034 return firstval;
17037 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17038 to float.
17040 __fp16 always promotes through this hook.
17041 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17042 through the generic excess precision logic rather than here. */
17044 static tree
17045 aarch64_promoted_type (const_tree t)
17047 if (SCALAR_FLOAT_TYPE_P (t)
17048 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17049 return float_type_node;
17051 return NULL_TREE;
17054 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17056 static bool
17057 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17058 optimization_type opt_type)
17060 switch (op)
17062 case rsqrt_optab:
17063 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17065 default:
17066 return true;
17070 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17072 static unsigned int
17073 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17074 int *offset)
17076 /* Polynomial invariant 1 == (VG / 2) - 1. */
17077 gcc_assert (i == 1);
17078 *factor = 2;
17079 *offset = 1;
17080 return AARCH64_DWARF_VG;
17083 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17084 if MODE is HFmode, and punt to the generic implementation otherwise. */
17086 static bool
17087 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17089 return (mode == HFmode
17090 ? true
17091 : default_libgcc_floating_mode_supported_p (mode));
17094 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17095 if MODE is HFmode, and punt to the generic implementation otherwise. */
17097 static bool
17098 aarch64_scalar_mode_supported_p (scalar_mode mode)
17100 return (mode == HFmode
17101 ? true
17102 : default_scalar_mode_supported_p (mode));
17105 /* Set the value of FLT_EVAL_METHOD.
17106 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17108 0: evaluate all operations and constants, whose semantic type has at
17109 most the range and precision of type float, to the range and
17110 precision of float; evaluate all other operations and constants to
17111 the range and precision of the semantic type;
17113 N, where _FloatN is a supported interchange floating type
17114 evaluate all operations and constants, whose semantic type has at
17115 most the range and precision of _FloatN type, to the range and
17116 precision of the _FloatN type; evaluate all other operations and
17117 constants to the range and precision of the semantic type;
17119 If we have the ARMv8.2-A extensions then we support _Float16 in native
17120 precision, so we should set this to 16. Otherwise, we support the type,
17121 but want to evaluate expressions in float precision, so set this to
17122 0. */
17124 static enum flt_eval_method
17125 aarch64_excess_precision (enum excess_precision_type type)
17127 switch (type)
17129 case EXCESS_PRECISION_TYPE_FAST:
17130 case EXCESS_PRECISION_TYPE_STANDARD:
17131 /* We can calculate either in 16-bit range and precision or
17132 32-bit range and precision. Make that decision based on whether
17133 we have native support for the ARMv8.2-A 16-bit floating-point
17134 instructions or not. */
17135 return (TARGET_FP_F16INST
17136 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17137 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17138 case EXCESS_PRECISION_TYPE_IMPLICIT:
17139 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17140 default:
17141 gcc_unreachable ();
17143 return FLT_EVAL_METHOD_UNPREDICTABLE;
17146 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17147 scheduled for speculative execution. Reject the long-running division
17148 and square-root instructions. */
17150 static bool
17151 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17153 switch (get_attr_type (insn))
17155 case TYPE_SDIV:
17156 case TYPE_UDIV:
17157 case TYPE_FDIVS:
17158 case TYPE_FDIVD:
17159 case TYPE_FSQRTS:
17160 case TYPE_FSQRTD:
17161 case TYPE_NEON_FP_SQRT_S:
17162 case TYPE_NEON_FP_SQRT_D:
17163 case TYPE_NEON_FP_SQRT_S_Q:
17164 case TYPE_NEON_FP_SQRT_D_Q:
17165 case TYPE_NEON_FP_DIV_S:
17166 case TYPE_NEON_FP_DIV_D:
17167 case TYPE_NEON_FP_DIV_S_Q:
17168 case TYPE_NEON_FP_DIV_D_Q:
17169 return false;
17170 default:
17171 return true;
17175 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17177 static int
17178 aarch64_compute_pressure_classes (reg_class *classes)
17180 int i = 0;
17181 classes[i++] = GENERAL_REGS;
17182 classes[i++] = FP_REGS;
17183 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17184 registers need to go in PR_LO_REGS at some point during their
17185 lifetime. Splitting it into two halves has the effect of making
17186 all predicates count against PR_LO_REGS, so that we try whenever
17187 possible to restrict the number of live predicates to 8. This
17188 greatly reduces the amount of spilling in certain loops. */
17189 classes[i++] = PR_LO_REGS;
17190 classes[i++] = PR_HI_REGS;
17191 return i;
17194 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17196 static bool
17197 aarch64_can_change_mode_class (machine_mode from,
17198 machine_mode to, reg_class_t)
17200 /* See the comment at the head of aarch64-sve.md for details. */
17201 if (BYTES_BIG_ENDIAN
17202 && (aarch64_sve_data_mode_p (from) != aarch64_sve_data_mode_p (to)))
17203 return false;
17204 return true;
17207 /* Implement TARGET_EARLY_REMAT_MODES. */
17209 static void
17210 aarch64_select_early_remat_modes (sbitmap modes)
17212 /* SVE values are not normally live across a call, so it should be
17213 worth doing early rematerialization even in VL-specific mode. */
17214 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17216 machine_mode mode = (machine_mode) i;
17217 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17218 if (vec_flags & VEC_ANY_SVE)
17219 bitmap_set_bit (modes, i);
17223 /* Target-specific selftests. */
17225 #if CHECKING_P
17227 namespace selftest {
17229 /* Selftest for the RTL loader.
17230 Verify that the RTL loader copes with a dump from
17231 print_rtx_function. This is essentially just a test that class
17232 function_reader can handle a real dump, but it also verifies
17233 that lookup_reg_by_dump_name correctly handles hard regs.
17234 The presence of hard reg names in the dump means that the test is
17235 target-specific, hence it is in this file. */
17237 static void
17238 aarch64_test_loading_full_dump ()
17240 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17242 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17244 rtx_insn *insn_1 = get_insn_by_uid (1);
17245 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17247 rtx_insn *insn_15 = get_insn_by_uid (15);
17248 ASSERT_EQ (INSN, GET_CODE (insn_15));
17249 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17251 /* Verify crtl->return_rtx. */
17252 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17253 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17254 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17257 /* Run all target-specific selftests. */
17259 static void
17260 aarch64_run_selftests (void)
17262 aarch64_test_loading_full_dump ();
17265 } // namespace selftest
17267 #endif /* #if CHECKING_P */
17269 #undef TARGET_ADDRESS_COST
17270 #define TARGET_ADDRESS_COST aarch64_address_cost
17272 /* This hook determines whether unnamed bitfields affect the alignment
17273 of the containing structure. The hook returns true if the structure
17274 should inherit the alignment requirements of an unnamed bitfield's
17275 type. */
17276 #undef TARGET_ALIGN_ANON_BITFIELD
17277 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17279 #undef TARGET_ASM_ALIGNED_DI_OP
17280 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17282 #undef TARGET_ASM_ALIGNED_HI_OP
17283 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17285 #undef TARGET_ASM_ALIGNED_SI_OP
17286 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17288 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17289 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17290 hook_bool_const_tree_hwi_hwi_const_tree_true
17292 #undef TARGET_ASM_FILE_START
17293 #define TARGET_ASM_FILE_START aarch64_start_file
17295 #undef TARGET_ASM_OUTPUT_MI_THUNK
17296 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17298 #undef TARGET_ASM_SELECT_RTX_SECTION
17299 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17301 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17302 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17304 #undef TARGET_BUILD_BUILTIN_VA_LIST
17305 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17307 #undef TARGET_CALLEE_COPIES
17308 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17310 #undef TARGET_CAN_ELIMINATE
17311 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17313 #undef TARGET_CAN_INLINE_P
17314 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17316 #undef TARGET_CANNOT_FORCE_CONST_MEM
17317 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17319 #undef TARGET_CASE_VALUES_THRESHOLD
17320 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17322 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17323 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17325 /* Only the least significant bit is used for initialization guard
17326 variables. */
17327 #undef TARGET_CXX_GUARD_MASK_BIT
17328 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17330 #undef TARGET_C_MODE_FOR_SUFFIX
17331 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17333 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17334 #undef TARGET_DEFAULT_TARGET_FLAGS
17335 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17336 #endif
17338 #undef TARGET_CLASS_MAX_NREGS
17339 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17341 #undef TARGET_BUILTIN_DECL
17342 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17344 #undef TARGET_BUILTIN_RECIPROCAL
17345 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17347 #undef TARGET_C_EXCESS_PRECISION
17348 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17350 #undef TARGET_EXPAND_BUILTIN
17351 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17353 #undef TARGET_EXPAND_BUILTIN_VA_START
17354 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17356 #undef TARGET_FOLD_BUILTIN
17357 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17359 #undef TARGET_FUNCTION_ARG
17360 #define TARGET_FUNCTION_ARG aarch64_function_arg
17362 #undef TARGET_FUNCTION_ARG_ADVANCE
17363 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17365 #undef TARGET_FUNCTION_ARG_BOUNDARY
17366 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17368 #undef TARGET_FUNCTION_ARG_PADDING
17369 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17371 #undef TARGET_GET_RAW_RESULT_MODE
17372 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17373 #undef TARGET_GET_RAW_ARG_MODE
17374 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17376 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17377 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17379 #undef TARGET_FUNCTION_VALUE
17380 #define TARGET_FUNCTION_VALUE aarch64_function_value
17382 #undef TARGET_FUNCTION_VALUE_REGNO_P
17383 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17385 #undef TARGET_GIMPLE_FOLD_BUILTIN
17386 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17388 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17389 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17391 #undef TARGET_INIT_BUILTINS
17392 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17394 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17395 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17396 aarch64_ira_change_pseudo_allocno_class
17398 #undef TARGET_LEGITIMATE_ADDRESS_P
17399 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17401 #undef TARGET_LEGITIMATE_CONSTANT_P
17402 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17404 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17405 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17406 aarch64_legitimize_address_displacement
17408 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17409 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17411 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17412 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17413 aarch64_libgcc_floating_mode_supported_p
17415 #undef TARGET_MANGLE_TYPE
17416 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17418 #undef TARGET_MEMORY_MOVE_COST
17419 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17421 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17422 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17424 #undef TARGET_MUST_PASS_IN_STACK
17425 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17427 /* This target hook should return true if accesses to volatile bitfields
17428 should use the narrowest mode possible. It should return false if these
17429 accesses should use the bitfield container type. */
17430 #undef TARGET_NARROW_VOLATILE_BITFIELD
17431 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
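/* For illustration: with the hook above returning false, a volatile
   bit-field such as

     struct s { volatile unsigned int flags : 8; };

   is conceptually accessed through its 32-bit container (a word-sized
   load/store) rather than through the narrowest mode covering the field.  */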
17433 #undef TARGET_OPTION_OVERRIDE
17434 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17436 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17437 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17438 aarch64_override_options_after_change
17440 #undef TARGET_OPTION_SAVE
17441 #define TARGET_OPTION_SAVE aarch64_option_save
17443 #undef TARGET_OPTION_RESTORE
17444 #define TARGET_OPTION_RESTORE aarch64_option_restore
17446 #undef TARGET_OPTION_PRINT
17447 #define TARGET_OPTION_PRINT aarch64_option_print
17449 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17450 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17452 #undef TARGET_SET_CURRENT_FUNCTION
17453 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17455 #undef TARGET_PASS_BY_REFERENCE
17456 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17458 #undef TARGET_PREFERRED_RELOAD_CLASS
17459 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17461 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17462 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17464 #undef TARGET_PROMOTED_TYPE
17465 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17467 #undef TARGET_SECONDARY_RELOAD
17468 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17470 #undef TARGET_SHIFT_TRUNCATION_MASK
17471 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17473 #undef TARGET_SETUP_INCOMING_VARARGS
17474 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17476 #undef TARGET_STRUCT_VALUE_RTX
17477 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17479 #undef TARGET_REGISTER_MOVE_COST
17480 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17482 #undef TARGET_RETURN_IN_MEMORY
17483 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17485 #undef TARGET_RETURN_IN_MSB
17486 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17488 #undef TARGET_RTX_COSTS
17489 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17491 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17492 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17494 #undef TARGET_SCHED_ISSUE_RATE
17495 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17497 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17498 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17499 aarch64_sched_first_cycle_multipass_dfa_lookahead
17501 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17502 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17503 aarch64_first_cycle_multipass_dfa_lookahead_guard
17505 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17506 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17507 aarch64_get_separate_components
17509 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17510 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17511 aarch64_components_for_bb
17513 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17514 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17515 aarch64_disqualify_components
17517 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17518 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17519 aarch64_emit_prologue_components
17521 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17522 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17523 aarch64_emit_epilogue_components
17525 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17526 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17527 aarch64_set_handled_components
17529 #undef TARGET_TRAMPOLINE_INIT
17530 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17532 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17533 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17535 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17536 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17538 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17539 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17540 aarch64_builtin_support_vector_misalignment
17542 #undef TARGET_ARRAY_MODE
17543 #define TARGET_ARRAY_MODE aarch64_array_mode
17545 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17546 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17548 #undef TARGET_VECTORIZE_ADD_STMT_COST
17549 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17551 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17552 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17553 aarch64_builtin_vectorization_cost
17555 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17556 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17558 #undef TARGET_VECTORIZE_BUILTINS
17559 #define TARGET_VECTORIZE_BUILTINS
17561 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17562 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17563 aarch64_builtin_vectorized_function
17565 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17566 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17567 aarch64_autovectorize_vector_sizes
17569 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17570 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17571 aarch64_atomic_assign_expand_fenv
17573 /* Section anchor support. */
17575 #undef TARGET_MIN_ANCHOR_OFFSET
17576 #define TARGET_MIN_ANCHOR_OFFSET -256
17578 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17579 byte offset; we can do much more for larger data types, but have no way
17580 to determine the size of the access. We assume accesses are aligned. */
17581 #undef TARGET_MAX_ANCHOR_OFFSET
17582 #define TARGET_MAX_ANCHOR_OFFSET 4095
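/* For illustration: the [-256, 4095] anchor range above matches what a
   byte access can reach from an anchor register, e.g. (assuming x0 holds
   the section anchor):

     ldurb w1, [x0, #-256]   // smallest reachable signed offset
     ldrb  w1, [x0, #4095]   // largest unsigned immediate offset

   Wider accesses could reach further, but as the comment above says, the
   size of the eventual access is not known when the anchor is chosen.  */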
17584 #undef TARGET_VECTOR_ALIGNMENT
17585 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17587 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17588 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17589 aarch64_vectorize_preferred_vector_alignment
17590 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17591 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17592 aarch64_simd_vector_alignment_reachable
17594 /* vec_perm support. */
17596 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17597 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17598 aarch64_vectorize_vec_perm_const
17600 #undef TARGET_VECTORIZE_GET_MASK_MODE
17601 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17602 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17603 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17604 aarch64_empty_mask_is_expensive
17606 #undef TARGET_INIT_LIBFUNCS
17607 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17609 #undef TARGET_FIXED_CONDITION_CODE_REGS
17610 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17612 #undef TARGET_FLAGS_REGNUM
17613 #define TARGET_FLAGS_REGNUM CC_REGNUM
17615 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17616 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17618 #undef TARGET_ASAN_SHADOW_OFFSET
17619 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17621 #undef TARGET_LEGITIMIZE_ADDRESS
17622 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17624 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17625 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17627 #undef TARGET_CAN_USE_DOLOOP_P
17628 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17630 #undef TARGET_SCHED_ADJUST_PRIORITY
17631 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17633 #undef TARGET_SCHED_MACRO_FUSION_P
17634 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17636 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17637 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17639 #undef TARGET_SCHED_FUSION_PRIORITY
17640 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17642 #undef TARGET_UNSPEC_MAY_TRAP_P
17643 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17645 #undef TARGET_USE_PSEUDO_PIC_REG
17646 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17648 #undef TARGET_PRINT_OPERAND
17649 #define TARGET_PRINT_OPERAND aarch64_print_operand
17651 #undef TARGET_PRINT_OPERAND_ADDRESS
17652 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17654 #undef TARGET_OPTAB_SUPPORTED_P
17655 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17657 #undef TARGET_OMIT_STRUCT_RETURN_REG
17658 #define TARGET_OMIT_STRUCT_RETURN_REG true
17660 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17661 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17662 aarch64_dwarf_poly_indeterminate_value
17664 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
17665 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17666 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
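/* Note: 4 is bit 2 (1 << 2).  A "function pointer" with that bit set is
   treated as a pointer to a descriptor rather than to code, which lets
   nested functions be implemented without executable trampolines; bits 0
   and 1 are unavailable because the architecture/ABI already reserves
   them.  */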
17668 #undef TARGET_HARD_REGNO_NREGS
17669 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17670 #undef TARGET_HARD_REGNO_MODE_OK
17671 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17673 #undef TARGET_MODES_TIEABLE_P
17674 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17676 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17677 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17678 aarch64_hard_regno_call_part_clobbered
17680 #undef TARGET_CONSTANT_ALIGNMENT
17681 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17683 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17684 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17686 #undef TARGET_CAN_CHANGE_MODE_CLASS
17687 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17689 #undef TARGET_SELECT_EARLY_REMAT_MODES
17690 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17692 #if CHECKING_P
17693 #undef TARGET_RUN_TARGET_SELFTESTS
17694 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17695 #endif /* #if CHECKING_P */
17697 struct gcc_target targetm = TARGET_INITIALIZER;
17699 #include "gt-aarch64.h"