[aarch64] Add HiSilicon tsv110 CPU support
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 9c68025e1f19236305ab41b0f7ed0cbfa039d3e1
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Information about a legitimate vector immediate operand. */
82 struct simd_immediate_info
84 enum insn_type { MOV, MVN };
85 enum modifier_type { LSL, MSL };
87 simd_immediate_info () {}
88 simd_immediate_info (scalar_float_mode, rtx);
89 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
90 insn_type = MOV, modifier_type = LSL,
91 unsigned int = 0);
92 simd_immediate_info (scalar_mode, rtx, rtx);
94 /* The mode of the elements. */
95 scalar_mode elt_mode;
97 /* The value of each element if all elements are the same, or the
98 first value if the constant is a series. */
99 rtx value;
101 /* The value of the step if the constant is a series, null otherwise. */
102 rtx step;
104 /* The instruction to use to move the immediate into a vector. */
105 insn_type insn;
107 /* The kind of shift modifier to use, and the number of bits to shift.
108 This is (LSL, 0) if no shift is needed. */
109 modifier_type modifier;
110 unsigned int shift;
113 /* Construct a floating-point immediate in which each element has mode
114 ELT_MODE_IN and value VALUE_IN. */
115 inline simd_immediate_info
116 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
117 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
118 modifier (LSL), shift (0)
121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
122 and value VALUE_IN. The other parameters are as for the structure
123 fields. */
124 inline simd_immediate_info
125 ::simd_immediate_info (scalar_int_mode elt_mode_in,
126 unsigned HOST_WIDE_INT value_in,
127 insn_type insn_in, modifier_type modifier_in,
128 unsigned int shift_in)
129 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
130 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
134 and where element I is equal to VALUE_IN + I * STEP_IN. */
135 inline simd_immediate_info
136 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
137 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
138 modifier (LSL), shift (0)
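/* As an example of how these constructors are used (an illustrative
   sketch; the exact encoding is chosen by the immediate-validation
   routines further down), a V8HImode constant whose elements are all
   0x5600 is recorded as an 8-bit payload plus a shift:

     simd_immediate_info info (HImode, 0x56, simd_immediate_info::MOV,
                               simd_immediate_info::LSL, 8);

   while an SVE series constant such as { 1, 3, 5, ... } would use the
   (mode, value, step) form with value 1 and step 2.  */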
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel;
144 /* The number of 64-bit elements in an SVE vector. */
145 poly_uint16 aarch64_sve_vg;
147 #ifdef HAVE_AS_TLS
148 #undef TARGET_HAVE_TLS
149 #define TARGET_HAVE_TLS 1
150 #endif
152 static bool aarch64_composite_type_p (const_tree, machine_mode);
153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
154 const_tree,
155 machine_mode *, int *,
156 bool *);
157 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
158 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
159 static void aarch64_override_options_after_change (void);
160 static bool aarch64_vector_mode_supported_p (machine_mode);
161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
163 const_tree type,
164 int misalignment,
165 bool is_packed);
166 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
167 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
168 aarch64_addr_query_type);
170 /* Major revision number of the ARM Architecture implemented by the target. */
171 unsigned aarch64_architecture_version;
173 /* The processor for which instructions should be scheduled. */
174 enum aarch64_processor aarch64_tune = cortexa53;
176 /* Mask to specify which instruction scheduling options should be used. */
177 unsigned long aarch64_tune_flags = 0;
179 /* Global flag for PC relative loads. */
180 bool aarch64_pcrelative_literal_loads;
182 /* Global flag for whether frame pointer is enabled. */
183 bool aarch64_use_frame_pointer;
185 /* Support for command line parsing of boolean flags in the tuning
186 structures. */
187 struct aarch64_flag_desc
189 const char* name;
190 unsigned int flag;
193 #define AARCH64_FUSION_PAIR(name, internal_name) \
194 { name, AARCH64_FUSE_##internal_name },
195 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
197 { "none", AARCH64_FUSE_NOTHING },
198 #include "aarch64-fusion-pairs.def"
199 { "all", AARCH64_FUSE_ALL },
200 { NULL, AARCH64_FUSE_NOTHING }
203 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
204 { name, AARCH64_EXTRA_TUNE_##internal_name },
205 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
207 { "none", AARCH64_EXTRA_TUNE_NONE },
208 #include "aarch64-tuning-flags.def"
209 { "all", AARCH64_EXTRA_TUNE_ALL },
210 { NULL, AARCH64_EXTRA_TUNE_NONE }
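/* For example, the AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs",
   RENAME_FMA_REGS) entry in aarch64-tuning-flags.def expands in the
   table above to

     { "rename_fma_regs", AARCH64_EXTRA_TUNE_RENAME_FMA_REGS },

   so each table simply maps the user-visible flag name onto the
   corresponding internal bit, with "none" and "all" handled
   explicitly.  */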
213 /* Tuning parameters. */
215 static const struct cpu_addrcost_table generic_addrcost_table =
218 1, /* hi */
219 0, /* si */
220 0, /* di */
221 1, /* ti */
223 0, /* pre_modify */
224 0, /* post_modify */
225 0, /* register_offset */
226 0, /* register_sextend */
227 0, /* register_zextend */
228 0 /* imm_offset */
231 static const struct cpu_addrcost_table exynosm1_addrcost_table =
234 0, /* hi */
235 0, /* si */
236 0, /* di */
237 2, /* ti */
239 0, /* pre_modify */
240 0, /* post_modify */
241 1, /* register_offset */
242 1, /* register_sextend */
243 2, /* register_zextend */
244 0, /* imm_offset */
247 static const struct cpu_addrcost_table xgene1_addrcost_table =
250 1, /* hi */
251 0, /* si */
252 0, /* di */
253 1, /* ti */
255 1, /* pre_modify */
256 0, /* post_modify */
257 0, /* register_offset */
258 1, /* register_sextend */
259 1, /* register_zextend */
260 0, /* imm_offset */
263 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
266 1, /* hi */
267 1, /* si */
268 1, /* di */
269 2, /* ti */
271 0, /* pre_modify */
272 0, /* post_modify */
273 2, /* register_offset */
274 3, /* register_sextend */
275 3, /* register_zextend */
276 0, /* imm_offset */
279 static const struct cpu_addrcost_table tsv110_addrcost_table =
282 1, /* hi */
283 0, /* si */
284 0, /* di */
285 1, /* ti */
287 0, /* pre_modify */
288 0, /* post_modify */
289 0, /* register_offset */
290 1, /* register_sextend */
291 1, /* register_zextend */
292 0, /* imm_offset */
295 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
298 1, /* hi */
299 1, /* si */
300 1, /* di */
301 2, /* ti */
303 1, /* pre_modify */
304 1, /* post_modify */
305 3, /* register_offset */
306 3, /* register_sextend */
307 3, /* register_zextend */
308 2, /* imm_offset */
311 static const struct cpu_regmove_cost generic_regmove_cost =
313 1, /* GP2GP */
314 /* Avoid the use of slow int<->fp moves for spilling by setting
315 their cost higher than memmov_cost. */
316 5, /* GP2FP */
317 5, /* FP2GP */
318 2 /* FP2FP */
321 static const struct cpu_regmove_cost cortexa57_regmove_cost =
323 1, /* GP2GP */
324 /* Avoid the use of slow int<->fp moves for spilling by setting
325 their cost higher than memmov_cost. */
326 5, /* GP2FP */
327 5, /* FP2GP */
328 2 /* FP2FP */
331 static const struct cpu_regmove_cost cortexa53_regmove_cost =
333 1, /* GP2GP */
334 /* Avoid the use of slow int<->fp moves for spilling by setting
335 their cost higher than memmov_cost. */
336 5, /* GP2FP */
337 5, /* FP2GP */
338 2 /* FP2FP */
341 static const struct cpu_regmove_cost exynosm1_regmove_cost =
343 1, /* GP2GP */
344 /* Avoid the use of slow int<->fp moves for spilling by setting
345 their cost higher than memmov_cost (actual costs: 4 and 9). */
346 9, /* GP2FP */
347 9, /* FP2GP */
348 1 /* FP2FP */
351 static const struct cpu_regmove_cost thunderx_regmove_cost =
353 2, /* GP2GP */
354 2, /* GP2FP */
355 6, /* FP2GP */
356 4 /* FP2FP */
359 static const struct cpu_regmove_cost xgene1_regmove_cost =
361 1, /* GP2GP */
362 /* Avoid the use of slow int<->fp moves for spilling by setting
363 their cost higher than memmov_cost. */
364 8, /* GP2FP */
365 8, /* FP2GP */
366 2 /* FP2FP */
369 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
371 2, /* GP2GP */
372 /* Avoid the use of int<->fp moves for spilling. */
373 6, /* GP2FP */
374 6, /* FP2GP */
375 4 /* FP2FP */
378 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
380 1, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 8, /* GP2FP */
383 8, /* FP2GP */
384 4 /* FP2FP */
387 static const struct cpu_regmove_cost tsv110_regmove_cost =
389 1, /* GP2GP */
390 /* Avoid the use of slow int<->fp moves for spilling by setting
391 their cost higher than memmov_cost. */
392 2, /* GP2FP */
393 3, /* FP2GP */
394 2 /* FP2FP */
397 /* Generic costs for vector insn classes. */
398 static const struct cpu_vector_cost generic_vector_cost =
400 1, /* scalar_int_stmt_cost */
401 1, /* scalar_fp_stmt_cost */
402 1, /* scalar_load_cost */
403 1, /* scalar_store_cost */
404 1, /* vec_int_stmt_cost */
405 1, /* vec_fp_stmt_cost */
406 2, /* vec_permute_cost */
407 1, /* vec_to_scalar_cost */
408 1, /* scalar_to_vec_cost */
409 1, /* vec_align_load_cost */
410 1, /* vec_unalign_load_cost */
411 1, /* vec_unalign_store_cost */
412 1, /* vec_store_cost */
413 3, /* cond_taken_branch_cost */
414 1 /* cond_not_taken_branch_cost */
417 /* QDF24XX costs for vector insn classes. */
418 static const struct cpu_vector_cost qdf24xx_vector_cost =
420 1, /* scalar_int_stmt_cost */
421 1, /* scalar_fp_stmt_cost */
422 1, /* scalar_load_cost */
423 1, /* scalar_store_cost */
424 1, /* vec_int_stmt_cost */
425 3, /* vec_fp_stmt_cost */
426 2, /* vec_permute_cost */
427 1, /* vec_to_scalar_cost */
428 1, /* scalar_to_vec_cost */
429 1, /* vec_align_load_cost */
430 1, /* vec_unalign_load_cost */
431 1, /* vec_unalign_store_cost */
432 1, /* vec_store_cost */
433 3, /* cond_taken_branch_cost */
434 1 /* cond_not_taken_branch_cost */
437 /* ThunderX costs for vector insn classes. */
438 static const struct cpu_vector_cost thunderx_vector_cost =
440 1, /* scalar_int_stmt_cost */
441 1, /* scalar_fp_stmt_cost */
442 3, /* scalar_load_cost */
443 1, /* scalar_store_cost */
444 4, /* vec_int_stmt_cost */
445 1, /* vec_fp_stmt_cost */
446 4, /* vec_permute_cost */
447 2, /* vec_to_scalar_cost */
448 2, /* scalar_to_vec_cost */
449 3, /* vec_align_load_cost */
450 5, /* vec_unalign_load_cost */
451 5, /* vec_unalign_store_cost */
452 1, /* vec_store_cost */
453 3, /* cond_taken_branch_cost */
454 3 /* cond_not_taken_branch_cost */
457 static const struct cpu_vector_cost tsv110_vector_cost =
459 1, /* scalar_int_stmt_cost */
460 1, /* scalar_fp_stmt_cost */
461 5, /* scalar_load_cost */
462 1, /* scalar_store_cost */
463 2, /* vec_int_stmt_cost */
464 2, /* vec_fp_stmt_cost */
465 2, /* vec_permute_cost */
466 3, /* vec_to_scalar_cost */
467 2, /* scalar_to_vec_cost */
468 5, /* vec_align_load_cost */
469 5, /* vec_unalign_load_cost */
470 1, /* vec_unalign_store_cost */
471 1, /* vec_store_cost */
472 1, /* cond_taken_branch_cost */
473 1 /* cond_not_taken_branch_cost */
476 /* Cortex-A57 costs for vector insn classes. */
477 static const struct cpu_vector_cost cortexa57_vector_cost =
479 1, /* scalar_int_stmt_cost */
480 1, /* scalar_fp_stmt_cost */
481 4, /* scalar_load_cost */
482 1, /* scalar_store_cost */
483 2, /* vec_int_stmt_cost */
484 2, /* vec_fp_stmt_cost */
485 3, /* vec_permute_cost */
486 8, /* vec_to_scalar_cost */
487 8, /* scalar_to_vec_cost */
488 4, /* vec_align_load_cost */
489 4, /* vec_unalign_load_cost */
490 1, /* vec_unalign_store_cost */
491 1, /* vec_store_cost */
492 1, /* cond_taken_branch_cost */
493 1 /* cond_not_taken_branch_cost */
496 static const struct cpu_vector_cost exynosm1_vector_cost =
498 1, /* scalar_int_stmt_cost */
499 1, /* scalar_fp_stmt_cost */
500 5, /* scalar_load_cost */
501 1, /* scalar_store_cost */
502 3, /* vec_int_stmt_cost */
503 3, /* vec_fp_stmt_cost */
504 3, /* vec_permute_cost */
505 3, /* vec_to_scalar_cost */
506 3, /* scalar_to_vec_cost */
507 5, /* vec_align_load_cost */
508 5, /* vec_unalign_load_cost */
509 1, /* vec_unalign_store_cost */
510 1, /* vec_store_cost */
511 1, /* cond_taken_branch_cost */
512 1 /* cond_not_taken_branch_cost */
515 /* X-Gene 1 costs for vector insn classes. */
516 static const struct cpu_vector_cost xgene1_vector_cost =
518 1, /* scalar_int_stmt_cost */
519 1, /* scalar_fp_stmt_cost */
520 5, /* scalar_load_cost */
521 1, /* scalar_store_cost */
522 2, /* vec_int_stmt_cost */
523 2, /* vec_fp_stmt_cost */
524 2, /* vec_permute_cost */
525 4, /* vec_to_scalar_cost */
526 4, /* scalar_to_vec_cost */
527 10, /* vec_align_load_cost */
528 10, /* vec_unalign_load_cost */
529 2, /* vec_unalign_store_cost */
530 2, /* vec_store_cost */
531 2, /* cond_taken_branch_cost */
532 1 /* cond_not_taken_branch_cost */
535 /* Costs for vector insn classes for Vulcan. */
536 static const struct cpu_vector_cost thunderx2t99_vector_cost =
538 1, /* scalar_int_stmt_cost */
539 6, /* scalar_fp_stmt_cost */
540 4, /* scalar_load_cost */
541 1, /* scalar_store_cost */
542 5, /* vec_int_stmt_cost */
543 6, /* vec_fp_stmt_cost */
544 3, /* vec_permute_cost */
545 6, /* vec_to_scalar_cost */
546 5, /* scalar_to_vec_cost */
547 8, /* vec_align_load_cost */
548 8, /* vec_unalign_load_cost */
549 4, /* vec_unalign_store_cost */
550 4, /* vec_store_cost */
551 2, /* cond_taken_branch_cost */
552 1 /* cond_not_taken_branch_cost */
555 /* Generic costs for branch instructions. */
556 static const struct cpu_branch_cost generic_branch_cost =
558 1, /* Predictable. */
559 3 /* Unpredictable. */
562 /* Generic approximation modes. */
563 static const cpu_approx_modes generic_approx_modes =
565 AARCH64_APPROX_NONE, /* division */
566 AARCH64_APPROX_NONE, /* sqrt */
567 AARCH64_APPROX_NONE /* recip_sqrt */
570 /* Approximation modes for Exynos M1. */
571 static const cpu_approx_modes exynosm1_approx_modes =
573 AARCH64_APPROX_NONE, /* division */
574 AARCH64_APPROX_ALL, /* sqrt */
575 AARCH64_APPROX_ALL /* recip_sqrt */
578 /* Approximation modes for X-Gene 1. */
579 static const cpu_approx_modes xgene1_approx_modes =
581 AARCH64_APPROX_NONE, /* division */
582 AARCH64_APPROX_NONE, /* sqrt */
583 AARCH64_APPROX_ALL /* recip_sqrt */
586 /* Generic prefetch settings (which disable prefetch). */
587 static const cpu_prefetch_tune generic_prefetch_tune =
589 0, /* num_slots */
590 -1, /* l1_cache_size */
591 -1, /* l1_cache_line_size */
592 -1, /* l2_cache_size */
593 true, /* prefetch_dynamic_strides */
594 -1, /* minimum_stride */
595 -1 /* default_opt_level */
598 static const cpu_prefetch_tune exynosm1_prefetch_tune =
600 0, /* num_slots */
601 -1, /* l1_cache_size */
602 64, /* l1_cache_line_size */
603 -1, /* l2_cache_size */
604 true, /* prefetch_dynamic_strides */
605 -1, /* minimum_stride */
606 -1 /* default_opt_level */
609 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
611 4, /* num_slots */
612 32, /* l1_cache_size */
613 64, /* l1_cache_line_size */
614 512, /* l2_cache_size */
615 false, /* prefetch_dynamic_strides */
616 2048, /* minimum_stride */
617 3 /* default_opt_level */
620 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
622 8, /* num_slots */
623 32, /* l1_cache_size */
624 128, /* l1_cache_line_size */
625 16*1024, /* l2_cache_size */
626 true, /* prefetch_dynamic_strides */
627 -1, /* minimum_stride */
628 3 /* default_opt_level */
631 static const cpu_prefetch_tune thunderx_prefetch_tune =
633 8, /* num_slots */
634 32, /* l1_cache_size */
635 128, /* l1_cache_line_size */
636 -1, /* l2_cache_size */
637 true, /* prefetch_dynamic_strides */
638 -1, /* minimum_stride */
639 -1 /* default_opt_level */
642 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
644 8, /* num_slots */
645 32, /* l1_cache_size */
646 64, /* l1_cache_line_size */
647 256, /* l2_cache_size */
648 true, /* prefetch_dynamic_strides */
649 -1, /* minimum_stride */
650 -1 /* default_opt_level */
653 static const cpu_prefetch_tune tsv110_prefetch_tune =
655 0, /* num_slots */
656 64, /* l1_cache_size */
657 64, /* l1_cache_line_size */
658 512, /* l2_cache_size */
659 true, /* prefetch_dynamic_strides */
660 -1, /* minimum_stride */
661 -1 /* default_opt_level */
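/* These prefetch records feed the generic prefetching parameters once a
   CPU has been selected (a rough sketch of the intent rather than of the
   option machinery): num_slots seeds --param simultaneous-prefetches,
   the cache fields seed --param l1-cache-size, l1-cache-line-size and
   l2-cache-size, and default_opt_level, when not -1, is the lowest
   optimization level at which -fprefetch-loop-arrays is enabled by
   default.  So, for example,

     gcc -O2 -mcpu=tsv110 ...

   picks up a 64K L1 with 64-byte lines and a 512K L2, but does not
   implicitly enable -fprefetch-loop-arrays, since tsv110's
   default_opt_level is -1.  */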
664 static const struct tune_params generic_tunings =
666 &cortexa57_extra_costs,
667 &generic_addrcost_table,
668 &generic_regmove_cost,
669 &generic_vector_cost,
670 &generic_branch_cost,
671 &generic_approx_modes,
672 4, /* memmov_cost */
673 2, /* issue_rate */
674 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
675 "8", /* function_align. */
676 "4", /* jump_align. */
677 "8", /* loop_align. */
678 2, /* int_reassoc_width. */
679 4, /* fp_reassoc_width. */
680 1, /* vec_reassoc_width. */
681 2, /* min_div_recip_mul_sf. */
682 2, /* min_div_recip_mul_df. */
683 0, /* max_case_values. */
684 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
685 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
686 &generic_prefetch_tune
689 static const struct tune_params cortexa35_tunings =
691 &cortexa53_extra_costs,
692 &generic_addrcost_table,
693 &cortexa53_regmove_cost,
694 &generic_vector_cost,
695 &generic_branch_cost,
696 &generic_approx_modes,
697 4, /* memmov_cost */
698 1, /* issue_rate */
699 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
700 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
701 "16", /* function_align. */
702 "4", /* jump_align. */
703 "8", /* loop_align. */
704 2, /* int_reassoc_width. */
705 4, /* fp_reassoc_width. */
706 1, /* vec_reassoc_width. */
707 2, /* min_div_recip_mul_sf. */
708 2, /* min_div_recip_mul_df. */
709 0, /* max_case_values. */
710 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
711 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
712 &generic_prefetch_tune
715 static const struct tune_params cortexa53_tunings =
717 &cortexa53_extra_costs,
718 &generic_addrcost_table,
719 &cortexa53_regmove_cost,
720 &generic_vector_cost,
721 &generic_branch_cost,
722 &generic_approx_modes,
723 4, /* memmov_cost */
724 2, /* issue_rate */
725 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
726 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
727 "16", /* function_align. */
728 "4", /* jump_align. */
729 "8", /* loop_align. */
730 2, /* int_reassoc_width. */
731 4, /* fp_reassoc_width. */
732 1, /* vec_reassoc_width. */
733 2, /* min_div_recip_mul_sf. */
734 2, /* min_div_recip_mul_df. */
735 0, /* max_case_values. */
736 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
737 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
738 &generic_prefetch_tune
741 static const struct tune_params cortexa57_tunings =
743 &cortexa57_extra_costs,
744 &generic_addrcost_table,
745 &cortexa57_regmove_cost,
746 &cortexa57_vector_cost,
747 &generic_branch_cost,
748 &generic_approx_modes,
749 4, /* memmov_cost */
750 3, /* issue_rate */
751 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
752 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
753 "16", /* function_align. */
754 "4", /* jump_align. */
755 "8", /* loop_align. */
756 2, /* int_reassoc_width. */
757 4, /* fp_reassoc_width. */
758 1, /* vec_reassoc_width. */
759 2, /* min_div_recip_mul_sf. */
760 2, /* min_div_recip_mul_df. */
761 0, /* max_case_values. */
762 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
763 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
764 &generic_prefetch_tune
767 static const struct tune_params cortexa72_tunings =
769 &cortexa57_extra_costs,
770 &generic_addrcost_table,
771 &cortexa57_regmove_cost,
772 &cortexa57_vector_cost,
773 &generic_branch_cost,
774 &generic_approx_modes,
775 4, /* memmov_cost */
776 3, /* issue_rate */
777 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
778 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
779 "16", /* function_align. */
780 "4", /* jump_align. */
781 "8", /* loop_align. */
782 2, /* int_reassoc_width. */
783 4, /* fp_reassoc_width. */
784 1, /* vec_reassoc_width. */
785 2, /* min_div_recip_mul_sf. */
786 2, /* min_div_recip_mul_df. */
787 0, /* max_case_values. */
788 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
789 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
790 &generic_prefetch_tune
793 static const struct tune_params cortexa73_tunings =
795 &cortexa57_extra_costs,
796 &generic_addrcost_table,
797 &cortexa57_regmove_cost,
798 &cortexa57_vector_cost,
799 &generic_branch_cost,
800 &generic_approx_modes,
801 4, /* memmov_cost. */
802 2, /* issue_rate. */
803 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
804 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
805 "16", /* function_align. */
806 "4", /* jump_align. */
807 "8", /* loop_align. */
808 2, /* int_reassoc_width. */
809 4, /* fp_reassoc_width. */
810 1, /* vec_reassoc_width. */
811 2, /* min_div_recip_mul_sf. */
812 2, /* min_div_recip_mul_df. */
813 0, /* max_case_values. */
814 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
815 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
816 &generic_prefetch_tune
821 static const struct tune_params exynosm1_tunings =
823 &exynosm1_extra_costs,
824 &exynosm1_addrcost_table,
825 &exynosm1_regmove_cost,
826 &exynosm1_vector_cost,
827 &generic_branch_cost,
828 &exynosm1_approx_modes,
829 4, /* memmov_cost */
830 3, /* issue_rate */
831 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
832 "4", /* function_align. */
833 "4", /* jump_align. */
834 "4", /* loop_align. */
835 2, /* int_reassoc_width. */
836 4, /* fp_reassoc_width. */
837 1, /* vec_reassoc_width. */
838 2, /* min_div_recip_mul_sf. */
839 2, /* min_div_recip_mul_df. */
840 48, /* max_case_values. */
841 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
842 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
843 &exynosm1_prefetch_tune
846 static const struct tune_params thunderxt88_tunings =
848 &thunderx_extra_costs,
849 &generic_addrcost_table,
850 &thunderx_regmove_cost,
851 &thunderx_vector_cost,
852 &generic_branch_cost,
853 &generic_approx_modes,
854 6, /* memmov_cost */
855 2, /* issue_rate */
856 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
857 "8", /* function_align. */
858 "8", /* jump_align. */
859 "8", /* loop_align. */
860 2, /* int_reassoc_width. */
861 4, /* fp_reassoc_width. */
862 1, /* vec_reassoc_width. */
863 2, /* min_div_recip_mul_sf. */
864 2, /* min_div_recip_mul_df. */
865 0, /* max_case_values. */
866 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
867 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
868 &thunderxt88_prefetch_tune
871 static const struct tune_params thunderx_tunings =
873 &thunderx_extra_costs,
874 &generic_addrcost_table,
875 &thunderx_regmove_cost,
876 &thunderx_vector_cost,
877 &generic_branch_cost,
878 &generic_approx_modes,
879 6, /* memmov_cost */
880 2, /* issue_rate */
881 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
882 "8", /* function_align. */
883 "8", /* jump_align. */
884 "8", /* loop_align. */
885 2, /* int_reassoc_width. */
886 4, /* fp_reassoc_width. */
887 1, /* vec_reassoc_width. */
888 2, /* min_div_recip_mul_sf. */
889 2, /* min_div_recip_mul_df. */
890 0, /* max_case_values. */
891 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
892 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
893 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
894 &thunderx_prefetch_tune
897 static const struct tune_params tsv110_tunings =
899 &tsv110_extra_costs,
900 &tsv110_addrcost_table,
901 &tsv110_regmove_cost,
902 &tsv110_vector_cost,
903 &generic_branch_cost,
904 &generic_approx_modes,
905 4, /* memmov_cost */
906 4, /* issue_rate */
907 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
908 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
909 "16", /* function_align. */
910 "4", /* jump_align. */
911 "8", /* loop_align. */
912 2, /* int_reassoc_width. */
913 4, /* fp_reassoc_width. */
914 1, /* vec_reassoc_width. */
915 2, /* min_div_recip_mul_sf. */
916 2, /* min_div_recip_mul_df. */
917 0, /* max_case_values. */
918 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
919 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
920 &tsv110_prefetch_tune
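/* tsv110_tunings is wired up through aarch64-cores.def: the AARCH64_CORE
   entry for "tsv110" passes tsv110 as its COSTS argument, and the
   all_cores[] table further down pastes that into &tsv110_tunings via
   the &COSTS##_tunings token concatenation.  Schematically (the real
   flags and implementer/part IDs live in aarch64-cores.def):

     AARCH64_CORE ("tsv110", tsv110, tsv110, ARCH, FLAGS,
                   tsv110, IMP, PART, VARIANT)

   which is also what makes -mcpu=tsv110 and -mtune=tsv110 select this
   structure.  */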
923 static const struct tune_params xgene1_tunings =
925 &xgene1_extra_costs,
926 &xgene1_addrcost_table,
927 &xgene1_regmove_cost,
928 &xgene1_vector_cost,
929 &generic_branch_cost,
930 &xgene1_approx_modes,
931 6, /* memmov_cost */
932 4, /* issue_rate */
933 AARCH64_FUSE_NOTHING, /* fusible_ops */
934 "16", /* function_align. */
935 "8", /* jump_align. */
936 "16", /* loop_align. */
937 2, /* int_reassoc_width. */
938 4, /* fp_reassoc_width. */
939 1, /* vec_reassoc_width. */
940 2, /* min_div_recip_mul_sf. */
941 2, /* min_div_recip_mul_df. */
942 0, /* max_case_values. */
943 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
944 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
945 &generic_prefetch_tune
948 static const struct tune_params qdf24xx_tunings =
950 &qdf24xx_extra_costs,
951 &qdf24xx_addrcost_table,
952 &qdf24xx_regmove_cost,
953 &qdf24xx_vector_cost,
954 &generic_branch_cost,
955 &generic_approx_modes,
956 4, /* memmov_cost */
957 4, /* issue_rate */
958 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
959 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
960 "16", /* function_align. */
961 "8", /* jump_align. */
962 "16", /* loop_align. */
963 2, /* int_reassoc_width. */
964 4, /* fp_reassoc_width. */
965 1, /* vec_reassoc_width. */
966 2, /* min_div_recip_mul_sf. */
967 2, /* min_div_recip_mul_df. */
968 0, /* max_case_values. */
969 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
970 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
971 &qdf24xx_prefetch_tune
974 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
975 for now. */
976 static const struct tune_params saphira_tunings =
978 &generic_extra_costs,
979 &generic_addrcost_table,
980 &generic_regmove_cost,
981 &generic_vector_cost,
982 &generic_branch_cost,
983 &generic_approx_modes,
984 4, /* memmov_cost */
985 4, /* issue_rate */
986 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
987 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
988 "16", /* function_align. */
989 "8", /* jump_align. */
990 "16", /* loop_align. */
991 2, /* int_reassoc_width. */
992 4, /* fp_reassoc_width. */
993 1, /* vec_reassoc_width. */
994 2, /* min_div_recip_mul_sf. */
995 2, /* min_div_recip_mul_df. */
996 0, /* max_case_values. */
997 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
998 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
999 &generic_prefetch_tune
1002 static const struct tune_params thunderx2t99_tunings =
1004 &thunderx2t99_extra_costs,
1005 &thunderx2t99_addrcost_table,
1006 &thunderx2t99_regmove_cost,
1007 &thunderx2t99_vector_cost,
1008 &generic_branch_cost,
1009 &generic_approx_modes,
1010 4, /* memmov_cost. */
1011 4, /* issue_rate. */
1012 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1013 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1014 "16", /* function_align. */
1015 "8", /* jump_align. */
1016 "16", /* loop_align. */
1017 3, /* int_reassoc_width. */
1018 2, /* fp_reassoc_width. */
1019 2, /* vec_reassoc_width. */
1020 2, /* min_div_recip_mul_sf. */
1021 2, /* min_div_recip_mul_df. */
1022 0, /* max_case_values. */
1023 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1024 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1025 &thunderx2t99_prefetch_tune
1028 /* Support for fine-grained override of the tuning structures. */
1029 struct aarch64_tuning_override_function
1031 const char* name;
1032 void (*parse_override)(const char*, struct tune_params*);
1035 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1036 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1038 static const struct aarch64_tuning_override_function
1039 aarch64_tuning_override_functions[] =
1041 { "fuse", aarch64_parse_fuse_string },
1042 { "tune", aarch64_parse_tune_string },
1043 { NULL, NULL }
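/* These parsers back the -moverride option; for instance (using names
   taken from the tables above)

     gcc -mcpu=cortex-a72 -moverride=tune=rename_fma_regs ...

   routes the "rename_fma_regs" string through aarch64_parse_tune_string
   and sets the matching AARCH64_EXTRA_TUNE_* bit in the active
   tune_params, while "fuse=..." strings go through
   aarch64_parse_fuse_string in the same way.  */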
1046 /* A processor implementing AArch64. */
1047 struct processor
1049 const char *const name;
1050 enum aarch64_processor ident;
1051 enum aarch64_processor sched_core;
1052 enum aarch64_arch arch;
1053 unsigned architecture_version;
1054 const unsigned long flags;
1055 const struct tune_params *const tune;
1058 /* Architectures implementing AArch64. */
1059 static const struct processor all_architectures[] =
1061 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1062 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1063 #include "aarch64-arches.def"
1064 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1067 /* Processor cores implementing AArch64. */
1068 static const struct processor all_cores[] =
1070 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1071 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1072 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1073 FLAGS, &COSTS##_tunings},
1074 #include "aarch64-cores.def"
1075 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1076 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1077 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1081 /* Target specification. These are populated by the -march, -mtune, -mcpu
1082 handling code or by target attributes. */
1083 static const struct processor *selected_arch;
1084 static const struct processor *selected_cpu;
1085 static const struct processor *selected_tune;
1087 /* The current tuning set. */
1088 struct tune_params aarch64_tune_params = generic_tunings;
1090 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1092 /* An ISA extension in the co-processor and main instruction set space. */
1093 struct aarch64_option_extension
1095 const char *const name;
1096 const unsigned long flags_on;
1097 const unsigned long flags_off;
1100 typedef enum aarch64_cond_code
1102 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1103 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1104 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1106 aarch64_cc;
1108 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1110 /* The condition codes of the processor, and the inverse function. */
1111 static const char * const aarch64_condition_codes[] =
1113 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1114 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1117 /* Generate code to enable conditional branches in functions over 1 MiB. */
1118 const char *
1119 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1120 const char * branch_format)
1122 rtx_code_label * tmp_label = gen_label_rtx ();
1123 char label_buf[256];
1124 char buffer[128];
1125 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1126 CODE_LABEL_NUMBER (tmp_label));
1127 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1128 rtx dest_label = operands[pos_label];
1129 operands[pos_label] = tmp_label;
1131 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1132 output_asm_insn (buffer, operands);
1134 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1135 operands[pos_label] = dest_label;
1136 output_asm_insn (buffer, operands);
1137 return "";
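/* Schematically, for a conditional branch whose target lies outside the
   +/-1 MiB conditional-branch range, the caller passes the inverted
   condition in BRANCH_FORMAT and this routine emits

       b.<inv_cond>  .Lfar_N
       b             .Ltarget
     .Lfar_N:

   so that only the unconditional branch needs the longer range (the
   label names here are purely illustrative).  */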
1140 void
1141 aarch64_err_no_fpadvsimd (machine_mode mode)
1143 if (TARGET_GENERAL_REGS_ONLY)
1144 if (FLOAT_MODE_P (mode))
1145 error ("%qs is incompatible with the use of floating-point types",
1146 "-mgeneral-regs-only");
1147 else
1148 error ("%qs is incompatible with the use of vector types",
1149 "-mgeneral-regs-only");
1150 else
1151 if (FLOAT_MODE_P (mode))
1152 error ("%qs feature modifier is incompatible with the use of"
1153 " floating-point types", "+nofp");
1154 else
1155 error ("%qs feature modifier is incompatible with the use of"
1156 " vector types", "+nofp");
1159 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1160 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1161 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1162 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1163 and GENERAL_REGS is lower than the memory cost (in this case the best class
1165 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1165 cost results in bad allocations with many redundant int<->FP moves which
1166 are expensive on various cores.
1167 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1168 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1169 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1170 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1171 The result of this is that it is no longer inefficient to have a higher
1172 memory move cost than the register move cost.
1175 static reg_class_t
1176 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1177 reg_class_t best_class)
1179 machine_mode mode;
1181 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1182 || !reg_class_subset_p (FP_REGS, allocno_class))
1183 return allocno_class;
1185 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1186 || !reg_class_subset_p (FP_REGS, best_class))
1187 return best_class;
1189 mode = PSEUDO_REGNO_MODE (regno);
1190 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1193 static unsigned int
1194 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1196 if (GET_MODE_UNIT_SIZE (mode) == 4)
1197 return aarch64_tune_params.min_div_recip_mul_sf;
1198 return aarch64_tune_params.min_div_recip_mul_df;
1201 /* Return the reassociation width of treeop OPC with mode MODE. */
1202 static int
1203 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1205 if (VECTOR_MODE_P (mode))
1206 return aarch64_tune_params.vec_reassoc_width;
1207 if (INTEGRAL_MODE_P (mode))
1208 return aarch64_tune_params.int_reassoc_width;
1209 /* Avoid reassociating floating point addition so we emit more FMAs. */
1210 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1211 return aarch64_tune_params.fp_reassoc_width;
1212 return 1;
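/* For instance, with the cortexa57 tunings above (int 2, fp 4, vec 1) a
   reassociable chain of DFmode multiplications can be split into up to
   four independent chains, while FP additions deliberately report a
   width of 1 so that the later FMA-forming code still sees the original
   a + b * c shapes.  */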
1215 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1216 unsigned
1217 aarch64_dbx_register_number (unsigned regno)
1219 if (GP_REGNUM_P (regno))
1220 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1221 else if (regno == SP_REGNUM)
1222 return AARCH64_DWARF_SP;
1223 else if (FP_REGNUM_P (regno))
1224 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1225 else if (PR_REGNUM_P (regno))
1226 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1227 else if (regno == VG_REGNUM)
1228 return AARCH64_DWARF_VG;
1230 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1231 equivalent DWARF register. */
1232 return DWARF_FRAME_REGISTERS;
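/* Under this mapping x0-x30 keep their numbers (DWARF 0-30), the stack
   pointer becomes DWARF 31, v0-v31 map onto DWARF 64-95 and the SVE
   predicate registers onto the AARCH64_DWARF_P0 range, following the
   AArch64 DWARF ABI; e.g. v3 maps to AARCH64_DWARF_V0 + 3 = 67.  */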
1235 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1236 static bool
1237 aarch64_advsimd_struct_mode_p (machine_mode mode)
1239 return (TARGET_SIMD
1240 && (mode == OImode || mode == CImode || mode == XImode));
1243 /* Return true if MODE is an SVE predicate mode. */
1244 static bool
1245 aarch64_sve_pred_mode_p (machine_mode mode)
1247 return (TARGET_SVE
1248 && (mode == VNx16BImode
1249 || mode == VNx8BImode
1250 || mode == VNx4BImode
1251 || mode == VNx2BImode));
1254 /* Three mutually-exclusive flags describing a vector or predicate type. */
1255 const unsigned int VEC_ADVSIMD = 1;
1256 const unsigned int VEC_SVE_DATA = 2;
1257 const unsigned int VEC_SVE_PRED = 4;
1258 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1259 a structure of 2, 3 or 4 vectors. */
1260 const unsigned int VEC_STRUCT = 8;
1261 /* Useful combinations of the above. */
1262 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1263 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1265 /* Return a set of flags describing the vector properties of mode MODE.
1266 Ignore modes that are not supported by the current target. */
1267 static unsigned int
1268 aarch64_classify_vector_mode (machine_mode mode)
1270 if (aarch64_advsimd_struct_mode_p (mode))
1271 return VEC_ADVSIMD | VEC_STRUCT;
1273 if (aarch64_sve_pred_mode_p (mode))
1274 return VEC_SVE_PRED;
1276 scalar_mode inner = GET_MODE_INNER (mode);
1277 if (VECTOR_MODE_P (mode)
1278 && (inner == QImode
1279 || inner == HImode
1280 || inner == HFmode
1281 || inner == SImode
1282 || inner == SFmode
1283 || inner == DImode
1284 || inner == DFmode))
1286 if (TARGET_SVE)
1288 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1289 return VEC_SVE_DATA;
1290 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1291 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1292 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1293 return VEC_SVE_DATA | VEC_STRUCT;
1296 /* This includes V1DF but not V1DI (which doesn't exist). */
1297 if (TARGET_SIMD
1298 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1299 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1300 return VEC_ADVSIMD;
1303 return 0;
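/* Some concrete classifications (assuming the relevant +simd/+sve
   features are enabled): V16QImode -> VEC_ADVSIMD, OImode ->
   VEC_ADVSIMD | VEC_STRUCT, VNx4SImode -> VEC_SVE_DATA, VNx4BImode ->
   VEC_SVE_PRED, and a scalar mode such as DImode -> 0.  */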
1306 /* Return true if MODE is any of the data vector modes, including
1307 structure modes. */
1308 static bool
1309 aarch64_vector_data_mode_p (machine_mode mode)
1311 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1314 /* Return true if MODE is an SVE data vector mode; either a single vector
1315 or a structure of vectors. */
1316 static bool
1317 aarch64_sve_data_mode_p (machine_mode mode)
1319 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1322 /* Implement target hook TARGET_ARRAY_MODE. */
1323 static opt_machine_mode
1324 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1326 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1327 && IN_RANGE (nelems, 2, 4))
1328 return mode_for_vector (GET_MODE_INNER (mode),
1329 GET_MODE_NUNITS (mode) * nelems);
1331 return opt_machine_mode ();
1334 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1335 static bool
1336 aarch64_array_mode_supported_p (machine_mode mode,
1337 unsigned HOST_WIDE_INT nelems)
1339 if (TARGET_SIMD
1340 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1341 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1342 && (nelems >= 2 && nelems <= 4))
1343 return true;
1345 return false;
1348 /* Return the SVE predicate mode to use for elements that have
1349 ELEM_NBYTES bytes, if such a mode exists. */
1351 opt_machine_mode
1352 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1354 if (TARGET_SVE)
1356 if (elem_nbytes == 1)
1357 return VNx16BImode;
1358 if (elem_nbytes == 2)
1359 return VNx8BImode;
1360 if (elem_nbytes == 4)
1361 return VNx4BImode;
1362 if (elem_nbytes == 8)
1363 return VNx2BImode;
1365 return opt_machine_mode ();
1368 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1370 static opt_machine_mode
1371 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1373 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1375 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1376 machine_mode pred_mode;
1377 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1378 return pred_mode;
1381 return default_get_mask_mode (nunits, nbytes);
1384 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1385 prefer to use the first arithmetic operand as the else value if
1386 the else value doesn't matter, since that exactly matches the SVE
1387 destructive merging form. For ternary operations we could either
1388 pick the first operand and use FMAD-like instructions or the last
1389 operand and use FMLA-like instructions; the latter seems more
1390 natural. */
1392 static tree
1393 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1395 return nops == 3 ? ops[2] : ops[0];
1398 /* Implement TARGET_HARD_REGNO_NREGS. */
1400 static unsigned int
1401 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1403 /* ??? Logically we should only need to provide a value when
1404 HARD_REGNO_MODE_OK says that the combination is valid,
1405 but at the moment we need to handle all modes. Just ignore
1406 any runtime parts for registers that can't store them. */
1407 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1408 switch (aarch64_regno_regclass (regno))
1410 case FP_REGS:
1411 case FP_LO_REGS:
1412 if (aarch64_sve_data_mode_p (mode))
1413 return exact_div (GET_MODE_SIZE (mode),
1414 BYTES_PER_SVE_VECTOR).to_constant ();
1415 return CEIL (lowest_size, UNITS_PER_VREG);
1416 case PR_REGS:
1417 case PR_LO_REGS:
1418 case PR_HI_REGS:
1419 return 1;
1420 default:
1421 return CEIL (lowest_size, UNITS_PER_WORD);
1423 gcc_unreachable ();
1426 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1428 static bool
1429 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1431 if (GET_MODE_CLASS (mode) == MODE_CC)
1432 return regno == CC_REGNUM;
1434 if (regno == VG_REGNUM)
1435 /* This must have the same size as _Unwind_Word. */
1436 return mode == DImode;
1438 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1439 if (vec_flags & VEC_SVE_PRED)
1440 return PR_REGNUM_P (regno);
1442 if (PR_REGNUM_P (regno))
1443 return 0;
1445 if (regno == SP_REGNUM)
1446 /* The purpose of comparing with ptr_mode is to support the
1447 global register variable associated with the stack pointer
1448 register via the syntax of asm ("wsp") in ILP32. */
1449 return mode == Pmode || mode == ptr_mode;
1451 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1452 return mode == Pmode;
1454 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1455 return true;
1457 if (FP_REGNUM_P (regno))
1459 if (vec_flags & VEC_STRUCT)
1460 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1461 else
1462 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1465 return false;
1468 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1469 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1470 clobbers the top 64 bits when restoring the bottom 64 bits. */
1472 static bool
1473 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1475 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1478 /* Implement REGMODE_NATURAL_SIZE. */
1479 poly_uint64
1480 aarch64_regmode_natural_size (machine_mode mode)
1482 /* The natural size for SVE data modes is one SVE data vector,
1483 and similarly for predicates. We can't independently modify
1484 anything smaller than that. */
1485 /* ??? For now, only do this for variable-width SVE registers.
1486 Doing it for constant-sized registers breaks lower-subreg.c. */
1487 /* ??? And once that's fixed, we should probably have similar
1488 code for Advanced SIMD. */
1489 if (!aarch64_sve_vg.is_constant ())
1491 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1492 if (vec_flags & VEC_SVE_PRED)
1493 return BYTES_PER_SVE_PRED;
1494 if (vec_flags & VEC_SVE_DATA)
1495 return BYTES_PER_SVE_VECTOR;
1497 return UNITS_PER_WORD;
1500 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1501 machine_mode
1502 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1503 machine_mode mode)
1505 /* The predicate mode determines which bits are significant and
1506 which are "don't care". Decreasing the number of lanes would
1507 lose data while increasing the number of lanes would make bits
1508 unnecessarily significant. */
1509 if (PR_REGNUM_P (regno))
1510 return mode;
1511 if (known_ge (GET_MODE_SIZE (mode), 4))
1512 return mode;
1513 else
1514 return SImode;
1517 /* Return true if I's bits are consecutive ones from the MSB. */
1518 bool
1519 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1521 return exact_log2 (-i) != HOST_WIDE_INT_M1;
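/* For example 0xffffffffffff0000 qualifies (-i == 0x10000, a power of
   two), whereas 0xffff00000000ffff does not; the test accepts exactly
   those values whose set bits form one contiguous block ending at the
   MSB.  */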
1524 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1525 that strcpy from constants will be faster. */
1527 static HOST_WIDE_INT
1528 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1530 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1531 return MAX (align, BITS_PER_WORD);
1532 return align;
1535 /* Return true if calls to DECL should be treated as
1536 long-calls (i.e. called via a register). */
1537 static bool
1538 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1540 return false;
1543 /* Return true if calls to symbol-ref SYM should be treated as
1544 long-calls (i.e. called via a register). */
1545 bool
1546 aarch64_is_long_call_p (rtx sym)
1548 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1551 /* Return true if calls to symbol-ref SYM should not go through
1552 plt stubs. */
1554 bool
1555 aarch64_is_noplt_call_p (rtx sym)
1557 const_tree decl = SYMBOL_REF_DECL (sym);
1559 if (flag_pic
1560 && decl
1561 && (!flag_plt
1562 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1563 && !targetm.binds_local_p (decl))
1564 return true;
1566 return false;
1569 /* Return true if the offsets to a zero/sign-extract operation
1570 represent an expression that matches an extend operation. The
1571 operands represent the parameters from
1573 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1574 bool
1575 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1576 rtx extract_imm)
1578 HOST_WIDE_INT mult_val, extract_val;
1580 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1581 return false;
1583 mult_val = INTVAL (mult_imm);
1584 extract_val = INTVAL (extract_imm);
1586 if (extract_val > 8
1587 && extract_val < GET_MODE_BITSIZE (mode)
1588 && exact_log2 (extract_val & ~7) > 0
1589 && (extract_val & 7) <= 4
1590 && mult_val == (1 << (extract_val & 7)))
1591 return true;
1593 return false;
1596 /* Emit an insn that's a simple single-set. Both the operands must be
1597 known to be valid. */
1598 inline static rtx_insn *
1599 emit_set_insn (rtx x, rtx y)
1601 return emit_insn (gen_rtx_SET (x, y));
1604 /* X and Y are two things to compare using CODE. Emit the compare insn and
1605 return the rtx for the CC register in the proper mode. */
1607 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1609 machine_mode mode = SELECT_CC_MODE (code, x, y);
1610 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1612 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1613 return cc_reg;
1616 /* Build the SYMBOL_REF for __tls_get_addr. */
1618 static GTY(()) rtx tls_get_addr_libfunc;
1621 aarch64_tls_get_addr (void)
1623 if (!tls_get_addr_libfunc)
1624 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1625 return tls_get_addr_libfunc;
1628 /* Return the TLS model to use for ADDR. */
1630 static enum tls_model
1631 tls_symbolic_operand_type (rtx addr)
1633 enum tls_model tls_kind = TLS_MODEL_NONE;
1634 if (GET_CODE (addr) == CONST)
1636 poly_int64 addend;
1637 rtx sym = strip_offset (addr, &addend);
1638 if (GET_CODE (sym) == SYMBOL_REF)
1639 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1641 else if (GET_CODE (addr) == SYMBOL_REF)
1642 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1644 return tls_kind;
1647 /* We allow lo_sum in our legitimate addresses so that combine can take
1648 care of combining addresses where necessary, but for generation
1649 purposes we generate the address as:
1651 RTL Absolute
1652 tmp = hi (symbol_ref); adrp x1, foo
1653 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1656 PIC TLS
1657 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1658 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1659 bl __tls_get_addr
1662 Load TLS symbol, depending on TLS mechanism and TLS access model.
1664 Global Dynamic - Traditional TLS:
1665 adrp tmp, :tlsgd:imm
1666 add dest, tmp, #:tlsgd_lo12:imm
1667 bl __tls_get_addr
1669 Global Dynamic - TLS Descriptors:
1670 adrp dest, :tlsdesc:imm
1671 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1672 add dest, dest, #:tlsdesc_lo12:imm
1673 blr tmp
1674 mrs tp, tpidr_el0
1675 add dest, dest, tp
1677 Initial Exec:
1678 mrs tp, tpidr_el0
1679 adrp tmp, :gottprel:imm
1680 ldr dest, [tmp, #:gottprel_lo12:imm]
1681 add dest, dest, tp
1683 Local Exec:
1684 mrs tp, tpidr_el0
1685 add t0, tp, #:tprel_hi12:imm, lsl #12
1686 add t0, t0, #:tprel_lo12_nc:imm
1689 static void
1690 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1691 enum aarch64_symbol_type type)
1693 switch (type)
1695 case SYMBOL_SMALL_ABSOLUTE:
1697 /* In ILP32, the mode of dest can be either SImode or DImode. */
1698 rtx tmp_reg = dest;
1699 machine_mode mode = GET_MODE (dest);
1701 gcc_assert (mode == Pmode || mode == ptr_mode);
1703 if (can_create_pseudo_p ())
1704 tmp_reg = gen_reg_rtx (mode);
1706 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1707 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1708 return;
1711 case SYMBOL_TINY_ABSOLUTE:
1712 emit_insn (gen_rtx_SET (dest, imm));
1713 return;
1715 case SYMBOL_SMALL_GOT_28K:
1717 machine_mode mode = GET_MODE (dest);
1718 rtx gp_rtx = pic_offset_table_rtx;
1719 rtx insn;
1720 rtx mem;
1722 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1723 here before rtl expand. Tree IVOPT will generate rtl pattern to
1724 decide rtx costs, in which case pic_offset_table_rtx is not
1725 initialized. For that case no need to generate the first adrp
1726 instruction as the final cost for global variable access is
1727 one instruction. */
1728 if (gp_rtx != NULL)
1730 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1731 use the page base as the GOT base, the first page may be wasted;
1732 in the worst case there is only 28K of space for the GOT).
1734 The generated instruction sequence for accessing a global variable is:
1737 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1739 Only one instruction is needed, but we must initialize
1740 pic_offset_table_rtx properly. We generate an initialization insn for
1741 every global access and let CSE remove the redundant ones.
1743 The final instruction sequence for multiple global variable accesses
1744 will look like the following:
1746 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1748 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1749 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1750 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1751 ... */
1753 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1754 crtl->uses_pic_offset_table = 1;
1755 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1757 if (mode != GET_MODE (gp_rtx))
1758 gp_rtx = gen_lowpart (mode, gp_rtx);
1762 if (mode == ptr_mode)
1764 if (mode == DImode)
1765 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1766 else
1767 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1769 mem = XVECEXP (SET_SRC (insn), 0, 0);
1771 else
1773 gcc_assert (mode == Pmode);
1775 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1776 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1779 /* The operand is expected to be a MEM. Whenever the related insn
1780 pattern changes, the above code which calculates MEM should also be
1781 updated. */
1782 gcc_assert (GET_CODE (mem) == MEM);
1783 MEM_READONLY_P (mem) = 1;
1784 MEM_NOTRAP_P (mem) = 1;
1785 emit_insn (insn);
1786 return;
1789 case SYMBOL_SMALL_GOT_4G:
1791 /* In ILP32, the mode of dest can be either SImode or DImode,
1792 while the got entry is always of SImode size. The mode of
1793 dest depends on how dest is used: if dest is assigned to a
1794 pointer (e.g. in the memory), it has SImode; it may have
1795 DImode if dest is dereferenced to access the memory.
1796 This is why we have to handle three different ldr_got_small
1797 patterns here (two patterns for ILP32). */
1799 rtx insn;
1800 rtx mem;
1801 rtx tmp_reg = dest;
1802 machine_mode mode = GET_MODE (dest);
1804 if (can_create_pseudo_p ())
1805 tmp_reg = gen_reg_rtx (mode);
1807 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1808 if (mode == ptr_mode)
1810 if (mode == DImode)
1811 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1812 else
1813 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1815 mem = XVECEXP (SET_SRC (insn), 0, 0);
1817 else
1819 gcc_assert (mode == Pmode);
1821 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1822 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1825 gcc_assert (GET_CODE (mem) == MEM);
1826 MEM_READONLY_P (mem) = 1;
1827 MEM_NOTRAP_P (mem) = 1;
1828 emit_insn (insn);
1829 return;
1832 case SYMBOL_SMALL_TLSGD:
1834 rtx_insn *insns;
1835 machine_mode mode = GET_MODE (dest);
1836 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1838 start_sequence ();
1839 if (TARGET_ILP32)
1840 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1841 else
1842 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1843 insns = get_insns ();
1844 end_sequence ();
1846 RTL_CONST_CALL_P (insns) = 1;
1847 emit_libcall_block (insns, dest, result, imm);
1848 return;
1851 case SYMBOL_SMALL_TLSDESC:
1853 machine_mode mode = GET_MODE (dest);
1854 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1855 rtx tp;
1857 gcc_assert (mode == Pmode || mode == ptr_mode);
1859 /* In ILP32, the got entry is always of SImode size. Unlike
1860 small GOT, the dest is fixed at reg 0. */
1861 if (TARGET_ILP32)
1862 emit_insn (gen_tlsdesc_small_si (imm));
1863 else
1864 emit_insn (gen_tlsdesc_small_di (imm));
1865 tp = aarch64_load_tp (NULL);
1867 if (mode != Pmode)
1868 tp = gen_lowpart (mode, tp);
1870 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1871 if (REG_P (dest))
1872 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1873 return;
1876 case SYMBOL_SMALL_TLSIE:
1878 /* In ILP32, the mode of dest can be either SImode or DImode,
1879 while the got entry is always of SImode size. The mode of
1880 dest depends on how dest is used: if dest is assigned to a
1881 pointer (e.g. in the memory), it has SImode; it may have
1882 DImode if dest is dereferenced to access the memory.
1883 This is why we have to handle three different tlsie_small
1884 patterns here (two patterns for ILP32). */
1885 machine_mode mode = GET_MODE (dest);
1886 rtx tmp_reg = gen_reg_rtx (mode);
1887 rtx tp = aarch64_load_tp (NULL);
1889 if (mode == ptr_mode)
1891 if (mode == DImode)
1892 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1893 else
1895 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1896 tp = gen_lowpart (mode, tp);
1899 else
1901 gcc_assert (mode == Pmode);
1902 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1905 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1906 if (REG_P (dest))
1907 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1908 return;
1911 case SYMBOL_TLSLE12:
1912 case SYMBOL_TLSLE24:
1913 case SYMBOL_TLSLE32:
1914 case SYMBOL_TLSLE48:
1916 machine_mode mode = GET_MODE (dest);
1917 rtx tp = aarch64_load_tp (NULL);
1919 if (mode != Pmode)
1920 tp = gen_lowpart (mode, tp);
1922 switch (type)
1924 case SYMBOL_TLSLE12:
1925 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1926 (dest, tp, imm));
1927 break;
1928 case SYMBOL_TLSLE24:
1929 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1930 (dest, tp, imm));
1931 break;
1932 case SYMBOL_TLSLE32:
1933 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1934 (dest, imm));
1935 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1936 (dest, dest, tp));
1937 break;
1938 case SYMBOL_TLSLE48:
1939 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1940 (dest, imm));
1941 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1942 (dest, dest, tp));
1943 break;
1944 default:
1945 gcc_unreachable ();
1948 if (REG_P (dest))
1949 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1950 return;
1953 case SYMBOL_TINY_GOT:
1954 emit_insn (gen_ldr_got_tiny (dest, imm));
1955 return;
1957 case SYMBOL_TINY_TLSIE:
1959 machine_mode mode = GET_MODE (dest);
1960 rtx tp = aarch64_load_tp (NULL);
1962 if (mode == ptr_mode)
1964 if (mode == DImode)
1965 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1966 else
1968 tp = gen_lowpart (mode, tp);
1969 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1972 else
1974 gcc_assert (mode == Pmode);
1975 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1978 if (REG_P (dest))
1979 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1980 return;
1983 default:
1984 gcc_unreachable ();
1988 /* Emit a move from SRC to DEST. Assume that the move expanders can
1989 handle all moves if !can_create_pseudo_p (). The distinction is
1990 important because, unlike emit_move_insn, the move expanders know
1991 how to force Pmode objects into the constant pool even when the
1992 constant pool address is not itself legitimate. */
1993 static rtx
1994 aarch64_emit_move (rtx dest, rtx src)
1996 return (can_create_pseudo_p ()
1997 ? emit_move_insn (dest, src)
1998 : emit_move_insn_1 (dest, src));
2001 /* Apply UNOPTAB to OP and store the result in DEST. */
2003 static void
2004 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2006 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2007 if (dest != tmp)
2008 emit_move_insn (dest, tmp);
2011 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2013 static void
2014 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2016 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2017 OPTAB_DIRECT);
2018 if (dest != tmp)
2019 emit_move_insn (dest, tmp);
2022 /* Split a 128-bit move operation into two 64-bit move operations,
2023 taking care to handle partial overlap of register to register
2024 copies. Special cases are needed when moving between GP regs and
2025 FP regs. SRC can be a register, constant or memory; DST a register
2026 or memory. If either operand is memory it must not have any side
2027 effects. */
2028 void
2029 aarch64_split_128bit_move (rtx dst, rtx src)
2031 rtx dst_lo, dst_hi;
2032 rtx src_lo, src_hi;
2034 machine_mode mode = GET_MODE (dst);
2036 gcc_assert (mode == TImode || mode == TFmode);
2037 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2038 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2040 if (REG_P (dst) && REG_P (src))
2042 int src_regno = REGNO (src);
2043 int dst_regno = REGNO (dst);
2045 /* Handle FP <-> GP regs. */
2046 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2048 src_lo = gen_lowpart (word_mode, src);
2049 src_hi = gen_highpart (word_mode, src);
2051 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2052 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2053 return;
2055 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2057 dst_lo = gen_lowpart (word_mode, dst);
2058 dst_hi = gen_highpart (word_mode, dst);
2060 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2061 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2062 return;
2066 dst_lo = gen_lowpart (word_mode, dst);
2067 dst_hi = gen_highpart (word_mode, dst);
2068 src_lo = gen_lowpart (word_mode, src);
2069 src_hi = gen_highpart_mode (word_mode, mode, src);
2071 /* At most one pairing may overlap. */
2072 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2074 aarch64_emit_move (dst_hi, src_hi);
2075 aarch64_emit_move (dst_lo, src_lo);
2077 else
2079 aarch64_emit_move (dst_lo, src_lo);
2080 aarch64_emit_move (dst_hi, src_hi);
2084 bool
2085 aarch64_split_128bit_move_p (rtx dst, rtx src)
2087 return (! REG_P (src)
2088 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2091 /* Split a complex SIMD combine. */
2093 void
2094 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2096 machine_mode src_mode = GET_MODE (src1);
2097 machine_mode dst_mode = GET_MODE (dst);
2099 gcc_assert (VECTOR_MODE_P (dst_mode));
2100 gcc_assert (register_operand (dst, dst_mode)
2101 && register_operand (src1, src_mode)
2102 && register_operand (src2, src_mode));
2104 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2105 return;
2108 /* Split a complex SIMD move. */
2110 void
2111 aarch64_split_simd_move (rtx dst, rtx src)
2113 machine_mode src_mode = GET_MODE (src);
2114 machine_mode dst_mode = GET_MODE (dst);
2116 gcc_assert (VECTOR_MODE_P (dst_mode));
2118 if (REG_P (dst) && REG_P (src))
2120 gcc_assert (VECTOR_MODE_P (src_mode));
2121 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
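/* Return true if X, in mode XMODE, is equal to the zero-extension of Y
   from mode YMODE.  */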
2125 bool
2126 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2127 machine_mode ymode, rtx y)
2129 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2130 gcc_assert (r != NULL);
2131 return rtx_equal_p (x, r);
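/* Force VALUE into a register of mode MODE, creating a new pseudo if
   register allocation is still allowed, otherwise using X (which must
   then be nonnull).  */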
2135 static rtx
2136 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2138 if (can_create_pseudo_p ())
2139 return force_reg (mode, value);
2140 else
2142 gcc_assert (x);
2143 aarch64_emit_move (x, value);
2144 return x;
2148 /* Return true if we can move VALUE into a register using a single
2149 CNT[BHWD] instruction. */
2151 static bool
2152 aarch64_sve_cnt_immediate_p (poly_int64 value)
2154 HOST_WIDE_INT factor = value.coeffs[0];
2155 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2156 return (value.coeffs[1] == factor
2157 && IN_RANGE (factor, 2, 16 * 16)
2158 && (factor & 1) == 0
2159 && factor <= 16 * (factor & -factor));
2162 /* Likewise for rtx X. */
2164 bool
2165 aarch64_sve_cnt_immediate_p (rtx x)
2167 poly_int64 value;
2168 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2171 /* Return the asm string for an instruction with a CNT-like vector size
2172 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2173 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2174 first part of the operands template (the part that comes before the
2175 vector size itself). FACTOR is the number of quadwords.
2176 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2177 If it is zero, we can use any element size. */
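/* For example, with PREFIX "cnt" and OPERANDS "%x0", a FACTOR of 2 with
   NELTS_PER_VQ of 0 gives "cntd\t%x0", while a FACTOR of 32 with
   NELTS_PER_VQ of 8 gives "cnth\t%x0, all, mul #4".  */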
2179 static char *
2180 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2181 unsigned int factor,
2182 unsigned int nelts_per_vq)
2184 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2186 if (nelts_per_vq == 0)
2187 /* There is some overlap in the ranges of the four CNT instructions.
2188 Here we always use the smallest possible element size, so that the
2189 multiplier is 1 wherever possible. */
2190 nelts_per_vq = factor & -factor;
2191 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2192 gcc_assert (IN_RANGE (shift, 1, 4));
2193 char suffix = "dwhb"[shift - 1];
2195 factor >>= shift;
2196 unsigned int written;
2197 if (factor == 1)
2198 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2199 prefix, suffix, operands);
2200 else
2201 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2202 prefix, suffix, operands, factor);
2203 gcc_assert (written < sizeof (buffer));
2204 return buffer;
2207 /* Return the asm string for an instruction with a CNT-like vector size
2208 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2209 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2210 first part of the operands template (the part that comes before the
2211 vector size itself). X is the value of the vector size operand,
2212 as a polynomial integer rtx. */
2214 char *
2215 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2216 rtx x)
2218 poly_int64 value = rtx_to_poly_int64 (x);
2219 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2220 return aarch64_output_sve_cnt_immediate (prefix, operands,
2221 value.coeffs[1], 0);
2224 /* Return true if we can add VALUE to a register using a single ADDVL
2225 or ADDPL instruction. */
2227 static bool
2228 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2230 HOST_WIDE_INT factor = value.coeffs[0];
2231 if (factor == 0 || value.coeffs[1] != factor)
2232 return false;
2233 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2234 and a value of 16 is one vector width. */
2235 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2236 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2239 /* Likewise for rtx X. */
2241 bool
2242 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2244 poly_int64 value;
2245 return (poly_int_rtx_p (x, &value)
2246 && aarch64_sve_addvl_addpl_immediate_p (value));
2249 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2250 and storing the result in operand 0. */
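/* For example, an offset of one vector of bytes (poly_int64 (16, 16))
   gives "addvl\t%x0, %x1, #1" and an offset of one predicate width
   (poly_int64 (2, 2)) gives "addpl\t%x0, %x1, #1"; when DEST equals BASE
   and is a GP register, the INC/DEC forms such as "incb\t%x0" are used
   instead.  */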
2252 char *
2253 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2255 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2256 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2257 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2259 /* Use INC or DEC if possible. */
2260 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2262 if (aarch64_sve_cnt_immediate_p (offset_value))
2263 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2264 offset_value.coeffs[1], 0);
2265 if (aarch64_sve_cnt_immediate_p (-offset_value))
2266 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2267 -offset_value.coeffs[1], 0);
2270 int factor = offset_value.coeffs[1];
2271 if ((factor & 15) == 0)
2272 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2273 else
2274 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2275 return buffer;
2278 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2279 instruction. If it is, store the number of elements in each vector
2280 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2281 factor in *FACTOR_OUT (if nonnull). */
2283 bool
2284 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2285 unsigned int *nelts_per_vq_out)
2287 rtx elt;
2288 poly_int64 value;
2290 if (!const_vec_duplicate_p (x, &elt)
2291 || !poly_int_rtx_p (elt, &value))
2292 return false;
2294 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2295 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2296 /* There's no vector INCB. */
2297 return false;
2299 HOST_WIDE_INT factor = value.coeffs[0];
2300 if (value.coeffs[1] != factor)
2301 return false;
2303 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2304 if ((factor % nelts_per_vq) != 0
2305 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2306 return false;
2308 if (factor_out)
2309 *factor_out = factor;
2310 if (nelts_per_vq_out)
2311 *nelts_per_vq_out = nelts_per_vq;
2312 return true;
2315 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2316 instruction. */
2318 bool
2319 aarch64_sve_inc_dec_immediate_p (rtx x)
2321 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2324 /* Return the asm template for an SVE vector INC or DEC instruction.
2325 OPERANDS gives the operands before the vector count and X is the
2326 value of the vector count operand itself. */
2328 char *
2329 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2331 int factor;
2332 unsigned int nelts_per_vq;
2333 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2334 gcc_unreachable ();
2335 if (factor < 0)
2336 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2337 nelts_per_vq);
2338 else
2339 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2340 nelts_per_vq);
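/* Return the number of instructions (1 to 4) required to move immediate
   IMM of mode MODE into DEST, emitting them if GENERATE is true.  For
   example, the DImode constant 0x0000cafe00001234 takes two instructions:
   a MOV of 0x1234 followed by a MOVK inserting 0xcafe at bit 32.  */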
2343 static int
2344 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2345 scalar_int_mode mode)
2347 int i;
2348 unsigned HOST_WIDE_INT val, val2, mask;
2349 int one_match, zero_match;
2350 int num_insns;
2352 val = INTVAL (imm);
2354 if (aarch64_move_imm (val, mode))
2356 if (generate)
2357 emit_insn (gen_rtx_SET (dest, imm));
2358 return 1;
2361 /* Check whether the low 32 bits form a valid SImode move immediate and
2362 at least one 16-bit half of the upper 32 bits is zero; if so, the
2363 constant needs at most a 32-bit move plus a single MOVK. */
2364 val2 = val & 0xffffffff;
2365 if (mode == DImode
2366 && aarch64_move_imm (val2, SImode)
2367 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2369 if (generate)
2370 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2372 /* Check whether a second instruction is needed, i.e. whether any of
2373 the upper 32 bits of the original DImode value are set. */
2374 if (val == val2)
2375 return 1;
2377 i = (val >> 48) ? 48 : 32;
2379 if (generate)
2380 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2381 GEN_INT ((val >> i) & 0xffff)));
2383 return 2;
2386 if ((val >> 32) == 0 || mode == SImode)
2388 if (generate)
2390 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2391 if (mode == SImode)
2392 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2393 GEN_INT ((val >> 16) & 0xffff)));
2394 else
2395 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2396 GEN_INT ((val >> 16) & 0xffff)));
2398 return 2;
2401 /* Remaining cases are all for DImode. */
2403 mask = 0xffff;
2404 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2405 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2406 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2407 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2409 if (zero_match != 2 && one_match != 2)
2411 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2412 For a 64-bit bitmask try whether changing 16 bits to all ones or
2413 zeroes creates a valid bitmask. To check any repeated bitmask,
2414 try using 16 bits from the other 32-bit half of val. */
2416 for (i = 0; i < 64; i += 16, mask <<= 16)
2418 val2 = val & ~mask;
2419 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2420 break;
2421 val2 = val | mask;
2422 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2423 break;
2424 val2 = val2 & ~mask;
2425 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2426 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2427 break;
2429 if (i != 64)
2431 if (generate)
2433 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2434 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2435 GEN_INT ((val >> i) & 0xffff)));
2437 return 2;
2441 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2442 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2443 otherwise skip zero bits. */
2445 num_insns = 1;
2446 mask = 0xffff;
2447 val2 = one_match > zero_match ? ~val : val;
2448 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2450 if (generate)
2451 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2452 ? (val | ~(mask << i))
2453 : (val & (mask << i)))));
2454 for (i += 16; i < 64; i += 16)
2456 if ((val2 & (mask << i)) == 0)
2457 continue;
2458 if (generate)
2459 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2460 GEN_INT ((val >> i) & 0xffff)));
2461 num_insns ++;
2464 return num_insns;
2467 /* Return whether IMM is a 128-bit immediate which is simple enough to
2468 expand inline. */
2469 bool
2470 aarch64_mov128_immediate (rtx imm)
2472 if (GET_CODE (imm) == CONST_INT)
2473 return true;
2475 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2477 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2478 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2480 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2481 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2485 /* Return the number of temporary registers that aarch64_add_offset_1
2486 would need to add OFFSET to a register. */
2488 static unsigned int
2489 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2491 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2494 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2495 a non-polynomial OFFSET. MODE is the mode of the addition.
2496 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2497 be set and CFA adjustments added to the generated instructions.
2499 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2500 temporary if register allocation is already complete. This temporary
2501 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2502 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2503 the immediate again.
2505 Since this function may be used to adjust the stack pointer, we must
2506 ensure that it cannot cause transient stack deallocation (for example
2507 by first incrementing SP and then decrementing when adjusting by a
2508 large immediate). */
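/* For example, an OFFSET of 0x123456, which cannot be handled by a single
   ADD immediate or a single move immediate, is split into
   "add dest, src, #0x456" followed by "add dest, dest, #0x123000".  */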
2510 static void
2511 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2512 rtx src, HOST_WIDE_INT offset, rtx temp1,
2513 bool frame_related_p, bool emit_move_imm)
2515 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2516 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2518 HOST_WIDE_INT moffset = abs_hwi (offset);
2519 rtx_insn *insn;
2521 if (!moffset)
2523 if (!rtx_equal_p (dest, src))
2525 insn = emit_insn (gen_rtx_SET (dest, src));
2526 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2528 return;
2531 /* Single instruction adjustment. */
2532 if (aarch64_uimm12_shift (moffset))
2534 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2535 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2536 return;
2539 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2540 and either:
2542 a) the offset cannot be loaded by a 16-bit move or
2543 b) there is no spare register into which we can move it. */
2544 if (moffset < 0x1000000
2545 && ((!temp1 && !can_create_pseudo_p ())
2546 || !aarch64_move_imm (moffset, mode)))
2548 HOST_WIDE_INT low_off = moffset & 0xfff;
2550 low_off = offset < 0 ? -low_off : low_off;
2551 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2552 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2553 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2554 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2555 return;
2558 /* Emit a move immediate if required and an addition/subtraction. */
2559 if (emit_move_imm)
2561 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2562 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2564 insn = emit_insn (offset < 0
2565 ? gen_sub3_insn (dest, src, temp1)
2566 : gen_add3_insn (dest, src, temp1));
2567 if (frame_related_p)
2569 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2570 rtx adj = plus_constant (mode, src, offset);
2571 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2575 /* Return the number of temporary registers that aarch64_add_offset
2576 would need to move OFFSET into a register or add OFFSET to a register;
2577 ADD_P is true if we want the latter rather than the former. */
2579 static unsigned int
2580 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2582 /* This follows the same structure as aarch64_add_offset. */
2583 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2584 return 0;
2586 unsigned int count = 0;
2587 HOST_WIDE_INT factor = offset.coeffs[1];
2588 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2589 poly_int64 poly_offset (factor, factor);
2590 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2591 /* Need one register for the ADDVL/ADDPL result. */
2592 count += 1;
2593 else if (factor != 0)
2595 factor = abs (factor);
2596 if (factor > 16 * (factor & -factor))
2597 /* Need one register for the CNT result and one for the multiplication
2598 factor. If necessary, the second temporary can be reused for the
2599 constant part of the offset. */
2600 return 2;
2601 /* Need one register for the CNT result (which might then
2602 be shifted). */
2603 count += 1;
2605 return count + aarch64_add_offset_1_temporaries (constant);
2608 /* If X can be represented as a poly_int64, return the number
2609 of temporaries that are required to add it to a register.
2610 Return -1 otherwise. */
2612 int
2613 aarch64_add_offset_temporaries (rtx x)
2615 poly_int64 offset;
2616 if (!poly_int_rtx_p (x, &offset))
2617 return -1;
2618 return aarch64_offset_temporaries (true, offset);
2621 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2622 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2623 be set and CFA adjustments added to the generated instructions.
2625 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2626 temporary if register allocation is already complete. This temporary
2627 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2628 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2629 false to avoid emitting the immediate again.
2631 TEMP2, if nonnull, is a second temporary register that doesn't
2632 overlap either DEST or SRC.
2634 Since this function may be used to adjust the stack pointer, we must
2635 ensure that it cannot cause transient stack deallocation (for example
2636 by first incrementing SP and then decrementing when adjusting by a
2637 large immediate). */
2639 static void
2640 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2641 poly_int64 offset, rtx temp1, rtx temp2,
2642 bool frame_related_p, bool emit_move_imm = true)
2644 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2645 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2646 gcc_assert (temp1 == NULL_RTX
2647 || !frame_related_p
2648 || !reg_overlap_mentioned_p (temp1, dest));
2649 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2651 /* Try using ADDVL or ADDPL to add the whole value. */
2652 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2654 rtx offset_rtx = gen_int_mode (offset, mode);
2655 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2656 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2657 return;
2660 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2661 SVE vector register, over and above the minimum size of 128 bits.
2662 This is equivalent to half the value returned by CNTD with a
2663 vector shape of ALL. */
2664 HOST_WIDE_INT factor = offset.coeffs[1];
2665 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2667 /* Try using ADDVL or ADDPL to add the VG-based part. */
2668 poly_int64 poly_offset (factor, factor);
2669 if (src != const0_rtx
2670 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2672 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2673 if (frame_related_p)
2675 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2676 RTX_FRAME_RELATED_P (insn) = true;
2677 src = dest;
2679 else
2681 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2682 src = aarch64_force_temporary (mode, temp1, addr);
2683 temp1 = temp2;
2684 temp2 = NULL_RTX;
2687 /* Otherwise use a CNT-based sequence. */
2688 else if (factor != 0)
2690 /* Use a subtraction if we have a negative factor. */
2691 rtx_code code = PLUS;
2692 if (factor < 0)
2694 factor = -factor;
2695 code = MINUS;
2698 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2699 into the multiplication. */
2700 rtx val;
2701 int shift = 0;
2702 if (factor & 1)
2703 /* Use a right shift by 1. */
2704 shift = -1;
2705 else
2706 factor /= 2;
2707 HOST_WIDE_INT low_bit = factor & -factor;
2708 if (factor <= 16 * low_bit)
2710 if (factor > 16 * 8)
2712 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2713 the value with the minimum multiplier and shift it into
2714 position. */
2715 int extra_shift = exact_log2 (low_bit);
2716 shift += extra_shift;
2717 factor >>= extra_shift;
2719 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2721 else
2723 /* Use CNTD, then multiply it by FACTOR. */
2724 val = gen_int_mode (poly_int64 (2, 2), mode);
2725 val = aarch64_force_temporary (mode, temp1, val);
2727 /* Go back to using a negative multiplication factor if we have
2728 no register from which to subtract. */
2729 if (code == MINUS && src == const0_rtx)
2731 factor = -factor;
2732 code = PLUS;
2734 rtx coeff1 = gen_int_mode (factor, mode);
2735 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2736 val = gen_rtx_MULT (mode, val, coeff1);
2739 if (shift > 0)
2741 /* Multiply by 1 << SHIFT. */
2742 val = aarch64_force_temporary (mode, temp1, val);
2743 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2745 else if (shift == -1)
2747 /* Divide by 2. */
2748 val = aarch64_force_temporary (mode, temp1, val);
2749 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2752 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2753 if (src != const0_rtx)
2755 val = aarch64_force_temporary (mode, temp1, val);
2756 val = gen_rtx_fmt_ee (code, mode, src, val);
2758 else if (code == MINUS)
2760 val = aarch64_force_temporary (mode, temp1, val);
2761 val = gen_rtx_NEG (mode, val);
2764 if (constant == 0 || frame_related_p)
2766 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2767 if (frame_related_p)
2769 RTX_FRAME_RELATED_P (insn) = true;
2770 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2771 gen_rtx_SET (dest, plus_constant (Pmode, src,
2772 poly_offset)));
2774 src = dest;
2775 if (constant == 0)
2776 return;
2778 else
2780 src = aarch64_force_temporary (mode, temp1, val);
2781 temp1 = temp2;
2782 temp2 = NULL_RTX;
2785 emit_move_imm = true;
2788 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2789 frame_related_p, emit_move_imm);
2792 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2793 than a poly_int64. */
2795 void
2796 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2797 rtx offset_rtx, rtx temp1, rtx temp2)
2799 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2800 temp1, temp2, false);
2803 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2804 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2805 if TEMP1 already contains abs (DELTA). */
2807 static inline void
2808 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2810 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2811 temp1, temp2, true, emit_move_imm);
2814 /* Subtract DELTA from the stack pointer, marking the instructions
2815 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2816 if nonnull. */
2818 static inline void
2819 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2821 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2822 temp1, temp2, frame_related_p);
2825 /* Set DEST to (vec_series BASE STEP). */
2827 static void
2828 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2830 machine_mode mode = GET_MODE (dest);
2831 scalar_mode inner = GET_MODE_INNER (mode);
2833 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2834 if (!aarch64_sve_index_immediate_p (base))
2835 base = force_reg (inner, base);
2836 if (!aarch64_sve_index_immediate_p (step))
2837 step = force_reg (inner, step);
2839 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2842 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2843 integer of mode INT_MODE. Return true on success. */
2845 static bool
2846 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2847 rtx src)
2849 /* If the constant is smaller than 128 bits, we can do the move
2850 using a vector of SRC_MODEs. */
2851 if (src_mode != TImode)
2853 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2854 GET_MODE_SIZE (src_mode));
2855 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2856 emit_move_insn (gen_lowpart (dup_mode, dest),
2857 gen_const_vec_duplicate (dup_mode, src));
2858 return true;
2861 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2862 src = force_const_mem (src_mode, src);
2863 if (!src)
2864 return false;
2866 /* Make sure that the address is legitimate. */
2867 if (!aarch64_sve_ld1r_operand_p (src))
2869 rtx addr = force_reg (Pmode, XEXP (src, 0));
2870 src = replace_equiv_address (src, addr);
2873 machine_mode mode = GET_MODE (dest);
2874 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2875 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2876 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2877 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2878 emit_insn (gen_rtx_SET (dest, src));
2879 return true;
2882 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2883 isn't a simple duplicate or series. */
2885 static void
2886 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2888 machine_mode mode = GET_MODE (src);
2889 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2890 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2891 gcc_assert (npatterns > 1);
2893 if (nelts_per_pattern == 1)
2895 /* The constant is a repeating sequence of at least two elements,
2896 where the repeating elements occupy no more than 128 bits.
2897 Get an integer representation of the replicated value. */
2898 scalar_int_mode int_mode;
2899 if (BYTES_BIG_ENDIAN)
2900 /* For now, always use LD1RQ to load the value on big-endian
2901 targets, since the handling of smaller integers includes a
2902 subreg that is semantically an element reverse. */
2903 int_mode = TImode;
2904 else
2906 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2907 gcc_assert (int_bits <= 128);
2908 int_mode = int_mode_for_size (int_bits, 0).require ();
2910 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2911 if (int_value
2912 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2913 return;
2916 /* Expand each pattern individually. */
2917 rtx_vector_builder builder;
2918 auto_vec<rtx, 16> vectors (npatterns);
2919 for (unsigned int i = 0; i < npatterns; ++i)
2921 builder.new_vector (mode, 1, nelts_per_pattern);
2922 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2923 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2924 vectors.quick_push (force_reg (mode, builder.build ()));
2927 /* Use permutes to interleave the separate vectors. */
2928 while (npatterns > 1)
2930 npatterns /= 2;
2931 for (unsigned int i = 0; i < npatterns; ++i)
2933 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2934 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2935 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2936 vectors[i] = tmp;
2939 gcc_assert (vectors[0] == dest);
2942 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2943 is a pattern that can be used to set DEST to a replicated scalar
2944 element. */
2946 void
2947 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2948 rtx (*gen_vec_duplicate) (rtx, rtx))
2950 machine_mode mode = GET_MODE (dest);
2952 /* Check on what type of symbol it is. */
2953 scalar_int_mode int_mode;
2954 if ((GET_CODE (imm) == SYMBOL_REF
2955 || GET_CODE (imm) == LABEL_REF
2956 || GET_CODE (imm) == CONST
2957 || GET_CODE (imm) == CONST_POLY_INT)
2958 && is_a <scalar_int_mode> (mode, &int_mode))
2960 rtx mem;
2961 poly_int64 offset;
2962 HOST_WIDE_INT const_offset;
2963 enum aarch64_symbol_type sty;
2965 /* If we have (const (plus symbol offset)), separate out the offset
2966 before we start classifying the symbol. */
2967 rtx base = strip_offset (imm, &offset);
2969 /* We must always add an offset involving VL separately, rather than
2970 folding it into the relocation. */
2971 if (!offset.is_constant (&const_offset))
2973 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2974 emit_insn (gen_rtx_SET (dest, imm));
2975 else
2977 /* Do arithmetic on 32-bit values if the result is smaller
2978 than that. */
2979 if (partial_subreg_p (int_mode, SImode))
2981 /* It is invalid to do symbol calculations in modes
2982 narrower than SImode. */
2983 gcc_assert (base == const0_rtx);
2984 dest = gen_lowpart (SImode, dest);
2985 int_mode = SImode;
2987 if (base != const0_rtx)
2989 base = aarch64_force_temporary (int_mode, dest, base);
2990 aarch64_add_offset (int_mode, dest, base, offset,
2991 NULL_RTX, NULL_RTX, false);
2993 else
2994 aarch64_add_offset (int_mode, dest, base, offset,
2995 dest, NULL_RTX, false);
2997 return;
3000 sty = aarch64_classify_symbol (base, const_offset);
3001 switch (sty)
3003 case SYMBOL_FORCE_TO_MEM:
3004 if (const_offset != 0
3005 && targetm.cannot_force_const_mem (int_mode, imm))
3007 gcc_assert (can_create_pseudo_p ());
3008 base = aarch64_force_temporary (int_mode, dest, base);
3009 aarch64_add_offset (int_mode, dest, base, const_offset,
3010 NULL_RTX, NULL_RTX, false);
3011 return;
3014 mem = force_const_mem (ptr_mode, imm);
3015 gcc_assert (mem);
3017 /* If we aren't generating PC relative literals, then
3018 we need to expand the literal pool access carefully.
3019 This is something that needs to be done in a number
3020 of places, so could well live as a separate function. */
3021 if (!aarch64_pcrelative_literal_loads)
3023 gcc_assert (can_create_pseudo_p ());
3024 base = gen_reg_rtx (ptr_mode);
3025 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3026 if (ptr_mode != Pmode)
3027 base = convert_memory_address (Pmode, base);
3028 mem = gen_rtx_MEM (ptr_mode, base);
3031 if (int_mode != ptr_mode)
3032 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3034 emit_insn (gen_rtx_SET (dest, mem));
3036 return;
3038 case SYMBOL_SMALL_TLSGD:
3039 case SYMBOL_SMALL_TLSDESC:
3040 case SYMBOL_SMALL_TLSIE:
3041 case SYMBOL_SMALL_GOT_28K:
3042 case SYMBOL_SMALL_GOT_4G:
3043 case SYMBOL_TINY_GOT:
3044 case SYMBOL_TINY_TLSIE:
3045 if (const_offset != 0)
3047 gcc_assert (can_create_pseudo_p ());
3048 base = aarch64_force_temporary (int_mode, dest, base);
3049 aarch64_add_offset (int_mode, dest, base, const_offset,
3050 NULL_RTX, NULL_RTX, false);
3051 return;
3053 /* FALLTHRU */
3055 case SYMBOL_SMALL_ABSOLUTE:
3056 case SYMBOL_TINY_ABSOLUTE:
3057 case SYMBOL_TLSLE12:
3058 case SYMBOL_TLSLE24:
3059 case SYMBOL_TLSLE32:
3060 case SYMBOL_TLSLE48:
3061 aarch64_load_symref_appropriately (dest, imm, sty);
3062 return;
3064 default:
3065 gcc_unreachable ();
3069 if (!CONST_INT_P (imm))
3071 rtx base, step, value;
3072 if (GET_CODE (imm) == HIGH
3073 || aarch64_simd_valid_immediate (imm, NULL))
3074 emit_insn (gen_rtx_SET (dest, imm));
3075 else if (const_vec_series_p (imm, &base, &step))
3076 aarch64_expand_vec_series (dest, base, step);
3077 else if (const_vec_duplicate_p (imm, &value))
3079 /* If the constant is out of range of an SVE vector move,
3080 load it from memory if we can, otherwise move it into
3081 a register and use a DUP. */
3082 scalar_mode inner_mode = GET_MODE_INNER (mode);
3083 rtx op = force_const_mem (inner_mode, value);
3084 if (!op)
3085 op = force_reg (inner_mode, value);
3086 else if (!aarch64_sve_ld1r_operand_p (op))
3088 rtx addr = force_reg (Pmode, XEXP (op, 0));
3089 op = replace_equiv_address (op, addr);
3091 emit_insn (gen_vec_duplicate (dest, op));
3093 else if (GET_CODE (imm) == CONST_VECTOR
3094 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3095 aarch64_expand_sve_const_vector (dest, imm);
3096 else
3098 rtx mem = force_const_mem (mode, imm);
3099 gcc_assert (mem);
3100 emit_move_insn (dest, mem);
3103 return;
3106 aarch64_internal_mov_immediate (dest, imm, true,
3107 as_a <scalar_int_mode> (mode));
3110 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3111 that is known to contain PTRUE. */
3113 void
3114 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3116 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3117 gen_rtvec (2, pred, src),
3118 UNSPEC_MERGE_PTRUE)));
3121 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3122 operand is in memory. In this case we need to use the predicated LD1
3123 and ST1 instead of LDR and STR, both for correctness on big-endian
3124 targets and because LD1 and ST1 support a wider range of addressing modes.
3125 PRED_MODE is the mode of the predicate.
3127 See the comment at the head of aarch64-sve.md for details about the
3128 big-endian handling. */
3130 void
3131 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3133 machine_mode mode = GET_MODE (dest);
3134 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3135 if (!register_operand (src, mode)
3136 && !register_operand (dest, mode))
3138 rtx tmp = gen_reg_rtx (mode);
3139 if (MEM_P (src))
3140 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3141 else
3142 emit_move_insn (tmp, src);
3143 src = tmp;
3145 aarch64_emit_sve_pred_move (dest, ptrue, src);
3148 /* Called only on big-endian targets. See whether an SVE vector move
3149 from SRC to DEST is effectively a REV[BHW] instruction, because at
3150 least one operand is a subreg of an SVE vector that has wider or
3151 narrower elements. Return true and emit the instruction if so.
3153 For example:
3155 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3157 represents a VIEW_CONVERT between the following vectors, viewed
3158 in memory order:
3160 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3161 R1: { [0], [1], [2], [3], ... }
3163 The high part of lane X in R2 should therefore correspond to lane X*2
3164 of R1, but the register representations are:
3166 msb lsb
3167 R2: ...... [1].high [1].low [0].high [0].low
3168 R1: ...... [3] [2] [1] [0]
3170 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3171 We therefore need a reverse operation to swap the high and low values
3172 around.
3174 This is purely an optimization. Without it we would spill the
3175 subreg operand to the stack in one mode and reload it in the
3176 other mode, which has the same effect as the REV. */
3178 bool
3179 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3181 gcc_assert (BYTES_BIG_ENDIAN);
3182 if (GET_CODE (dest) == SUBREG)
3183 dest = SUBREG_REG (dest);
3184 if (GET_CODE (src) == SUBREG)
3185 src = SUBREG_REG (src);
3187 /* The optimization handles two single SVE REGs with different element
3188 sizes. */
3189 if (!REG_P (dest)
3190 || !REG_P (src)
3191 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3192 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3193 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3194 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3195 return false;
3197 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3198 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3199 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3200 UNSPEC_REV_SUBREG);
3201 emit_insn (gen_rtx_SET (dest, unspec));
3202 return true;
3205 /* Return a copy of X with mode MODE, without changing its other
3206 attributes. Unlike gen_lowpart, this doesn't care whether the
3207 mode change is valid. */
3209 static rtx
3210 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3212 if (GET_MODE (x) == mode)
3213 return x;
3215 x = shallow_copy_rtx (x);
3216 set_mode_and_regno (x, mode, REGNO (x));
3217 return x;
3220 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3221 operands. */
3223 void
3224 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3226 /* Decide which REV operation we need. The mode with narrower elements
3227 determines the mode of the operands and the mode with the wider
3228 elements determines the reverse width. */
3229 machine_mode mode_with_wider_elts = GET_MODE (dest);
3230 machine_mode mode_with_narrower_elts = GET_MODE (src);
3231 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3232 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3233 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3235 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3236 unsigned int unspec;
3237 if (wider_bytes == 8)
3238 unspec = UNSPEC_REV64;
3239 else if (wider_bytes == 4)
3240 unspec = UNSPEC_REV32;
3241 else if (wider_bytes == 2)
3242 unspec = UNSPEC_REV16;
3243 else
3244 gcc_unreachable ();
3245 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3247 /* Emit:
3249 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3250 UNSPEC_MERGE_PTRUE))
3252 with the appropriate modes. */
3253 ptrue = gen_lowpart (pred_mode, ptrue);
3254 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3255 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3256 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3257 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3258 UNSPEC_MERGE_PTRUE);
3259 emit_insn (gen_rtx_SET (dest, src));
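/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */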
3262 static bool
3263 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3264 tree exp ATTRIBUTE_UNUSED)
3266 /* Currently, always true. */
3267 return true;
3270 /* Implement TARGET_PASS_BY_REFERENCE. */
3272 static bool
3273 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3274 machine_mode mode,
3275 const_tree type,
3276 bool named ATTRIBUTE_UNUSED)
3278 HOST_WIDE_INT size;
3279 machine_mode dummymode;
3280 int nregs;
3282 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3283 if (mode == BLKmode && type)
3284 size = int_size_in_bytes (type);
3285 else
3286 /* No frontends can create types with variable-sized modes, so we
3287 shouldn't be asked to pass or return them. */
3288 size = GET_MODE_SIZE (mode).to_constant ();
3290 /* Aggregates are passed by reference based on their size. */
3291 if (type && AGGREGATE_TYPE_P (type))
3293 size = int_size_in_bytes (type);
3296 /* Variable sized arguments are always passed by reference. */
3297 if (size < 0)
3298 return true;
3300 /* Can this be a candidate to be passed in fp/simd register(s)? */
3301 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3302 &dummymode, &nregs,
3303 NULL))
3304 return false;
3306 /* Arguments which are variable sized or larger than 2 registers are
3307 passed by reference unless they are a homogeneous floating-point
3308 aggregate. */
3309 return size > 2 * UNITS_PER_WORD;
3312 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3313 static bool
3314 aarch64_return_in_msb (const_tree valtype)
3316 machine_mode dummy_mode;
3317 int dummy_int;
3319 /* Never happens in little-endian mode. */
3320 if (!BYTES_BIG_ENDIAN)
3321 return false;
3323 /* Only composite types smaller than or equal to 16 bytes can
3324 be potentially returned in registers. */
3325 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3326 || int_size_in_bytes (valtype) <= 0
3327 || int_size_in_bytes (valtype) > 16)
3328 return false;
3330 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3331 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3332 is always passed/returned in the least significant bits of fp/simd
3333 register(s). */
3334 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3335 &dummy_mode, &dummy_int, NULL))
3336 return false;
3338 return true;
3341 /* Implement TARGET_FUNCTION_VALUE.
3342 Define how to find the value returned by a function. */
3344 static rtx
3345 aarch64_function_value (const_tree type, const_tree func,
3346 bool outgoing ATTRIBUTE_UNUSED)
3348 machine_mode mode;
3349 int unsignedp;
3350 int count;
3351 machine_mode ag_mode;
3353 mode = TYPE_MODE (type);
3354 if (INTEGRAL_TYPE_P (type))
3355 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3357 if (aarch64_return_in_msb (type))
3359 HOST_WIDE_INT size = int_size_in_bytes (type);
3361 if (size % UNITS_PER_WORD != 0)
3363 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3364 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3368 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3369 &ag_mode, &count, NULL))
3371 if (!aarch64_composite_type_p (type, mode))
3373 gcc_assert (count == 1 && mode == ag_mode);
3374 return gen_rtx_REG (mode, V0_REGNUM);
3376 else
3378 int i;
3379 rtx par;
3381 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3382 for (i = 0; i < count; i++)
3384 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3385 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3386 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3387 XVECEXP (par, 0, i) = tmp;
3389 return par;
3392 else
3393 return gen_rtx_REG (mode, R0_REGNUM);
3396 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3397 Return true if REGNO is the number of a hard register in which the values
3398 of called function may come back. */
3400 static bool
3401 aarch64_function_value_regno_p (const unsigned int regno)
3403 /* Maximum of 16 bytes can be returned in the general registers. Examples
3404 of 16-byte return values are: 128-bit integers and 16-byte small
3405 structures (excluding homogeneous floating-point aggregates). */
3406 if (regno == R0_REGNUM || regno == R1_REGNUM)
3407 return true;
3409 /* Up to four fp/simd registers can return a function value, e.g. a
3410 homogeneous floating-point aggregate having four members. */
3411 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3412 return TARGET_FLOAT;
3414 return false;
3417 /* Implement TARGET_RETURN_IN_MEMORY.
3419 If the type T of the result of a function is such that
3420 void func (T arg)
3421 would require that arg be passed as a value in a register (or set of
3422 registers) according to the parameter passing rules, then the result
3423 is returned in the same registers as would be used for such an
3424 argument. */
3426 static bool
3427 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3429 HOST_WIDE_INT size;
3430 machine_mode ag_mode;
3431 int count;
3433 if (!AGGREGATE_TYPE_P (type)
3434 && TREE_CODE (type) != COMPLEX_TYPE
3435 && TREE_CODE (type) != VECTOR_TYPE)
3436 /* Simple scalar types are always returned in registers. */
3437 return false;
3439 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3440 type,
3441 &ag_mode,
3442 &count,
3443 NULL))
3444 return false;
3446 /* Types larger than 2 registers are returned in memory. */
3447 size = int_size_in_bytes (type);
3448 return (size < 0 || size > 2 * UNITS_PER_WORD);
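/* Return true if an argument of mode MODE and type TYPE is a candidate
   for passing in SIMD/FP registers, setting *NREGS to the number of
   registers needed and caching the per-register mode in PCUM_V.  */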
3451 static bool
3452 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3453 const_tree type, int *nregs)
3455 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3456 return aarch64_vfp_is_call_or_return_candidate (mode,
3457 type,
3458 &pcum->aapcs_vfp_rmode,
3459 nregs,
3460 NULL);
3463 /* Given MODE and TYPE of a function argument, return the alignment in
3464 bits. The idea is to suppress any stronger alignment requested by
3465 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3466 This is a helper function for local use only. */
3468 static unsigned int
3469 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3471 if (!type)
3472 return GET_MODE_ALIGNMENT (mode);
3474 if (integer_zerop (TYPE_SIZE (type)))
3475 return 0;
3477 gcc_assert (TYPE_MODE (type) == mode);
3479 if (!AGGREGATE_TYPE_P (type))
3480 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3482 if (TREE_CODE (type) == ARRAY_TYPE)
3483 return TYPE_ALIGN (TREE_TYPE (type));
3485 unsigned int alignment = 0;
3486 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3487 if (TREE_CODE (field) == FIELD_DECL)
3488 alignment = std::max (alignment, DECL_ALIGN (field));
3490 return alignment;
3493 /* Layout a function argument according to the AAPCS64 rules. The rule
3494 numbers refer to the rule numbers in the AAPCS64. */
3496 static void
3497 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3498 const_tree type,
3499 bool named ATTRIBUTE_UNUSED)
3501 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3502 int ncrn, nvrn, nregs;
3503 bool allocate_ncrn, allocate_nvrn;
3504 HOST_WIDE_INT size;
3506 /* We need to do this once per argument. */
3507 if (pcum->aapcs_arg_processed)
3508 return;
3510 pcum->aapcs_arg_processed = true;
3512 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3513 if (type)
3514 size = int_size_in_bytes (type);
3515 else
3516 /* No frontends can create types with variable-sized modes, so we
3517 shouldn't be asked to pass or return them. */
3518 size = GET_MODE_SIZE (mode).to_constant ();
3519 size = ROUND_UP (size, UNITS_PER_WORD);
3521 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3522 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3523 mode,
3524 type,
3525 &nregs);
3527 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3528 The following code thus handles passing by SIMD/FP registers first. */
3530 nvrn = pcum->aapcs_nvrn;
3532 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3533 and homogeneous short-vector aggregates (HVA). */
3534 if (allocate_nvrn)
3536 if (!TARGET_FLOAT)
3537 aarch64_err_no_fpadvsimd (mode);
3539 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3541 pcum->aapcs_nextnvrn = nvrn + nregs;
3542 if (!aarch64_composite_type_p (type, mode))
3544 gcc_assert (nregs == 1);
3545 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3547 else
3549 rtx par;
3550 int i;
3551 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3552 for (i = 0; i < nregs; i++)
3554 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3555 V0_REGNUM + nvrn + i);
3556 rtx offset = gen_int_mode
3557 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3558 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3559 XVECEXP (par, 0, i) = tmp;
3561 pcum->aapcs_reg = par;
3563 return;
3565 else
3567 /* C.3 NSRN is set to 8. */
3568 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3569 goto on_stack;
3573 ncrn = pcum->aapcs_ncrn;
3574 nregs = size / UNITS_PER_WORD;
3576 /* C6 - C9, though the sign and zero extension semantics are
3577 handled elsewhere. This is the case where the argument fits
3578 entirely in general registers. */
3579 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3582 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3584 /* C.8 if the argument has an alignment of 16 bytes then the NGRN is
3585 rounded up to the next even number. */
3586 if (nregs == 2
3587 && ncrn % 2
3588 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3589 comparison is there because for > 16 * BITS_PER_UNIT
3590 alignment nregs should be > 2 and therefore it should be
3591 passed by reference rather than value. */
3592 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3594 ++ncrn;
3595 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3598 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3599 A reg is still generated for it, but the caller should be smart
3600 enough not to use it. */
3601 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3602 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3603 else
3605 rtx par;
3606 int i;
3608 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3609 for (i = 0; i < nregs; i++)
3611 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3612 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3613 GEN_INT (i * UNITS_PER_WORD));
3614 XVECEXP (par, 0, i) = tmp;
3616 pcum->aapcs_reg = par;
3619 pcum->aapcs_nextncrn = ncrn + nregs;
3620 return;
3623 /* C.11 */
3624 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3626 /* The argument is passed on stack; record the needed number of words for
3627 this argument and align the total size if necessary. */
3628 on_stack:
3629 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3631 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3632 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3633 16 / UNITS_PER_WORD);
3634 return;
3637 /* Implement TARGET_FUNCTION_ARG. */
3639 static rtx
3640 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3641 const_tree type, bool named)
3643 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3644 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3646 if (mode == VOIDmode)
3647 return NULL_RTX;
3649 aarch64_layout_arg (pcum_v, mode, type, named);
3650 return pcum->aapcs_reg;
3653 void
3654 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3655 const_tree fntype ATTRIBUTE_UNUSED,
3656 rtx libname ATTRIBUTE_UNUSED,
3657 const_tree fndecl ATTRIBUTE_UNUSED,
3658 unsigned n_named ATTRIBUTE_UNUSED)
3660 pcum->aapcs_ncrn = 0;
3661 pcum->aapcs_nvrn = 0;
3662 pcum->aapcs_nextncrn = 0;
3663 pcum->aapcs_nextnvrn = 0;
3664 pcum->pcs_variant = ARM_PCS_AAPCS64;
3665 pcum->aapcs_reg = NULL_RTX;
3666 pcum->aapcs_arg_processed = false;
3667 pcum->aapcs_stack_words = 0;
3668 pcum->aapcs_stack_size = 0;
3670 if (!TARGET_FLOAT
3671 && fndecl && TREE_PUBLIC (fndecl)
3672 && fntype && fntype != error_mark_node)
3674 const_tree type = TREE_TYPE (fntype);
3675 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3676 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3677 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3678 &mode, &nregs, NULL))
3679 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3681 return;
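/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */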
3684 static void
3685 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3686 machine_mode mode,
3687 const_tree type,
3688 bool named)
3690 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3691 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3693 aarch64_layout_arg (pcum_v, mode, type, named);
3694 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3695 != (pcum->aapcs_stack_words != 0));
3696 pcum->aapcs_arg_processed = false;
3697 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3698 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3699 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3700 pcum->aapcs_stack_words = 0;
3701 pcum->aapcs_reg = NULL_RTX;
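/* Return true if REGNO is a general or SIMD/FP register that can be used
   to pass a function argument.  */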
3705 bool
3706 aarch64_function_arg_regno_p (unsigned regno)
3708 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3709 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3712 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3713 PARM_BOUNDARY bits of alignment, but will be given anything up
3714 to STACK_BOUNDARY bits if the type requires it. This makes sure
3715 that both before and after the layout of each argument, the Next
3716 Stacked Argument Address (NSAA) will have a minimum alignment of
3717 8 bytes. */
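/* For example, with the usual AArch64 values of PARM_BOUNDARY (64) and
   STACK_BOUNDARY (128), a 4-byte-aligned argument is still given 64 bits
   of alignment, while an over-aligned 32-byte type is clamped to 128.  */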
3719 static unsigned int
3720 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3722 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3723 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3726 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3728 static fixed_size_mode
3729 aarch64_get_reg_raw_mode (int regno)
3731 if (TARGET_SVE && FP_REGNUM_P (regno))
3732 /* Don't use the SVE part of the register for __builtin_apply and
3733 __builtin_return. The SVE registers aren't used by the normal PCS,
3734 so using them there would be a waste of time. The PCS extensions
3735 for SVE types are fundamentally incompatible with the
3736 __builtin_return/__builtin_apply interface. */
3737 return as_a <fixed_size_mode> (V16QImode);
3738 return default_get_reg_raw_mode (regno);
3741 /* Implement TARGET_FUNCTION_ARG_PADDING.
3743 Small aggregate types are placed in the lowest memory address.
3745 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3747 static pad_direction
3748 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3750 /* On little-endian targets, the least significant byte of every stack
3751 argument is passed at the lowest byte address of the stack slot. */
3752 if (!BYTES_BIG_ENDIAN)
3753 return PAD_UPWARD;
3755 /* Otherwise, integral, floating-point and pointer types are padded downward:
3756 the least significant byte of a stack argument is passed at the highest
3757 byte address of the stack slot. */
3758 if (type
3759 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3760 || POINTER_TYPE_P (type))
3761 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3762 return PAD_DOWNWARD;
3764 /* Everything else is padded upward, i.e. the data starts at the first byte of the stack slot. */
3765 return PAD_UPWARD;
3768 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3770 It specifies the padding for the last (and possibly the only)
3771 element of a block move between registers and memory.  Viewing
3772 the block as it sits in memory, padding upward means that the
3773 last element is padded after its most significant byte, while
3774 with downward padding the last element is padded on its least
3775 significant byte side.
3777 Small aggregates and small complex types are always padded
3778 upwards.
3780 We don't need to worry about homogeneous floating-point or
3781 short-vector aggregates; their move is not affected by the
3782 padding direction determined here. Regardless of endianness,
3783 each element of such an aggregate is put in the least
3784 significant bits of a fp/simd register.
3786 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3787 register has useful data, and return the opposite if the most
3788 significant byte does. */
3790 bool
3791 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3792 bool first ATTRIBUTE_UNUSED)
3795 /* Small composite types are always padded upward. */
3796 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3798 HOST_WIDE_INT size;
3799 if (type)
3800 size = int_size_in_bytes (type);
3801 else
3802 /* No frontends can create types with variable-sized modes, so we
3803 shouldn't be asked to pass or return them. */
3804 size = GET_MODE_SIZE (mode).to_constant ();
3805 if (size < 2 * UNITS_PER_WORD)
3806 return true;
3809 /* Otherwise, use the default padding. */
3810 return !BYTES_BIG_ENDIAN;
3813 static scalar_int_mode
3814 aarch64_libgcc_cmp_return_mode (void)
3816 return SImode;
3819 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3821 /* We use the 12-bit shifted immediate arithmetic instructions so values
3822 must be multiple of (1 << 12), i.e. 4096. */
3823 #define ARITH_FACTOR 4096
3825 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3826 #error Cannot use simple address calculation for stack probing
3827 #endif
3829 /* The pair of scratch registers used for stack probing. */
3830 #define PROBE_STACK_FIRST_REG 9
3831 #define PROBE_STACK_SECOND_REG 10
3833 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3834 inclusive. These are offsets from the current stack pointer. */
3836 static void
3837 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3839 HOST_WIDE_INT size;
3840 if (!poly_size.is_constant (&size))
3842 sorry ("stack probes for SVE frames");
3843 return;
3846 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3848 /* See the same assertion on PROBE_INTERVAL above. */
3849 gcc_assert ((first % ARITH_FACTOR) == 0);
3851 /* See if we have a constant small number of probes to generate. If so,
3852 that's the easy case. */
3853 if (size <= PROBE_INTERVAL)
3855 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3857 emit_set_insn (reg1,
3858 plus_constant (Pmode,
3859 stack_pointer_rtx, -(first + base)));
3860 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3863 /* The run-time loop is made up of 8 insns in the generic case while the
3864 compile-time loop is made up of 4+2*(n-2) insns, where n is the number of intervals. */
3865 else if (size <= 4 * PROBE_INTERVAL)
3867 HOST_WIDE_INT i, rem;
3869 emit_set_insn (reg1,
3870 plus_constant (Pmode,
3871 stack_pointer_rtx,
3872 -(first + PROBE_INTERVAL)));
3873 emit_stack_probe (reg1);
3875 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3876 it exceeds SIZE. If only two probes are needed, this will not
3877 generate any code. Then probe at FIRST + SIZE. */
3878 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3880 emit_set_insn (reg1,
3881 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3882 emit_stack_probe (reg1);
3885 rem = size - (i - PROBE_INTERVAL);
3886 if (rem > 256)
3888 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3890 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3891 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3893 else
3894 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3897 /* Otherwise, do the same as above, but in a loop. Note that we must be
3898 extra careful with variables wrapping around because we might be at
3899 the very top (or the very bottom) of the address space and we have
3900 to be able to handle this case properly; in particular, we use an
3901 equality test for the loop condition. */
3902 else
3904 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3906 /* Step 1: round SIZE to the previous multiple of the interval. */
3908 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3911 /* Step 2: compute initial and final value of the loop counter. */
3913 /* TEST_ADDR = SP + FIRST. */
3914 emit_set_insn (reg1,
3915 plus_constant (Pmode, stack_pointer_rtx, -first));
3917 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3918 HOST_WIDE_INT adjustment = - (first + rounded_size);
3919 if (! aarch64_uimm12_shift (adjustment))
3921 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3922 true, Pmode);
3923 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3925 else
3926 emit_set_insn (reg2,
3927 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3929 /* Step 3: the loop
3933 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3934 probe at TEST_ADDR
3936 while (TEST_ADDR != LAST_ADDR)
3938 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3939 until it is equal to ROUNDED_SIZE. */
3941 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3944 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3945 that SIZE is equal to ROUNDED_SIZE. */
3947 if (size != rounded_size)
3949 HOST_WIDE_INT rem = size - rounded_size;
3951 if (rem > 256)
3953 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3955 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3956 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3958 else
3959 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3963 /* Make sure nothing is scheduled before we are done. */
3964 emit_insn (gen_blockage ());
3967 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3968 absolute addresses. */
3970 const char *
3971 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3973 static int labelno = 0;
3974 char loop_lab[32];
3975 rtx xops[2];
3977 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3979 /* Loop. */
3980 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3982 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3983 xops[0] = reg1;
3984 xops[1] = GEN_INT (PROBE_INTERVAL);
3985 output_asm_insn ("sub\t%0, %0, %1", xops);
3987 /* Probe at TEST_ADDR. */
3988 output_asm_insn ("str\txzr, [%0]", xops);
3990 /* Test if TEST_ADDR == LAST_ADDR. */
3991 xops[1] = reg2;
3992 output_asm_insn ("cmp\t%0, %1", xops);
3994 /* Branch. */
3995 fputs ("\tb.ne\t", asm_out_file);
3996 assemble_name_raw (asm_out_file, loop_lab);
3997 fputc ('\n', asm_out_file);
3999 return "";
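/* Editor's sketch (not part of aarch64.c): a plain C model of the loop
   emitted above.  It touches one word every PROBE_INTERVAL bytes, walking
   the addresses downwards, and uses an equality test so that address
   wrap-around cannot terminate the loop early.  */
static void
probe_stack_model (volatile char *test_addr, volatile char *last_addr,
                   long probe_interval)
{
  do
    {
      test_addr -= probe_interval;   /* sub  test, test, #interval */
      *test_addr = 0;                /* str  xzr, [test] */
    }
  while (test_addr != last_addr);    /* cmp + b.ne back to the label */
}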
4002 /* Determine whether a frame chain needs to be generated. */
4003 static bool
4004 aarch64_needs_frame_chain (void)
4006 /* Force a frame chain for EH returns so the return address is at FP+8. */
4007 if (frame_pointer_needed || crtl->calls_eh_return)
4008 return true;
4010 /* A leaf function cannot have calls or write LR. */
4011 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4013 /* Don't use a frame chain in leaf functions if leaf frame pointers
4014 are disabled. */
4015 if (flag_omit_leaf_frame_pointer && is_leaf)
4016 return false;
4018 return aarch64_use_frame_pointer;
4021 /* Mark the registers that need to be saved by the callee and calculate
4022 the size of the callee-saved registers area and frame record (both FP
4023 and LR may be omitted). */
4024 static void
4025 aarch64_layout_frame (void)
4027 HOST_WIDE_INT offset = 0;
4028 int regno, last_fp_reg = INVALID_REGNUM;
4030 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4032 #define SLOT_NOT_REQUIRED (-2)
4033 #define SLOT_REQUIRED (-1)
4035 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4036 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4038 /* First mark all the registers that really need to be saved... */
4039 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4040 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4042 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4043 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4045 /* ... that includes the eh data registers (if needed)... */
4046 if (crtl->calls_eh_return)
4047 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4048 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4049 = SLOT_REQUIRED;
4051 /* ... and any callee saved register that dataflow says is live. */
4052 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4053 if (df_regs_ever_live_p (regno)
4054 && (regno == R30_REGNUM
4055 || !call_used_regs[regno]))
4056 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4058 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4059 if (df_regs_ever_live_p (regno)
4060 && !call_used_regs[regno])
4062 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4063 last_fp_reg = regno;
4066 if (cfun->machine->frame.emit_frame_chain)
4068 /* FP and LR are placed in the linkage record. */
4069 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4070 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4071 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4072 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4073 offset = 2 * UNITS_PER_WORD;
4076 /* Now assign stack slots for them. */
4077 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4078 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4080 cfun->machine->frame.reg_offset[regno] = offset;
4081 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4082 cfun->machine->frame.wb_candidate1 = regno;
4083 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4084 cfun->machine->frame.wb_candidate2 = regno;
4085 offset += UNITS_PER_WORD;
4088 HOST_WIDE_INT max_int_offset = offset;
4089 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4090 bool has_align_gap = offset != max_int_offset;
4092 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4093 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4095 /* If there is an alignment gap between integer and fp callee-saves,
4096 allocate the last fp register to it if possible. */
4097 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4099 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4100 break;
4103 cfun->machine->frame.reg_offset[regno] = offset;
4104 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4105 cfun->machine->frame.wb_candidate1 = regno;
4106 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4107 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4108 cfun->machine->frame.wb_candidate2 = regno;
4109 offset += UNITS_PER_WORD;
4112 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4114 cfun->machine->frame.saved_regs_size = offset;
4116 HOST_WIDE_INT varargs_and_saved_regs_size
4117 = offset + cfun->machine->frame.saved_varargs_size;
4119 cfun->machine->frame.hard_fp_offset
4120 = aligned_upper_bound (varargs_and_saved_regs_size
4121 + get_frame_size (),
4122 STACK_BOUNDARY / BITS_PER_UNIT);
4124 /* Both these values are already aligned. */
4125 gcc_assert (multiple_p (crtl->outgoing_args_size,
4126 STACK_BOUNDARY / BITS_PER_UNIT));
4127 cfun->machine->frame.frame_size
4128 = (cfun->machine->frame.hard_fp_offset
4129 + crtl->outgoing_args_size);
4131 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4133 cfun->machine->frame.initial_adjust = 0;
4134 cfun->machine->frame.final_adjust = 0;
4135 cfun->machine->frame.callee_adjust = 0;
4136 cfun->machine->frame.callee_offset = 0;
4138 HOST_WIDE_INT max_push_offset = 0;
4139 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4140 max_push_offset = 512;
4141 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4142 max_push_offset = 256;
4144 HOST_WIDE_INT const_size, const_fp_offset;
4145 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4146 && const_size < max_push_offset
4147 && known_eq (crtl->outgoing_args_size, 0))
4149 /* Simple, small frame with no outgoing arguments:
4150 stp reg1, reg2, [sp, -frame_size]!
4151 stp reg3, reg4, [sp, 16] */
4152 cfun->machine->frame.callee_adjust = const_size;
4154 else if (known_lt (crtl->outgoing_args_size
4155 + cfun->machine->frame.saved_regs_size, 512)
4156 && !(cfun->calls_alloca
4157 && known_lt (cfun->machine->frame.hard_fp_offset,
4158 max_push_offset)))
4160 /* Frame with small outgoing arguments:
4161 sub sp, sp, frame_size
4162 stp reg1, reg2, [sp, outgoing_args_size]
4163 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4164 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4165 cfun->machine->frame.callee_offset
4166 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4168 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4169 && const_fp_offset < max_push_offset)
4171 /* Frame with large outgoing arguments but a small local area:
4172 stp reg1, reg2, [sp, -hard_fp_offset]!
4173 stp reg3, reg4, [sp, 16]
4174 sub sp, sp, outgoing_args_size */
4175 cfun->machine->frame.callee_adjust = const_fp_offset;
4176 cfun->machine->frame.final_adjust
4177 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4179 else
4181 /* Frame with large local area and outgoing arguments using frame pointer:
4182 sub sp, sp, hard_fp_offset
4183 stp x29, x30, [sp, 0]
4184 add x29, sp, 0
4185 stp reg3, reg4, [sp, 16]
4186 sub sp, sp, outgoing_args_size */
4187 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4188 cfun->machine->frame.final_adjust
4189 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4192 cfun->machine->frame.laid_out = true;
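/* Editor's sketch (not part of aarch64.c): the frame-shape choice above,
   restated for constant-sized frames only.  It ignores the alloca special
   case, callee_offset and all poly_int/SVE handling; the struct and
   function names are invented for illustration.  */
struct frame_shape
{
  long initial_adjust;   /* leading "sub sp, sp, #n" */
  long callee_adjust;    /* pre-decrement folded into the first stp */
  long final_adjust;     /* trailing "sub sp, sp, #n" for outgoing args */
};

static struct frame_shape
choose_frame_shape (long frame_size, long hard_fp_offset,
                    long outgoing_args, long saved_regs, long max_push)
{
  struct frame_shape s = { 0, 0, 0 };
  if (frame_size < max_push && outgoing_args == 0)
    /* stp reg1, reg2, [sp, -frame_size]!  */
    s.callee_adjust = frame_size;
  else if (outgoing_args + saved_regs < 512)
    /* sub sp, sp, frame_size; stp reg1, reg2, [sp, outgoing_args]  */
    s.initial_adjust = frame_size;
  else if (hard_fp_offset < max_push)
    {
      /* stp reg1, reg2, [sp, -hard_fp_offset]!; sub sp, sp, outgoing_args  */
      s.callee_adjust = hard_fp_offset;
      s.final_adjust = frame_size - hard_fp_offset;
    }
  else
    {
      /* sub sp, sp, hard_fp_offset; ...; sub sp, sp, outgoing_args  */
      s.initial_adjust = hard_fp_offset;
      s.final_adjust = frame_size - hard_fp_offset;
    }
  return s;
}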
4195 /* Return true if the register REGNO is saved on entry to
4196 the current function. */
4198 static bool
4199 aarch64_register_saved_on_entry (int regno)
4201 return cfun->machine->frame.reg_offset[regno] >= 0;
4204 /* Return the next register up from REGNO up to LIMIT for the callee
4205 to save. */
4207 static unsigned
4208 aarch64_next_callee_save (unsigned regno, unsigned limit)
4210 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4211 regno ++;
4212 return regno;
4215 /* Push the register number REGNO of mode MODE to the stack with write-back
4216 adjusting the stack by ADJUSTMENT. */
4218 static void
4219 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4220 HOST_WIDE_INT adjustment)
4222 rtx base_rtx = stack_pointer_rtx;
4223 rtx insn, reg, mem;
4225 reg = gen_rtx_REG (mode, regno);
4226 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4227 plus_constant (Pmode, base_rtx, -adjustment));
4228 mem = gen_frame_mem (mode, mem);
4230 insn = emit_move_insn (mem, reg);
4231 RTX_FRAME_RELATED_P (insn) = 1;
4234 /* Generate and return an instruction to store the pair of registers
4235 REG and REG2 of mode MODE to location BASE with write-back adjusting
4236 the stack location BASE by ADJUSTMENT. */
4238 static rtx
4239 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4240 HOST_WIDE_INT adjustment)
4242 switch (mode)
4244 case E_DImode:
4245 return gen_storewb_pairdi_di (base, base, reg, reg2,
4246 GEN_INT (-adjustment),
4247 GEN_INT (UNITS_PER_WORD - adjustment));
4248 case E_DFmode:
4249 return gen_storewb_pairdf_di (base, base, reg, reg2,
4250 GEN_INT (-adjustment),
4251 GEN_INT (UNITS_PER_WORD - adjustment));
4252 default:
4253 gcc_unreachable ();
4257 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4258 stack pointer by ADJUSTMENT. */
4260 static void
4261 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4263 rtx_insn *insn;
4264 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4266 if (regno2 == INVALID_REGNUM)
4267 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4269 rtx reg1 = gen_rtx_REG (mode, regno1);
4270 rtx reg2 = gen_rtx_REG (mode, regno2);
4272 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4273 reg2, adjustment));
4274 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4275 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4276 RTX_FRAME_RELATED_P (insn) = 1;
4279 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4280 adjusting it by ADJUSTMENT afterwards. */
4282 static rtx
4283 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4284 HOST_WIDE_INT adjustment)
4286 switch (mode)
4288 case E_DImode:
4289 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4290 GEN_INT (UNITS_PER_WORD));
4291 case E_DFmode:
4292 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4293 GEN_INT (UNITS_PER_WORD));
4294 default:
4295 gcc_unreachable ();
4299 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4300 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4301 into CFI_OPS. */
4303 static void
4304 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4305 rtx *cfi_ops)
4307 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4308 rtx reg1 = gen_rtx_REG (mode, regno1);
4310 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4312 if (regno2 == INVALID_REGNUM)
4314 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4315 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4316 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4318 else
4320 rtx reg2 = gen_rtx_REG (mode, regno2);
4321 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4322 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4323 reg2, adjustment));
4327 /* Generate and return a store pair instruction of mode MODE to store
4328 register REG1 to MEM1 and register REG2 to MEM2. */
4330 static rtx
4331 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4332 rtx reg2)
4334 switch (mode)
4336 case E_DImode:
4337 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4339 case E_DFmode:
4340 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4342 default:
4343 gcc_unreachable ();
4347 /* Generate and return a load pair instruction of mode MODE to load register
4348 REG1 from MEM1 and register REG2 from MEM2. */
4350 static rtx
4351 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4352 rtx mem2)
4354 switch (mode)
4356 case E_DImode:
4357 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4359 case E_DFmode:
4360 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4362 default:
4363 gcc_unreachable ();
4367 /* Return TRUE if return address signing should be enabled for the current
4368 function, otherwise return FALSE. */
4370 bool
4371 aarch64_return_address_signing_enabled (void)
4373 /* This function should only be called after the frame is laid out. */
4374 gcc_assert (cfun->machine->frame.laid_out);
4376 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4377 if its LR is pushed onto the stack. */
4378 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4379 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4380 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4383 /* Emit code to save the callee-saved registers from register number START
4384 to LIMIT to the stack at the location starting at offset START_OFFSET,
4385 skipping any write-back candidates if SKIP_WB is true. */
4387 static void
4388 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4389 unsigned start, unsigned limit, bool skip_wb)
4391 rtx_insn *insn;
4392 unsigned regno;
4393 unsigned regno2;
4395 for (regno = aarch64_next_callee_save (start, limit);
4396 regno <= limit;
4397 regno = aarch64_next_callee_save (regno + 1, limit))
4399 rtx reg, mem;
4400 poly_int64 offset;
4402 if (skip_wb
4403 && (regno == cfun->machine->frame.wb_candidate1
4404 || regno == cfun->machine->frame.wb_candidate2))
4405 continue;
4407 if (cfun->machine->reg_is_wrapped_separately[regno])
4408 continue;
4410 reg = gen_rtx_REG (mode, regno);
4411 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4412 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4413 offset));
4415 regno2 = aarch64_next_callee_save (regno + 1, limit);
4417 if (regno2 <= limit
4418 && !cfun->machine->reg_is_wrapped_separately[regno2]
4419 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4420 == cfun->machine->frame.reg_offset[regno2]))
4423 rtx reg2 = gen_rtx_REG (mode, regno2);
4424 rtx mem2;
4426 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4427 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4428 offset));
4429 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4430 reg2));
4432 /* The first part of a frame-related parallel insn is
4433 always assumed to be relevant to the frame
4434 calculations; subsequent parts are only
4435 frame-related if explicitly marked. */
4436 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4437 regno = regno2;
4439 else
4440 insn = emit_move_insn (mem, reg);
4442 RTX_FRAME_RELATED_P (insn) = 1;
4446 /* Emit code to restore the callee registers of mode MODE from register
4447 number START up to and including LIMIT. Restore from the stack offset
4448 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4449 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4451 static void
4452 aarch64_restore_callee_saves (machine_mode mode,
4453 poly_int64 start_offset, unsigned start,
4454 unsigned limit, bool skip_wb, rtx *cfi_ops)
4456 rtx base_rtx = stack_pointer_rtx;
4457 unsigned regno;
4458 unsigned regno2;
4459 poly_int64 offset;
4461 for (regno = aarch64_next_callee_save (start, limit);
4462 regno <= limit;
4463 regno = aarch64_next_callee_save (regno + 1, limit))
4465 if (cfun->machine->reg_is_wrapped_separately[regno])
4466 continue;
4468 rtx reg, mem;
4470 if (skip_wb
4471 && (regno == cfun->machine->frame.wb_candidate1
4472 || regno == cfun->machine->frame.wb_candidate2))
4473 continue;
4475 reg = gen_rtx_REG (mode, regno);
4476 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4477 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4479 regno2 = aarch64_next_callee_save (regno + 1, limit);
4481 if (regno2 <= limit
4482 && !cfun->machine->reg_is_wrapped_separately[regno2]
4483 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4484 == cfun->machine->frame.reg_offset[regno2]))
4486 rtx reg2 = gen_rtx_REG (mode, regno2);
4487 rtx mem2;
4489 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4490 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4491 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4493 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4494 regno = regno2;
4496 else
4497 emit_move_insn (reg, mem);
4498 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4502 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4503 of MODE. */
4505 static inline bool
4506 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4508 HOST_WIDE_INT multiple;
4509 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4510 && IN_RANGE (multiple, -8, 7));
4513 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4514 of MODE. */
4516 static inline bool
4517 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4519 HOST_WIDE_INT multiple;
4520 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4521 && IN_RANGE (multiple, 0, 63));
4524 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4525 of MODE. */
4527 bool
4528 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4530 HOST_WIDE_INT multiple;
4531 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4532 && IN_RANGE (multiple, -64, 63));
4535 /* Return true if OFFSET is a signed 9-bit value. */
4537 bool
4538 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4539 poly_int64 offset)
4541 HOST_WIDE_INT const_offset;
4542 return (offset.is_constant (&const_offset)
4543 && IN_RANGE (const_offset, -256, 255));
4546 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4547 of MODE. */
4549 static inline bool
4550 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4552 HOST_WIDE_INT multiple;
4553 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4554 && IN_RANGE (multiple, -256, 255));
4557 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4558 of MODE. */
4560 static inline bool
4561 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4563 HOST_WIDE_INT multiple;
4564 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4565 && IN_RANGE (multiple, 0, 4095));
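/* Editor's sketch (not part of aarch64.c): the common shape of the scaled
   offset predicates above, for constant offsets only.  The offset must be
   an exact multiple of the access size and the multiple must fit in the
   relevant immediate range.  */
#include <stdbool.h>

static bool
offset_in_scaled_range (long offset, long mode_size, long lo, long hi)
{
  if (offset % mode_size != 0)
    return false;
  long multiple = offset / mode_size;
  return multiple >= lo && multiple <= hi;
}

/* For 8-byte (DImode) accesses and the 12-bit unsigned form:
     offset_in_scaled_range (32760, 8, 0, 4095) -> true   (ldr x0, [x1, 32760])
     offset_in_scaled_range (32768, 8, 0, 4095) -> false  (multiple 4096 too big)
     offset_in_scaled_range (12, 8, 0, 4095)    -> false  (not a multiple of 8)  */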
4568 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4570 static sbitmap
4571 aarch64_get_separate_components (void)
4573 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4574 bitmap_clear (components);
4576 /* The registers we need saved to the frame. */
4577 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4578 if (aarch64_register_saved_on_entry (regno))
4580 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4581 if (!frame_pointer_needed)
4582 offset += cfun->machine->frame.frame_size
4583 - cfun->machine->frame.hard_fp_offset;
4584 /* Check that we can access the stack slot of the register with one
4585 direct load with no adjustments needed. */
4586 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4587 bitmap_set_bit (components, regno);
4590 /* Don't mess with the hard frame pointer. */
4591 if (frame_pointer_needed)
4592 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4594 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4595 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4596 /* If registers have been chosen to be stored/restored with
4597 writeback, don't interfere with them, to avoid having to output explicit
4598 stack adjustment instructions. */
4599 if (reg2 != INVALID_REGNUM)
4600 bitmap_clear_bit (components, reg2);
4601 if (reg1 != INVALID_REGNUM)
4602 bitmap_clear_bit (components, reg1);
4604 bitmap_clear_bit (components, LR_REGNUM);
4605 bitmap_clear_bit (components, SP_REGNUM);
4607 return components;
4610 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4612 static sbitmap
4613 aarch64_components_for_bb (basic_block bb)
4615 bitmap in = DF_LIVE_IN (bb);
4616 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4617 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4619 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4620 bitmap_clear (components);
4622 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4623 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4624 if ((!call_used_regs[regno])
4625 && (bitmap_bit_p (in, regno)
4626 || bitmap_bit_p (gen, regno)
4627 || bitmap_bit_p (kill, regno)))
4629 unsigned regno2, offset, offset2;
4630 bitmap_set_bit (components, regno);
4632 /* If there is a callee-save at an adjacent offset, add it as well
4633 to increase the use of LDP/STP. */
4634 offset = cfun->machine->frame.reg_offset[regno];
4635 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4637 if (regno2 <= LAST_SAVED_REGNUM)
4639 offset2 = cfun->machine->frame.reg_offset[regno2];
4640 if ((offset & ~8) == (offset2 & ~8))
4641 bitmap_set_bit (components, regno2);
4645 return components;
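/* Editor's sketch (not part of aarch64.c): the pairing test above.  Two
   8-byte save slots can share one LDP/STP only when they lie in the same
   naturally aligned 16-byte chunk, which is what masking bit 3 out of
   both offsets checks.  */
#include <stdbool.h>

static bool
slots_pairable (unsigned offset1, unsigned offset2)
{
  return (offset1 & ~8u) == (offset2 & ~8u);
}

/* slots_pairable (16, 24) -> true   (one stp covers offsets 16 and 24)
   slots_pairable (24, 32) -> false  (offsets straddle two 16-byte chunks)  */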
4648 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4649 Nothing to do for aarch64. */
4651 static void
4652 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4656 /* Return the next set bit in BMP from START onwards. Return the total number
4657 of bits in BMP if no set bit is found at or after START. */
4659 static unsigned int
4660 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4662 unsigned int nbits = SBITMAP_SIZE (bmp);
4663 if (start == nbits)
4664 return start;
4666 gcc_assert (start < nbits);
4667 for (unsigned int i = start; i < nbits; i++)
4668 if (bitmap_bit_p (bmp, i))
4669 return i;
4671 return nbits;
4674 /* Do the work for aarch64_emit_prologue_components and
4675 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4676 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4677 for these components or the epilogue sequence. That is, it determines
4678 whether we should emit stores or loads and what kind of CFA notes to attach
4679 to the insns. Otherwise the logic for the two sequences is very
4680 similar. */
4682 static void
4683 aarch64_process_components (sbitmap components, bool prologue_p)
4685 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4686 ? HARD_FRAME_POINTER_REGNUM
4687 : STACK_POINTER_REGNUM);
4689 unsigned last_regno = SBITMAP_SIZE (components);
4690 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4691 rtx_insn *insn = NULL;
4693 while (regno != last_regno)
4695 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4696 so DFmode for the vector registers is enough. */
4697 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4698 rtx reg = gen_rtx_REG (mode, regno);
4699 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4700 if (!frame_pointer_needed)
4701 offset += cfun->machine->frame.frame_size
4702 - cfun->machine->frame.hard_fp_offset;
4703 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4704 rtx mem = gen_frame_mem (mode, addr);
4706 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4707 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4708 /* No more registers to handle after REGNO.
4709 Emit a single save/restore and exit. */
4710 if (regno2 == last_regno)
4712 insn = emit_insn (set);
4713 RTX_FRAME_RELATED_P (insn) = 1;
4714 if (prologue_p)
4715 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4716 else
4717 add_reg_note (insn, REG_CFA_RESTORE, reg);
4718 break;
4721 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4722 /* The next register is not of the same class or its offset is not
4723 mergeable with the current one into a pair. */
4724 if (!satisfies_constraint_Ump (mem)
4725 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4726 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4727 GET_MODE_SIZE (mode)))
4729 insn = emit_insn (set);
4730 RTX_FRAME_RELATED_P (insn) = 1;
4731 if (prologue_p)
4732 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4733 else
4734 add_reg_note (insn, REG_CFA_RESTORE, reg);
4736 regno = regno2;
4737 continue;
4740 /* REGNO2 can be saved/restored in a pair with REGNO. */
4741 rtx reg2 = gen_rtx_REG (mode, regno2);
4742 if (!frame_pointer_needed)
4743 offset2 += cfun->machine->frame.frame_size
4744 - cfun->machine->frame.hard_fp_offset;
4745 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4746 rtx mem2 = gen_frame_mem (mode, addr2);
4747 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4748 : gen_rtx_SET (reg2, mem2);
4750 if (prologue_p)
4751 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4752 else
4753 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4755 RTX_FRAME_RELATED_P (insn) = 1;
4756 if (prologue_p)
4758 add_reg_note (insn, REG_CFA_OFFSET, set);
4759 add_reg_note (insn, REG_CFA_OFFSET, set2);
4761 else
4763 add_reg_note (insn, REG_CFA_RESTORE, reg);
4764 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4767 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4771 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4773 static void
4774 aarch64_emit_prologue_components (sbitmap components)
4776 aarch64_process_components (components, true);
4779 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4781 static void
4782 aarch64_emit_epilogue_components (sbitmap components)
4784 aarch64_process_components (components, false);
4787 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4789 static void
4790 aarch64_set_handled_components (sbitmap components)
4792 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4793 if (bitmap_bit_p (components, regno))
4794 cfun->machine->reg_is_wrapped_separately[regno] = true;
4797 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4798 is saved at BASE + OFFSET. */
4800 static void
4801 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4802 rtx base, poly_int64 offset)
4804 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4805 add_reg_note (insn, REG_CFA_EXPRESSION,
4806 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4809 /* AArch64 stack frames generated by this compiler look like:
4811 +-------------------------------+
4813 | incoming stack arguments |
4815 +-------------------------------+
4816 | | <-- incoming stack pointer (aligned)
4817 | callee-allocated save area |
4818 | for register varargs |
4820 +-------------------------------+
4821 | local variables | <-- frame_pointer_rtx
4823 +-------------------------------+
4824 | padding0 | \
4825 +-------------------------------+ |
4826 | callee-saved registers | | frame.saved_regs_size
4827 +-------------------------------+ |
4828 | LR' | |
4829 +-------------------------------+ |
4830 | FP' | / <- hard_frame_pointer_rtx (aligned)
4831 +-------------------------------+
4832 | dynamic allocation |
4833 +-------------------------------+
4834 | padding |
4835 +-------------------------------+
4836 | outgoing stack arguments | <-- arg_pointer
4838 +-------------------------------+
4839 | | <-- stack_pointer_rtx (aligned)
4841 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4842 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4843 unchanged. */
4845 /* Generate the prologue instructions for entry into a function.
4846 Establish the stack frame by decreasing the stack pointer with a
4847 properly calculated size and, if necessary, create a frame record
4848 filled with the values of LR and previous frame pointer. The
4849 current FP is also set up if it is in use. */
4851 void
4852 aarch64_expand_prologue (void)
4854 poly_int64 frame_size = cfun->machine->frame.frame_size;
4855 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4856 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4857 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4858 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4859 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4860 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4861 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4862 rtx_insn *insn;
4864 /* Sign return address for functions. */
4865 if (aarch64_return_address_signing_enabled ())
4867 insn = emit_insn (gen_pacisp ());
4868 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4869 RTX_FRAME_RELATED_P (insn) = 1;
4872 if (flag_stack_usage_info)
4873 current_function_static_stack_size = constant_lower_bound (frame_size);
4875 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4877 if (crtl->is_leaf && !cfun->calls_alloca)
4879 if (maybe_gt (frame_size, PROBE_INTERVAL)
4880 && maybe_gt (frame_size, get_stack_check_protect ()))
4881 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4882 (frame_size
4883 - get_stack_check_protect ()));
4885 else if (maybe_gt (frame_size, 0))
4886 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4889 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4890 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4892 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4894 if (callee_adjust != 0)
4895 aarch64_push_regs (reg1, reg2, callee_adjust);
4897 if (emit_frame_chain)
4899 poly_int64 reg_offset = callee_adjust;
4900 if (callee_adjust == 0)
4902 reg1 = R29_REGNUM;
4903 reg2 = R30_REGNUM;
4904 reg_offset = callee_offset;
4905 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4907 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4908 stack_pointer_rtx, callee_offset,
4909 ip1_rtx, ip0_rtx, frame_pointer_needed);
4910 if (frame_pointer_needed && !frame_size.is_constant ())
4912 /* Variable-sized frames need to describe the save slot
4913 address using DW_CFA_expression rather than DW_CFA_offset.
4914 This means that, without taking further action, the
4915 locations of the registers that we've already saved would
4916 remain based on the stack pointer even after we redefine
4917 the CFA based on the frame pointer. We therefore need new
4918 DW_CFA_expressions to re-express the save slots with addresses
4919 based on the frame pointer. */
4920 rtx_insn *insn = get_last_insn ();
4921 gcc_assert (RTX_FRAME_RELATED_P (insn));
4923 /* Add an explicit CFA definition if this was previously
4924 implicit. */
4925 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4927 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4928 callee_offset);
4929 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4930 gen_rtx_SET (hard_frame_pointer_rtx, src));
4933 /* Change the save slot expressions for the registers that
4934 we've already saved. */
4935 reg_offset -= callee_offset;
4936 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4937 reg_offset + UNITS_PER_WORD);
4938 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4939 reg_offset);
4941 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4944 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4945 callee_adjust != 0 || emit_frame_chain);
4946 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4947 callee_adjust != 0 || emit_frame_chain);
4948 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
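/* Editor's note (illustrative, not taken from a real compile): for a small
   constant-sized frame that needs a frame chain, say 32 bytes of locals and
   no outgoing arguments, the sequence above reduces to roughly

     stp	x29, x30, [sp, -48]!	// callee_adjust == frame_size
     add	x29, sp, 0		// establish the frame chain
     ...				// remaining callee-saves at [sp, 16], ...

   with the final_adjust "sub sp, sp, ..." only appearing when there are
   outgoing arguments or an over-large local area.  */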
4951 /* Return TRUE if we can use a simple_return insn.
4953 This function checks whether the callee-saved stack is empty, which
4954 means no restore actions are needed.  The pro_and_epilogue pass uses
4955 this to check whether shrink-wrapping is feasible. */
4957 bool
4958 aarch64_use_return_insn_p (void)
4960 if (!reload_completed)
4961 return false;
4963 if (crtl->profile)
4964 return false;
4966 return known_eq (cfun->machine->frame.frame_size, 0);
4969 /* Generate the epilogue instructions for returning from a function.
4970 This is almost exactly the reverse of the prolog sequence, except
4971 that we need to insert barriers to avoid scheduling loads that read
4972 from a deallocated stack, and we optimize the unwind records by
4973 emitting them all together if possible. */
4974 void
4975 aarch64_expand_epilogue (bool for_sibcall)
4977 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4978 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4979 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4980 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4981 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4982 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4983 rtx cfi_ops = NULL;
4984 rtx_insn *insn;
4985 /* A stack clash protection prologue may not have left IP0_REGNUM or
4986 IP1_REGNUM in a usable state. The same is true for allocations
4987 with an SVE component, since we then need both temporary registers
4988 for each allocation. */
4989 bool can_inherit_p = (initial_adjust.is_constant ()
4990 && final_adjust.is_constant ()
4991 && !flag_stack_clash_protection);
4993 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4994 bool need_barrier_p
4995 = maybe_ne (get_frame_size ()
4996 + cfun->machine->frame.saved_varargs_size, 0);
4998 /* Emit a barrier to prevent loads from a deallocated stack. */
4999 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5000 || cfun->calls_alloca
5001 || crtl->calls_eh_return)
5003 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5004 need_barrier_p = false;
5007 /* Restore the stack pointer from the frame pointer if it may not
5008 be the same as the stack pointer. */
5009 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5010 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5011 if (frame_pointer_needed
5012 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5013 /* If writeback is used when restoring callee-saves, the CFA
5014 is restored on the instruction doing the writeback. */
5015 aarch64_add_offset (Pmode, stack_pointer_rtx,
5016 hard_frame_pointer_rtx, -callee_offset,
5017 ip1_rtx, ip0_rtx, callee_adjust == 0);
5018 else
5019 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5020 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5022 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5023 callee_adjust != 0, &cfi_ops);
5024 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5025 callee_adjust != 0, &cfi_ops);
5027 if (need_barrier_p)
5028 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5030 if (callee_adjust != 0)
5031 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5033 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5035 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5036 insn = get_last_insn ();
5037 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5038 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5039 RTX_FRAME_RELATED_P (insn) = 1;
5040 cfi_ops = NULL;
5043 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5044 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5046 if (cfi_ops)
5048 /* Emit delayed restores and reset the CFA to be SP. */
5049 insn = get_last_insn ();
5050 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5051 REG_NOTES (insn) = cfi_ops;
5052 RTX_FRAME_RELATED_P (insn) = 1;
5055 /* We prefer to emit the combined return/authenticate instruction RETAA,
5056 however there are three cases in which we must instead emit an explicit
5057 authentication instruction.
5059 1) Sibcalls don't return in a normal way, so if we're about to call one
5060 we must authenticate.
5062 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5063 generating code for !TARGET_ARMV8_3 we can't use it and must
5064 explicitly authenticate.
5066 3) On an eh_return path we make extra stack adjustments to update the
5067 canonical frame address to be the exception handler's CFA. We want
5068 to authenticate using the CFA of the function which calls eh_return. */
5070 if (aarch64_return_address_signing_enabled ()
5071 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5073 insn = emit_insn (gen_autisp ());
5074 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5075 RTX_FRAME_RELATED_P (insn) = 1;
5078 /* Stack adjustment for exception handler. */
5079 if (crtl->calls_eh_return)
5081 /* We need to unwind the stack by the offset computed by
5082 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5083 to be SP; letting the CFA move during this adjustment
5084 is just as correct as retaining the CFA from the body
5085 of the function. Therefore, do nothing special. */
5086 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5089 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5090 if (!for_sibcall)
5091 emit_jump_insn (ret_rtx);
5094 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5095 normally or return to a previous frame after unwinding.
5097 An EH return uses a single shared return sequence. The epilogue is
5098 exactly like a normal epilogue except that it has an extra input
5099 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5100 that must be applied after the frame has been destroyed. An extra label
5101 is inserted before the epilogue which initializes this register to zero,
5102 and this is the entry point for a normal return.
5104 An actual EH return updates the return address, initializes the stack
5105 adjustment and jumps directly into the epilogue (bypassing the zeroing
5106 of the adjustment). Since the return address is typically saved on the
5107 stack when a function makes a call, the saved LR must be updated outside
5108 the epilogue.
5110 This poses problems as the store is generated well before the epilogue,
5111 so the offset of LR is not known yet. Also optimizations will remove the
5112 store as it appears dead, even after the epilogue is generated (as the
5113 base or offset for loading LR is different in many cases).
5115 To avoid these problems this implementation forces the frame pointer
5116 in eh_return functions so that the location of LR is fixed and known early.
5117 It also marks the store volatile, so no optimization is permitted to
5118 remove the store. */
5119 rtx
5120 aarch64_eh_return_handler_rtx (void)
5122 rtx tmp = gen_frame_mem (Pmode,
5123 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5125 /* Mark the store volatile, so no optimization is permitted to remove it. */
5126 MEM_VOLATILE_P (tmp) = true;
5127 return tmp;
5130 /* Output code to add DELTA to the first argument, and then jump
5131 to FUNCTION. Used for C++ multiple inheritance. */
5132 static void
5133 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5134 HOST_WIDE_INT delta,
5135 HOST_WIDE_INT vcall_offset,
5136 tree function)
5138 /* The this pointer is always in x0. Note that this differs from
5139 Arm where the this pointer may be bumped to r1 if r0 is required
5140 to return a pointer to an aggregate. On AArch64 a result value
5141 pointer will be in x8. */
5142 int this_regno = R0_REGNUM;
5143 rtx this_rtx, temp0, temp1, addr, funexp;
5144 rtx_insn *insn;
5146 reload_completed = 1;
5147 emit_note (NOTE_INSN_PROLOGUE_END);
5149 this_rtx = gen_rtx_REG (Pmode, this_regno);
5150 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5151 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5153 if (vcall_offset == 0)
5154 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5155 else
5157 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5159 addr = this_rtx;
5160 if (delta != 0)
5162 if (delta >= -256 && delta < 256)
5163 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5164 plus_constant (Pmode, this_rtx, delta));
5165 else
5166 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5167 temp1, temp0, false);
5170 if (Pmode == ptr_mode)
5171 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5172 else
5173 aarch64_emit_move (temp0,
5174 gen_rtx_ZERO_EXTEND (Pmode,
5175 gen_rtx_MEM (ptr_mode, addr)));
5177 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5178 addr = plus_constant (Pmode, temp0, vcall_offset);
5179 else
5181 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5182 Pmode);
5183 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5186 if (Pmode == ptr_mode)
5187 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5188 else
5189 aarch64_emit_move (temp1,
5190 gen_rtx_SIGN_EXTEND (Pmode,
5191 gen_rtx_MEM (ptr_mode, addr)));
5193 emit_insn (gen_add2_insn (this_rtx, temp1));
5196 /* Generate a tail call to the target function. */
5197 if (!TREE_USED (function))
5199 assemble_external (function);
5200 TREE_USED (function) = 1;
5202 funexp = XEXP (DECL_RTL (function), 0);
5203 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5204 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5205 SIBLING_CALL_P (insn) = 1;
5207 insn = get_insns ();
5208 shorten_branches (insn);
5209 final_start_function (insn, file, 1);
5210 final (insn, file, 1);
5211 final_end_function ();
5213 /* Stop pretending to be a post-reload pass. */
5214 reload_completed = 0;
5217 static bool
5218 aarch64_tls_referenced_p (rtx x)
5220 if (!TARGET_HAVE_TLS)
5221 return false;
5222 subrtx_iterator::array_type array;
5223 FOR_EACH_SUBRTX (iter, array, x, ALL)
5225 const_rtx x = *iter;
5226 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5227 return true;
5228 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5229 TLS offsets, not real symbol references. */
5230 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5231 iter.skip_subrtxes ();
5233 return false;
5237 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5238 a left shift of 0 or 12 bits. */
5239 bool
5240 aarch64_uimm12_shift (HOST_WIDE_INT val)
5242 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5243 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5244 );
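/* Editor's sketch (not part of aarch64.c): the same test restated with
   plain uint64_t, plus example values.  */
#include <stdbool.h>
#include <stdint.h>

static bool
uimm12_shift_ok (uint64_t val)
{
  return (val & 0xfffull) == val
         || (val & (0xfffull << 12)) == val;
}

/* uimm12_shift_ok (0xabc)    -> true   add  x0, x1, #0xabc
   uimm12_shift_ok (0xabc000) -> true   add  x0, x1, #0xabc, lsl #12
   uimm12_shift_ok (0xabc001) -> false  needs more than one add/sub  */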
5248 /* Return true if val is an immediate that can be loaded into a
5249 register by a MOVZ instruction. */
5250 static bool
5251 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5253 if (GET_MODE_SIZE (mode) > 4)
5255 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5256 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5257 return 1;
5259 else
5261 /* Ignore sign extension. */
5262 val &= (HOST_WIDE_INT) 0xffffffff;
5264 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5265 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
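/* Editor's sketch (not part of aarch64.c): a MOVZ-loadable value has all of
   its set bits within one aligned 16-bit half-word; the 32-bit case above
   simply discards the sign-extension bits first.  */
#include <stdbool.h>
#include <stdint.h>

static bool
movz_loadable (uint64_t val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffull << shift)) == val)
      return true;
  return false;
}

/* movz_loadable (0x2a)               -> true   movz x0, #0x2a
   movz_loadable (0xbeef000000000000) -> true   movz x0, #0xbeef, lsl #48
   movz_loadable (0x10001)            -> false  spans two half-words  */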
5268 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5269 64-bit (DImode) integer. */
5271 static unsigned HOST_WIDE_INT
5272 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5274 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5275 while (size < 64)
5277 val &= (HOST_WIDE_INT_1U << size) - 1;
5278 val |= val << size;
5279 size *= 2;
5281 return val;
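/* Editor's sketch (not part of aarch64.c): the replication step widens a
   narrow element to 64 bits by repeated doubling, e.g. a 32-bit 0x0000ff00
   becomes 0x0000ff000000ff00 and a 16-bit 0x00f0 becomes
   0x00f000f000f000f0.  */
#include <stdint.h>

static uint64_t
replicate_to_64 (uint64_t val, unsigned elt_bits)
{
  for (unsigned size = elt_bits; size < 64; size *= 2)
    {
      val &= (1ull << size) - 1;   /* keep only the low SIZE bits */
      val |= val << size;          /* copy them into the next SIZE bits */
    }
  return val;
}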
5284 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5286 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5288 0x0000000100000001ull,
5289 0x0001000100010001ull,
5290 0x0101010101010101ull,
5291 0x1111111111111111ull,
5292 0x5555555555555555ull,
5296 /* Return true if val is a valid bitmask immediate. */
5298 bool
5299 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5301 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5302 int bits;
5304 /* Check for a single sequence of one bits and return quickly if so.
5305 The special cases of all ones and all zeroes return false. */
5306 val = aarch64_replicate_bitmask_imm (val_in, mode);
5307 tmp = val + (val & -val);
5309 if (tmp == (tmp & -tmp))
5310 return (val + 1) > 1;
5312 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5313 if (mode == SImode)
5314 val = (val << 32) | (val & 0xffffffff);
5316 /* Invert if the immediate doesn't start with a zero bit - this means we
5317 only need to search for sequences of one bits. */
5318 if (val & 1)
5319 val = ~val;
5321 /* Find the first set bit and set tmp to val with the first sequence of one
5322 bits removed. Return success if there is a single sequence of ones. */
5323 first_one = val & -val;
5324 tmp = val & (val + first_one);
5326 if (tmp == 0)
5327 return true;
5329 /* Find the next set bit and compute the difference in bit position. */
5330 next_one = tmp & -tmp;
5331 bits = clz_hwi (first_one) - clz_hwi (next_one);
5332 mask = val ^ tmp;
5334 /* Check the bit position difference is a power of 2, and that the first
5335 sequence of one bits fits within 'bits' bits. */
5336 if ((mask >> bits) != 0 || bits != (bits & -bits))
5337 return false;
5339 /* Check the sequence of one bits is repeated 64/bits times. */
5340 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
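/* Editor's sketch (not part of aarch64.c): a slow brute-force cross-check
   of the property tested above.  A valid bitmask immediate is a repetition,
   with element size 2, 4, 8, 16, 32 or 64 bits, of a rotated contiguous run
   of ones that is neither all-zeros nor all-ones; for example
   0x00ff00ff00ff00ff repeats an 8-one run in 16-bit elements.  */
#include <stdbool.h>
#include <stdint.h>

static bool
bitmask_imm_bruteforce (uint64_t val)
{
  for (unsigned esize = 2; esize <= 64; esize *= 2)
    for (unsigned ones = 1; ones < esize; ones++)
      for (unsigned rot = 0; rot < esize; rot++)
        {
          uint64_t elt = (1ull << ones) - 1;     /* ONES low bits set */
          if (rot != 0)                          /* rotate right within the element */
            elt = (elt >> rot) | (elt << (esize - rot));
          if (esize < 64)
            elt &= (1ull << esize) - 1;
          uint64_t pattern = elt;                /* replicate across 64 bits */
          for (unsigned size = esize; size < 64; size *= 2)
            pattern |= pattern << size;
          if (pattern == val)
            return true;
        }
  return false;
}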
5343 /* Create a mask of ones covering the lowest to the highest bit set in VAL_IN.
5344 Assumed precondition: VAL_IN is not zero. */
5346 unsigned HOST_WIDE_INT
5347 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5349 int lowest_bit_set = ctz_hwi (val_in);
5350 int highest_bit_set = floor_log2 (val_in);
5351 gcc_assert (val_in != 0);
5353 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5354 (HOST_WIDE_INT_1U << lowest_bit_set));
5357 /* Create a constant in which all bits outside the range from the lowest to
5358 the highest bit set in VAL_IN are set to 1. */
5360 unsigned HOST_WIDE_INT
5361 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5363 return val_in | ~aarch64_and_split_imm1 (val_in);
5366 /* Return true if VAL_IN is not encodable as a single immediate but can be split into two valid 'and' bitmask immediates. */
5368 bool
5369 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5371 scalar_int_mode int_mode;
5372 if (!is_a <scalar_int_mode> (mode, &int_mode))
5373 return false;
5375 if (aarch64_bitmask_imm (val_in, int_mode))
5376 return false;
5378 if (aarch64_move_imm (val_in, int_mode))
5379 return false;
5381 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5383 return aarch64_bitmask_imm (imm2, int_mode);
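/* Editor's sketch (not part of aarch64.c): a worked example of the split
   that the test above enables, using plain uint64_t and GCC builtins.  The
   value 0x00ff000000000f00 is neither a bitmask nor a MOVZ/MOVN immediate,
   but it splits into two encodable AND immediates.  */
#include <stdint.h>
#include <stdio.h>

static uint64_t
split_imm1 (uint64_t val)   /* contiguous mask, lowest..highest set bit */
{
  int lowest = __builtin_ctzll (val);
  int highest = 63 - __builtin_clzll (val);
  return (2ull << highest) - (1ull << lowest);
}

int
main (void)
{
  uint64_t val = 0x00ff000000000f00ull;
  uint64_t imm1 = split_imm1 (val);   /* 0x00ffffffffffff00: one run of ones */
  uint64_t imm2 = val | ~imm1;        /* 0xffff000000000fff: wrap-around run */
  /* and x0, x1, #imm1 ; and x0, x0, #imm2  computes  x1 & val.  */
  printf ("%s\n", (imm1 & imm2) == val ? "ok" : "bug");
  return 0;
}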
5386 /* Return true if val is an immediate that can be loaded into a
5387 register in a single instruction. */
5388 bool
5389 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5391 scalar_int_mode int_mode;
5392 if (!is_a <scalar_int_mode> (mode, &int_mode))
5393 return false;
5395 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5396 return 1;
5397 return aarch64_bitmask_imm (val, int_mode);
5400 static bool
5401 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5403 rtx base, offset;
5405 if (GET_CODE (x) == HIGH)
5406 return true;
5408 /* There's no way to calculate VL-based values using relocations. */
5409 subrtx_iterator::array_type array;
5410 FOR_EACH_SUBRTX (iter, array, x, ALL)
5411 if (GET_CODE (*iter) == CONST_POLY_INT)
5412 return true;
5414 split_const (x, &base, &offset);
5415 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5417 if (aarch64_classify_symbol (base, INTVAL (offset))
5418 != SYMBOL_FORCE_TO_MEM)
5419 return true;
5420 else
5421 /* Avoid generating a 64-bit relocation in ILP32; leave
5422 to aarch64_expand_mov_immediate to handle it properly. */
5423 return mode != ptr_mode;
5426 return aarch64_tls_referenced_p (x);
5429 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5430 The expansion for a table switch is quite expensive due to the number
5431 of instructions, the table lookup and the hard-to-predict indirect jump.
5432 When optimizing for speed at -O3 or higher, use the per-core tuning if
5433 set, otherwise use tables for > 16 cases as a tradeoff between size and
5434 performance. When optimizing for size, use the default setting. */
5436 static unsigned int
5437 aarch64_case_values_threshold (void)
5439 /* Use the specified limit for the number of cases before using jump
5440 tables at higher optimization levels. */
5441 if (optimize > 2
5442 && selected_cpu->tune->max_case_values != 0)
5443 return selected_cpu->tune->max_case_values;
5444 else
5445 return optimize_size ? default_case_values_threshold () : 17;
5448 /* Return true if register REGNO is a valid index register.
5449 STRICT_P is true if REG_OK_STRICT is in effect. */
5451 bool
5452 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5454 if (!HARD_REGISTER_NUM_P (regno))
5456 if (!strict_p)
5457 return true;
5459 if (!reg_renumber)
5460 return false;
5462 regno = reg_renumber[regno];
5464 return GP_REGNUM_P (regno);
5467 /* Return true if register REGNO is a valid base register.
5468 STRICT_P is true if REG_OK_STRICT is in effect. */
5470 bool
5471 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5473 if (!HARD_REGISTER_NUM_P (regno))
5475 if (!strict_p)
5476 return true;
5478 if (!reg_renumber)
5479 return false;
5481 regno = reg_renumber[regno];
5484 /* The fake registers will be eliminated to either the stack or
5485 hard frame pointer, both of which are usually valid base registers.
5486 Reload deals with the cases where the eliminated form isn't valid. */
5487 return (GP_REGNUM_P (regno)
5488 || regno == SP_REGNUM
5489 || regno == FRAME_POINTER_REGNUM
5490 || regno == ARG_POINTER_REGNUM);
5493 /* Return true if X is a valid base register.
5494 STRICT_P is true if REG_OK_STRICT is in effect. */
5496 static bool
5497 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5499 if (!strict_p
5500 && GET_CODE (x) == SUBREG
5501 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5502 x = SUBREG_REG (x);
5504 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5507 /* Return true if address offset is a valid index. If it is, fill in INFO
5508 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5510 static bool
5511 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5512 machine_mode mode, bool strict_p)
5514 enum aarch64_address_type type;
5515 rtx index;
5516 int shift;
5518 /* (reg:P) */
5519 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5520 && GET_MODE (x) == Pmode)
5522 type = ADDRESS_REG_REG;
5523 index = x;
5524 shift = 0;
5526 /* (sign_extend:DI (reg:SI)) */
5527 else if ((GET_CODE (x) == SIGN_EXTEND
5528 || GET_CODE (x) == ZERO_EXTEND)
5529 && GET_MODE (x) == DImode
5530 && GET_MODE (XEXP (x, 0)) == SImode)
5532 type = (GET_CODE (x) == SIGN_EXTEND)
5533 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5534 index = XEXP (x, 0);
5535 shift = 0;
5537 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5538 else if (GET_CODE (x) == MULT
5539 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5540 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5541 && GET_MODE (XEXP (x, 0)) == DImode
5542 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5543 && CONST_INT_P (XEXP (x, 1)))
5545 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5546 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5547 index = XEXP (XEXP (x, 0), 0);
5548 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5550 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5551 else if (GET_CODE (x) == ASHIFT
5552 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5553 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5554 && GET_MODE (XEXP (x, 0)) == DImode
5555 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5556 && CONST_INT_P (XEXP (x, 1)))
5558 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5559 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5560 index = XEXP (XEXP (x, 0), 0);
5561 shift = INTVAL (XEXP (x, 1));
5563 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5564 else if ((GET_CODE (x) == SIGN_EXTRACT
5565 || GET_CODE (x) == ZERO_EXTRACT)
5566 && GET_MODE (x) == DImode
5567 && GET_CODE (XEXP (x, 0)) == MULT
5568 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5569 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5571 type = (GET_CODE (x) == SIGN_EXTRACT)
5572 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5573 index = XEXP (XEXP (x, 0), 0);
5574 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5575 if (INTVAL (XEXP (x, 1)) != 32 + shift
5576 || INTVAL (XEXP (x, 2)) != 0)
5577 shift = -1;
5579 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5580 (const_int 0xffffffff<<shift)) */
5581 else if (GET_CODE (x) == AND
5582 && GET_MODE (x) == DImode
5583 && GET_CODE (XEXP (x, 0)) == MULT
5584 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5585 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5586 && CONST_INT_P (XEXP (x, 1)))
5588 type = ADDRESS_REG_UXTW;
5589 index = XEXP (XEXP (x, 0), 0);
5590 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5591 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5592 shift = -1;
5594 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5595 else if ((GET_CODE (x) == SIGN_EXTRACT
5596 || GET_CODE (x) == ZERO_EXTRACT)
5597 && GET_MODE (x) == DImode
5598 && GET_CODE (XEXP (x, 0)) == ASHIFT
5599 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5600 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5602 type = (GET_CODE (x) == SIGN_EXTRACT)
5603 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5604 index = XEXP (XEXP (x, 0), 0);
5605 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5606 if (INTVAL (XEXP (x, 1)) != 32 + shift
5607 || INTVAL (XEXP (x, 2)) != 0)
5608 shift = -1;
5610 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5611 (const_int 0xffffffff<<shift)) */
5612 else if (GET_CODE (x) == AND
5613 && GET_MODE (x) == DImode
5614 && GET_CODE (XEXP (x, 0)) == ASHIFT
5615 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5616 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5617 && CONST_INT_P (XEXP (x, 1)))
5619 type = ADDRESS_REG_UXTW;
5620 index = XEXP (XEXP (x, 0), 0);
5621 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5622 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5623 shift = -1;
5625 /* (mult:P (reg:P) (const_int scale)) */
5626 else if (GET_CODE (x) == MULT
5627 && GET_MODE (x) == Pmode
5628 && GET_MODE (XEXP (x, 0)) == Pmode
5629 && CONST_INT_P (XEXP (x, 1)))
5631 type = ADDRESS_REG_REG;
5632 index = XEXP (x, 0);
5633 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5635 /* (ashift:P (reg:P) (const_int shift)) */
5636 else if (GET_CODE (x) == ASHIFT
5637 && GET_MODE (x) == Pmode
5638 && GET_MODE (XEXP (x, 0)) == Pmode
5639 && CONST_INT_P (XEXP (x, 1)))
5641 type = ADDRESS_REG_REG;
5642 index = XEXP (x, 0);
5643 shift = INTVAL (XEXP (x, 1));
5645 else
5646 return false;
5648 if (!strict_p
5649 && GET_CODE (index) == SUBREG
5650 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5651 index = SUBREG_REG (index);
5653 if (aarch64_sve_data_mode_p (mode))
5655 if (type != ADDRESS_REG_REG
5656 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5657 return false;
5659 else
5661 if (shift != 0
5662 && !(IN_RANGE (shift, 1, 3)
5663 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5664 return false;
5667 if (REG_P (index)
5668 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5670 info->type = type;
5671 info->offset = index;
5672 info->shift = shift;
5673 return true;
5676 return false;
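/* Hedged example (added for illustration, not in the original sources):
   for a 4-byte access whose index rtx is

     (mult:DI (zero_extend:DI (reg:SI w2)) (const_int 4))

   the code above classifies the index as ADDRESS_REG_UXTW with shift 2,
   and the operand printer later emits the "[x1, w2, uxtw 2]" addressing
   form (assuming x1 is the base register supplied by the caller).  */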
5679 /* Return true if MODE is one of the modes for which we
5680 support LDP/STP operations. */
5682 static bool
5683 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5685 return mode == SImode || mode == DImode
5686 || mode == SFmode || mode == DFmode
5687 || (aarch64_vector_mode_supported_p (mode)
5688 && (known_eq (GET_MODE_SIZE (mode), 8)
5689 || (known_eq (GET_MODE_SIZE (mode), 16)
5690 && (aarch64_tune_params.extra_tuning_flags
5691 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5694 /* Return true if REGNO is a virtual pointer register, or an eliminable
5695 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5696 include stack_pointer or hard_frame_pointer. */
5697 static bool
5698 virt_or_elim_regno_p (unsigned regno)
5700 return ((regno >= FIRST_VIRTUAL_REGISTER
5701 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5702 || regno == FRAME_POINTER_REGNUM
5703 || regno == ARG_POINTER_REGNUM);
5706 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5707 If it is, fill in INFO appropriately. STRICT_P is true if
5708 REG_OK_STRICT is in effect. */
5710 bool
5711 aarch64_classify_address (struct aarch64_address_info *info,
5712 rtx x, machine_mode mode, bool strict_p,
5713 aarch64_addr_query_type type)
5715 enum rtx_code code = GET_CODE (x);
5716 rtx op0, op1;
5717 poly_int64 offset;
5719 HOST_WIDE_INT const_size;
5721 /* On BE, we use load/store pair for all large int mode load/stores.
5722 TI/TFmode may also use a load/store pair. */
5723 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5724 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5725 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5726 || type == ADDR_QUERY_LDP_STP_N
5727 || mode == TImode
5728 || mode == TFmode
5729 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5731 /* An ADDR_QUERY_LDP_STP_N query means that the incoming mode describes
5732 the full size of the memory being loaded/stored, while the mode used
5733 for the addressing calculation is half of that. */
5734 if (type == ADDR_QUERY_LDP_STP_N
5735 && known_eq (GET_MODE_SIZE (mode), 16))
5736 mode = DFmode;
5738 bool allow_reg_index_p = (!load_store_pair_p
5739 && (known_lt (GET_MODE_SIZE (mode), 16)
5740 || vec_flags == VEC_ADVSIMD
5741 || vec_flags == VEC_SVE_DATA));
5743 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5744 [Rn, #offset, MUL VL]. */
5745 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5746 && (code != REG && code != PLUS))
5747 return false;
5749 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5750 REG addressing. */
5751 if (advsimd_struct_p
5752 && !BYTES_BIG_ENDIAN
5753 && (code != POST_INC && code != REG))
5754 return false;
5756 gcc_checking_assert (GET_MODE (x) == VOIDmode
5757 || SCALAR_INT_MODE_P (GET_MODE (x)));
5759 switch (code)
5761 case REG:
5762 case SUBREG:
5763 info->type = ADDRESS_REG_IMM;
5764 info->base = x;
5765 info->offset = const0_rtx;
5766 info->const_offset = 0;
5767 return aarch64_base_register_rtx_p (x, strict_p);
5769 case PLUS:
5770 op0 = XEXP (x, 0);
5771 op1 = XEXP (x, 1);
5773 if (! strict_p
5774 && REG_P (op0)
5775 && virt_or_elim_regno_p (REGNO (op0))
5776 && poly_int_rtx_p (op1, &offset))
5778 info->type = ADDRESS_REG_IMM;
5779 info->base = op0;
5780 info->offset = op1;
5781 info->const_offset = offset;
5783 return true;
5786 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5787 && aarch64_base_register_rtx_p (op0, strict_p)
5788 && poly_int_rtx_p (op1, &offset))
5790 info->type = ADDRESS_REG_IMM;
5791 info->base = op0;
5792 info->offset = op1;
5793 info->const_offset = offset;
5795 /* TImode and TFmode values are allowed in both pairs of X
5796 registers and individual Q registers. The available
5797 address modes are:
5798 X,X: 7-bit signed scaled offset
5799 Q: 9-bit signed offset
5800 We conservatively require an offset representable in either mode.
5801 When performing the check for pairs of X registers i.e. LDP/STP
5802 pass down DImode since that is the natural size of the LDP/STP
5803 instruction memory accesses. */
5804 if (mode == TImode || mode == TFmode)
5805 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5806 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
5807 || offset_12bit_unsigned_scaled_p (mode, offset)));
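/* Illustrative note (added, not from the original sources): for a pair of
   X registers the 7-bit signed scaled offset covers -512..504 in steps of 8,
   while the single-Q-register forms cover -256..255 (unscaled 9-bit) or
   0..65520 in steps of 16 (unsigned scaled 12-bit); e.g. offset 40 passes
   both checks above, whereas offset 520 fails the first one.  */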
5809 /* A 7-bit offset check because OImode will emit an ldp/stp
5810 instruction (only big endian will get here).
5811 For ldp/stp instructions, the offset is scaled by the size of a
5812 single element of the pair. */
5813 if (mode == OImode)
5814 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5816 /* Three 9/12-bit offset checks because CImode will emit three
5817 ldr/str instructions (only big endian will get here). */
5818 if (mode == CImode)
5819 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5820 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
5821 offset + 32)
5822 || offset_12bit_unsigned_scaled_p (V16QImode,
5823 offset + 32)));
5825 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5826 instructions (only big endian will get here). */
5827 if (mode == XImode)
5828 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5829 && aarch64_offset_7bit_signed_scaled_p (TImode,
5830 offset + 32));
5832 /* Make "m" use the LD1 offset range for SVE data modes, so
5833 that pre-RTL optimizers like ivopts work to that range
5834 instead of the wider LDR/STR range. */
5835 if (vec_flags == VEC_SVE_DATA)
5836 return (type == ADDR_QUERY_M
5837 ? offset_4bit_signed_scaled_p (mode, offset)
5838 : offset_9bit_signed_scaled_p (mode, offset));
5840 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5842 poly_int64 end_offset = (offset
5843 + GET_MODE_SIZE (mode)
5844 - BYTES_PER_SVE_VECTOR);
5845 return (type == ADDR_QUERY_M
5846 ? offset_4bit_signed_scaled_p (mode, offset)
5847 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5848 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5849 end_offset)));
5852 if (vec_flags == VEC_SVE_PRED)
5853 return offset_9bit_signed_scaled_p (mode, offset);
5855 if (load_store_pair_p)
5856 return ((known_eq (GET_MODE_SIZE (mode), 4)
5857 || known_eq (GET_MODE_SIZE (mode), 8)
5858 || known_eq (GET_MODE_SIZE (mode), 16))
5859 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5860 else
5861 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
5862 || offset_12bit_unsigned_scaled_p (mode, offset));
5865 if (allow_reg_index_p)
5867 /* Look for base + (scaled/extended) index register. */
5868 if (aarch64_base_register_rtx_p (op0, strict_p)
5869 && aarch64_classify_index (info, op1, mode, strict_p))
5871 info->base = op0;
5872 return true;
5874 if (aarch64_base_register_rtx_p (op1, strict_p)
5875 && aarch64_classify_index (info, op0, mode, strict_p))
5877 info->base = op1;
5878 return true;
5882 return false;
5884 case POST_INC:
5885 case POST_DEC:
5886 case PRE_INC:
5887 case PRE_DEC:
5888 info->type = ADDRESS_REG_WB;
5889 info->base = XEXP (x, 0);
5890 info->offset = NULL_RTX;
5891 return aarch64_base_register_rtx_p (info->base, strict_p);
5893 case POST_MODIFY:
5894 case PRE_MODIFY:
5895 info->type = ADDRESS_REG_WB;
5896 info->base = XEXP (x, 0);
5897 if (GET_CODE (XEXP (x, 1)) == PLUS
5898 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5899 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5900 && aarch64_base_register_rtx_p (info->base, strict_p))
5902 info->offset = XEXP (XEXP (x, 1), 1);
5903 info->const_offset = offset;
5905 /* TImode and TFmode values are allowed in both pairs of X
5906 registers and individual Q registers. The available
5907 address modes are:
5908 X,X: 7-bit signed scaled offset
5909 Q: 9-bit signed offset
5910 We conservatively require an offset representable in either mode. */
5912 if (mode == TImode || mode == TFmode)
5913 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5914 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
5916 if (load_store_pair_p)
5917 return ((known_eq (GET_MODE_SIZE (mode), 4)
5918 || known_eq (GET_MODE_SIZE (mode), 8)
5919 || known_eq (GET_MODE_SIZE (mode), 16))
5920 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5921 else
5922 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
5924 return false;
5926 case CONST:
5927 case SYMBOL_REF:
5928 case LABEL_REF:
5929 /* load literal: pc-relative constant pool entry. Only supported
5930 for SI mode or larger. */
5931 info->type = ADDRESS_SYMBOLIC;
5933 if (!load_store_pair_p
5934 && GET_MODE_SIZE (mode).is_constant (&const_size)
5935 && const_size >= 4)
5937 rtx sym, addend;
5939 split_const (x, &sym, &addend);
5940 return ((GET_CODE (sym) == LABEL_REF
5941 || (GET_CODE (sym) == SYMBOL_REF
5942 && CONSTANT_POOL_ADDRESS_P (sym)
5943 && aarch64_pcrelative_literal_loads)));
5945 return false;
5947 case LO_SUM:
5948 info->type = ADDRESS_LO_SUM;
5949 info->base = XEXP (x, 0);
5950 info->offset = XEXP (x, 1);
5951 if (allow_reg_index_p
5952 && aarch64_base_register_rtx_p (info->base, strict_p))
5954 rtx sym, offs;
5955 split_const (info->offset, &sym, &offs);
5956 if (GET_CODE (sym) == SYMBOL_REF
5957 && (aarch64_classify_symbol (sym, INTVAL (offs))
5958 == SYMBOL_SMALL_ABSOLUTE))
5960 /* The symbol and offset must be aligned to the access size. */
5961 unsigned int align;
5963 if (CONSTANT_POOL_ADDRESS_P (sym))
5964 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5965 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5967 tree exp = SYMBOL_REF_DECL (sym);
5968 align = TYPE_ALIGN (TREE_TYPE (exp));
5969 align = aarch64_constant_alignment (exp, align);
5971 else if (SYMBOL_REF_DECL (sym))
5972 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5973 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5974 && SYMBOL_REF_BLOCK (sym) != NULL)
5975 align = SYMBOL_REF_BLOCK (sym)->alignment;
5976 else
5977 align = BITS_PER_UNIT;
5979 poly_int64 ref_size = GET_MODE_SIZE (mode);
5980 if (known_eq (ref_size, 0))
5981 ref_size = GET_MODE_SIZE (DImode);
5983 return (multiple_p (INTVAL (offs), ref_size)
5984 && multiple_p (align / BITS_PER_UNIT, ref_size));
5987 return false;
5989 default:
5990 return false;
5994 /* Return true if the address X is valid for a PRFM instruction.
5995 STRICT_P is true if we should do strict checking with
5996 aarch64_classify_address. */
5998 bool
5999 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6001 struct aarch64_address_info addr;
6003 /* PRFM accepts the same addresses as DImode... */
6004 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6005 if (!res)
6006 return false;
6008 /* ... except writeback forms. */
6009 return addr.type != ADDRESS_REG_WB;
6012 bool
6013 aarch64_symbolic_address_p (rtx x)
6015 rtx offset;
6017 split_const (x, &x, &offset);
6018 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6021 /* Classify the base of symbolic expression X. */
6023 enum aarch64_symbol_type
6024 aarch64_classify_symbolic_expression (rtx x)
6026 rtx offset;
6028 split_const (x, &x, &offset);
6029 return aarch64_classify_symbol (x, INTVAL (offset));
6033 /* Return TRUE if X is a legitimate address for accessing memory in
6034 mode MODE. */
6035 static bool
6036 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6038 struct aarch64_address_info addr;
6040 return aarch64_classify_address (&addr, x, mode, strict_p);
6043 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6044 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6045 bool
6046 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6047 aarch64_addr_query_type type)
6049 struct aarch64_address_info addr;
6051 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6054 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6056 static bool
6057 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6058 poly_int64 orig_offset,
6059 machine_mode mode)
6061 HOST_WIDE_INT size;
6062 if (GET_MODE_SIZE (mode).is_constant (&size))
6064 HOST_WIDE_INT const_offset, second_offset;
6066 /* A general SVE offset is A * VQ + B. Remove the A component from
6067 coefficient 0 in order to get the constant B. */
6068 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6070 /* Split an out-of-range address displacement into a base and
6071 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
6072 range otherwise, to increase the chance that accesses of different
6073 sizes share the same base address. Unaligned accesses use the
6074 signed 9-bit range; TImode/TFmode use the intersection of the
6075 signed scaled 7-bit and signed 9-bit offset ranges. */
6076 if (mode == TImode || mode == TFmode)
6077 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6078 else if ((const_offset & (size - 1)) != 0)
6079 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6080 else
6081 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6083 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6084 return false;
6086 /* Split the offset into second_offset and the rest. */
6087 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6088 *offset2 = gen_int_mode (second_offset, Pmode);
6089 return true;
6091 else
6093 /* Get the mode we should use as the basis of the range. For structure
6094 modes this is the mode of one vector. */
6095 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6096 machine_mode step_mode
6097 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6099 /* Get the "mul vl" multiplier we'd like to use. */
6100 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6101 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6102 if (vec_flags & VEC_SVE_DATA)
6103 /* LDR supports a 9-bit range, but the move patterns for
6104 structure modes require all vectors to be in range of the
6105 same base. The simplest way of accommodating that while still
6106 promoting reuse of anchor points between different modes is
6107 to use an 8-bit range unconditionally. */
6108 vnum = ((vnum + 128) & 255) - 128;
6109 else
6110 /* Predicates are only handled singly, so we might as well use
6111 the full range. */
6112 vnum = ((vnum + 256) & 511) - 256;
6113 if (vnum == 0)
6114 return false;
6116 /* Convert the "mul vl" multiplier into a byte offset. */
6117 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6118 if (known_eq (second_offset, orig_offset))
6119 return false;
6121 /* Split the offset into second_offset and the rest. */
6122 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6123 *offset2 = gen_int_mode (second_offset, Pmode);
6124 return true;
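/* Hedged worked example (added for illustration, not in the original
   sources): for a 4-byte access at constant orig_offset 0x12344, the
   aligned case above computes second_offset = 0x12344 & 0x3ffc = 0x2344,
   so the displacement is split into *offset1 = 0x10000 (added to the base)
   and *offset2 = 0x2344, which fits the unsigned scaled 12-bit LDR/STR
   offset range for a 4-byte access.  */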
6128 /* Return the binary representation of floating point constant VALUE in INTVAL.
6129 If the value cannot be converted, return false without setting INTVAL.
6130 The conversion is done in the given MODE. */
6131 bool
6132 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6135 /* We make a general exception for 0. */
6136 if (aarch64_float_const_zero_rtx_p (value))
6138 *intval = 0;
6139 return true;
6142 scalar_float_mode mode;
6143 if (GET_CODE (value) != CONST_DOUBLE
6144 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6145 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6146 /* Only support up to DF mode. */
6147 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6148 return false;
6150 unsigned HOST_WIDE_INT ival = 0;
6152 long res[2];
6153 real_to_target (res,
6154 CONST_DOUBLE_REAL_VALUE (value),
6155 REAL_MODE_FORMAT (mode));
6157 if (mode == DFmode)
6159 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6160 ival = zext_hwi (res[order], 32);
6161 ival |= (zext_hwi (res[1 - order], 32) << 32);
6163 else
6164 ival = zext_hwi (res[0], 32);
6166 *intval = ival;
6167 return true;
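/* Hedged example (added for illustration): for the DFmode constant 1.0 the
   routine above stores the IEEE double-precision bit pattern
   0x3ff0000000000000 in *INTVAL, and for SFmode 1.0 it stores 0x3f800000;
   a constant wider than DFmode (e.g. TFmode) is rejected.  */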
6170 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6171 single MOV(+MOVK) followed by an FMOV. */
6172 bool
6173 aarch64_float_const_rtx_p (rtx x)
6175 machine_mode mode = GET_MODE (x);
6176 if (mode == VOIDmode)
6177 return false;
6179 /* Determine whether it's cheaper to write float constants as
6180 mov/movk pairs over ldr/adrp pairs. */
6181 unsigned HOST_WIDE_INT ival;
6183 if (GET_CODE (x) == CONST_DOUBLE
6184 && SCALAR_FLOAT_MODE_P (mode)
6185 && aarch64_reinterpret_float_as_int (x, &ival))
6187 scalar_int_mode imode = (mode == HFmode
6188 ? SImode
6189 : int_mode_for_mode (mode).require ());
6190 int num_instr = aarch64_internal_mov_immediate
6191 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6192 return num_instr < 3;
6195 return false;
6198 /* Return TRUE if rtx X is immediate constant 0.0 */
6199 bool
6200 aarch64_float_const_zero_rtx_p (rtx x)
6202 if (GET_MODE (x) == VOIDmode)
6203 return false;
6205 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6206 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6207 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6210 /* Return TRUE if rtx X is an immediate constant that fits in a single
6211 MOVI immediate operation. */
6212 bool
6213 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6215 if (!TARGET_SIMD)
6216 return false;
6218 machine_mode vmode;
6219 scalar_int_mode imode;
6220 unsigned HOST_WIDE_INT ival;
6222 if (GET_CODE (x) == CONST_DOUBLE
6223 && SCALAR_FLOAT_MODE_P (mode))
6225 if (!aarch64_reinterpret_float_as_int (x, &ival))
6226 return false;
6228 /* We make a general exception for 0. */
6229 if (aarch64_float_const_zero_rtx_p (x))
6230 return true;
6232 imode = int_mode_for_mode (mode).require ();
6234 else if (GET_CODE (x) == CONST_INT
6235 && is_a <scalar_int_mode> (mode, &imode))
6236 ival = INTVAL (x);
6237 else
6238 return false;
6240 /* Use a 64-bit container mode for everything except DI/DF mode, where we
6241 use a 128-bit vector mode. */
6242 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6244 vmode = aarch64_simd_container_mode (imode, width);
6245 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6247 return aarch64_simd_valid_immediate (v_op, NULL);
6251 /* Return the fixed registers used for condition codes. */
6253 static bool
6254 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6256 *p1 = CC_REGNUM;
6257 *p2 = INVALID_REGNUM;
6258 return true;
6261 /* This function is used by the call expanders of the machine description.
6262 RESULT is the register in which the result is returned. It's NULL for
6263 "call" and "sibcall".
6264 MEM is the location of the function call.
6265 SIBCALL indicates whether this is a normal call or a sibling call;
6266 a different pattern is generated accordingly. */
6268 void
6269 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6271 rtx call, callee, tmp;
6272 rtvec vec;
6273 machine_mode mode;
6275 gcc_assert (MEM_P (mem));
6276 callee = XEXP (mem, 0);
6277 mode = GET_MODE (callee);
6278 gcc_assert (mode == Pmode);
6280 /* Decide if we should generate indirect calls by loading the
6281 address of the callee into a register before performing
6282 the branch-and-link. */
6283 if (SYMBOL_REF_P (callee)
6284 ? (aarch64_is_long_call_p (callee)
6285 || aarch64_is_noplt_call_p (callee))
6286 : !REG_P (callee))
6287 XEXP (mem, 0) = force_reg (mode, callee);
6289 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6291 if (result != NULL_RTX)
6292 call = gen_rtx_SET (result, call);
6294 if (sibcall)
6295 tmp = ret_rtx;
6296 else
6297 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6299 vec = gen_rtvec (2, call, tmp);
6300 call = gen_rtx_PARALLEL (VOIDmode, vec);
6302 aarch64_emit_call_insn (call);
6305 /* Emit call insn with PAT and do aarch64-specific handling. */
6307 void
6308 aarch64_emit_call_insn (rtx pat)
6310 rtx insn = emit_call_insn (pat);
6312 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6313 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6314 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6317 machine_mode
6318 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6320 /* All floating point compares return CCFP if it is an equality
6321 comparison, and CCFPE otherwise. */
6322 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6324 switch (code)
6326 case EQ:
6327 case NE:
6328 case UNORDERED:
6329 case ORDERED:
6330 case UNLT:
6331 case UNLE:
6332 case UNGT:
6333 case UNGE:
6334 case UNEQ:
6335 return CCFPmode;
6337 case LT:
6338 case LE:
6339 case GT:
6340 case GE:
6341 case LTGT:
6342 return CCFPEmode;
6344 default:
6345 gcc_unreachable ();
6349 /* Equality comparisons of short modes against zero can be performed
6350 using the TST instruction with the appropriate bitmask. */
6351 if (y == const0_rtx && REG_P (x)
6352 && (code == EQ || code == NE)
6353 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6354 return CC_NZmode;
6356 /* Similarly, comparisons of zero_extends from shorter modes can
6357 be performed using an ANDS with an immediate mask. */
6358 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6359 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6360 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6361 && (code == EQ || code == NE))
6362 return CC_NZmode;
6364 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6365 && y == const0_rtx
6366 && (code == EQ || code == NE || code == LT || code == GE)
6367 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6368 || GET_CODE (x) == NEG
6369 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6370 && CONST_INT_P (XEXP (x, 2)))))
6371 return CC_NZmode;
6373 /* A compare with a shifted operand. Because of canonicalization,
6374 the comparison will have to be swapped when we emit the assembly
6375 code. */
6376 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6377 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6378 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6379 || GET_CODE (x) == LSHIFTRT
6380 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6381 return CC_SWPmode;
6383 /* Similarly for a negated operand, but we can only do this for
6384 equalities. */
6385 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6386 && (REG_P (y) || GET_CODE (y) == SUBREG)
6387 && (code == EQ || code == NE)
6388 && GET_CODE (x) == NEG)
6389 return CC_Zmode;
6391 /* A test for unsigned overflow. */
6392 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6393 && code == NE
6394 && GET_CODE (x) == PLUS
6395 && GET_CODE (y) == ZERO_EXTEND)
6396 return CC_Cmode;
6398 /* A test for signed overflow. */
6399 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6400 && code == NE
6401 && GET_CODE (x) == PLUS
6402 && GET_CODE (y) == SIGN_EXTEND)
6403 return CC_Vmode;
6405 /* For everything else, return CCmode. */
6406 return CCmode;
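/* Hedged example (added for illustration, not in the original sources):
   for CODE GT with X being (ashift:SI (reg) (const_int 2)) and Y a plain
   register, the function above returns CC_SWPmode, recording that the
   operands will be swapped when the comparison is emitted (roughly
   "cmp y, x_reg, lsl 2"), so GT is later mapped to the LT condition by
   aarch64_get_condition_code_1.  */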
6409 static int
6410 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6413 aarch64_get_condition_code (rtx x)
6415 machine_mode mode = GET_MODE (XEXP (x, 0));
6416 enum rtx_code comp_code = GET_CODE (x);
6418 if (GET_MODE_CLASS (mode) != MODE_CC)
6419 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6420 return aarch64_get_condition_code_1 (mode, comp_code);
6423 static int
6424 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6426 switch (mode)
6428 case E_CCFPmode:
6429 case E_CCFPEmode:
6430 switch (comp_code)
6432 case GE: return AARCH64_GE;
6433 case GT: return AARCH64_GT;
6434 case LE: return AARCH64_LS;
6435 case LT: return AARCH64_MI;
6436 case NE: return AARCH64_NE;
6437 case EQ: return AARCH64_EQ;
6438 case ORDERED: return AARCH64_VC;
6439 case UNORDERED: return AARCH64_VS;
6440 case UNLT: return AARCH64_LT;
6441 case UNLE: return AARCH64_LE;
6442 case UNGT: return AARCH64_HI;
6443 case UNGE: return AARCH64_PL;
6444 default: return -1;
6446 break;
6448 case E_CCmode:
6449 switch (comp_code)
6451 case NE: return AARCH64_NE;
6452 case EQ: return AARCH64_EQ;
6453 case GE: return AARCH64_GE;
6454 case GT: return AARCH64_GT;
6455 case LE: return AARCH64_LE;
6456 case LT: return AARCH64_LT;
6457 case GEU: return AARCH64_CS;
6458 case GTU: return AARCH64_HI;
6459 case LEU: return AARCH64_LS;
6460 case LTU: return AARCH64_CC;
6461 default: return -1;
6463 break;
6465 case E_CC_SWPmode:
6466 switch (comp_code)
6468 case NE: return AARCH64_NE;
6469 case EQ: return AARCH64_EQ;
6470 case GE: return AARCH64_LE;
6471 case GT: return AARCH64_LT;
6472 case LE: return AARCH64_GE;
6473 case LT: return AARCH64_GT;
6474 case GEU: return AARCH64_LS;
6475 case GTU: return AARCH64_CC;
6476 case LEU: return AARCH64_CS;
6477 case LTU: return AARCH64_HI;
6478 default: return -1;
6480 break;
6482 case E_CC_NZmode:
6483 switch (comp_code)
6485 case NE: return AARCH64_NE;
6486 case EQ: return AARCH64_EQ;
6487 case GE: return AARCH64_PL;
6488 case LT: return AARCH64_MI;
6489 default: return -1;
6491 break;
6493 case E_CC_Zmode:
6494 switch (comp_code)
6496 case NE: return AARCH64_NE;
6497 case EQ: return AARCH64_EQ;
6498 default: return -1;
6500 break;
6502 case E_CC_Cmode:
6503 switch (comp_code)
6505 case NE: return AARCH64_CS;
6506 case EQ: return AARCH64_CC;
6507 default: return -1;
6509 break;
6511 case E_CC_Vmode:
6512 switch (comp_code)
6514 case NE: return AARCH64_VS;
6515 case EQ: return AARCH64_VC;
6516 default: return -1;
6518 break;
6520 default:
6521 return -1;
6524 return -1;
6527 bool
6528 aarch64_const_vec_all_same_in_range_p (rtx x,
6529 HOST_WIDE_INT minval,
6530 HOST_WIDE_INT maxval)
6532 rtx elt;
6533 return (const_vec_duplicate_p (x, &elt)
6534 && CONST_INT_P (elt)
6535 && IN_RANGE (INTVAL (elt), minval, maxval));
6538 bool
6539 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6541 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6544 /* Return true if VEC is a constant in which every element is in the range
6545 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6547 static bool
6548 aarch64_const_vec_all_in_range_p (rtx vec,
6549 HOST_WIDE_INT minval,
6550 HOST_WIDE_INT maxval)
6552 if (GET_CODE (vec) != CONST_VECTOR
6553 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6554 return false;
6556 int nunits;
6557 if (!CONST_VECTOR_STEPPED_P (vec))
6558 nunits = const_vector_encoded_nelts (vec);
6559 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6560 return false;
6562 for (int i = 0; i < nunits; i++)
6564 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6565 if (!CONST_INT_P (vec_elem)
6566 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6567 return false;
6569 return true;
6572 /* N Z C V. */
6573 #define AARCH64_CC_V 1
6574 #define AARCH64_CC_C (1 << 1)
6575 #define AARCH64_CC_Z (1 << 2)
6576 #define AARCH64_CC_N (1 << 3)
6578 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6579 static const int aarch64_nzcv_codes[] =
6581 0, /* EQ, Z == 1. */
6582 AARCH64_CC_Z, /* NE, Z == 0. */
6583 0, /* CS, C == 1. */
6584 AARCH64_CC_C, /* CC, C == 0. */
6585 0, /* MI, N == 1. */
6586 AARCH64_CC_N, /* PL, N == 0. */
6587 0, /* VS, V == 1. */
6588 AARCH64_CC_V, /* VC, V == 0. */
6589 0, /* HI, C == 1 && Z == 0. */
6590 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6591 AARCH64_CC_V, /* GE, N == V. */
6592 0, /* LT, N != V. */
6593 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6594 0, /* LE, !(Z == 0 && N == V). */
6595 0, /* AL, Any. */
6596 0 /* NV, Any. */
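/* Observation added for illustration (hedged): each entry above encodes an
   NZCV setting under which the condition named in its comment is false,
   e.g. the EQ entry is 0 (Z clear, so EQ does not hold) and the GE entry
   sets only V (giving N != V, so GE does not hold).  These are the values
   printed by the 'k' operand modifier for conditional-compare instructions.  */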
6599 /* Print floating-point vector immediate operand X to F, negating it
6600 first if NEGATE is true. Return true on success, false if it isn't
6601 a constant we can handle. */
6603 static bool
6604 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6606 rtx elt;
6608 if (!const_vec_duplicate_p (x, &elt))
6609 return false;
6611 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6612 if (negate)
6613 r = real_value_negate (&r);
6615 /* We only handle the SVE single-bit immediates here. */
6616 if (real_equal (&r, &dconst0))
6617 asm_fprintf (f, "0.0");
6618 else if (real_equal (&r, &dconst1))
6619 asm_fprintf (f, "1.0");
6620 else if (real_equal (&r, &dconsthalf))
6621 asm_fprintf (f, "0.5");
6622 else
6623 return false;
6625 return true;
6628 /* Return the equivalent letter for size. */
6629 static char
6630 sizetochar (int size)
6632 switch (size)
6634 case 64: return 'd';
6635 case 32: return 's';
6636 case 16: return 'h';
6637 case 8 : return 'b';
6638 default: gcc_unreachable ();
6642 /* Print operand X to file F in a target specific manner according to CODE.
6643 The acceptable formatting commands given by CODE are:
6644 'c': An integer or symbol address without a preceding #
6645 sign.
6646 'C': Take the duplicated element in a vector constant
6647 and print it in hex.
6648 'D': Take the duplicated element in a vector constant
6649 and print it as an unsigned integer, in decimal.
6650 'e': Print the sign/zero-extend size as a character 8->b,
6651 16->h, 32->w.
6652 'p': Prints N such that 2^N == X (X must be power of 2 and
6653 const int).
6654 'P': Print the number of non-zero bits in X (a const_int).
6655 'H': Print the higher numbered register of a pair (TImode)
6656 of regs.
6657 'm': Print a condition (eq, ne, etc).
6658 'M': Same as 'm', but invert condition.
6659 'N': Take the duplicated element in a vector constant
6660 and print the negative of it in decimal.
6661 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6662 'S/T/U/V': Print a FP/SIMD register name for a register list.
6663 The register printed is the FP/SIMD register name
6664 of X + 0/1/2/3 for S/T/U/V.
6665 'R': Print a scalar FP/SIMD register name + 1.
6666 'X': Print bottom 16 bits of integer constant in hex.
6667 'w/x': Print a general register name or the zero register
6668 (32-bit or 64-bit).
6669 '0': Print a normal operand, if it's a general register,
6670 then we assume DImode.
6671 'k': Print NZCV for conditional compare instructions.
6672 'A': Output address constant representing the first
6673 argument of X, specifying a relocation offset
6674 if appropriate.
6675 'L': Output constant address specified by X
6676 with a relocation offset if appropriate.
6677 'G': Prints address of X, specifying a PC relative
6678 relocation mode if appropriate.
6679 'y': Output address of LDP or STP - this is used for
6680 some LDP/STPs which don't use a PARALLEL in their
6681 pattern (so the mode needs to be adjusted).
6682 'z': Output address of a typical LDP or STP. */
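/* Hedged usage example (added for illustration; the templates below are
   made up, not quotations from aarch64.md): in an output template such as
   "add\t%w0, %w1, %w2" the 'w' modifier above prints operands 0..2 as
   32-bit general registers (or wzr for a zero constant), while
   "fmov\t%s0, %w1" combines the scalar-FP 's' modifier with 'w'.  */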
6684 static void
6685 aarch64_print_operand (FILE *f, rtx x, int code)
6687 rtx elt;
6688 switch (code)
6690 case 'c':
6691 switch (GET_CODE (x))
6693 case CONST_INT:
6694 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6695 break;
6697 case SYMBOL_REF:
6698 output_addr_const (f, x);
6699 break;
6701 case CONST:
6702 if (GET_CODE (XEXP (x, 0)) == PLUS
6703 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6705 output_addr_const (f, x);
6706 break;
6708 /* Fall through. */
6710 default:
6711 output_operand_lossage ("unsupported operand for code '%c'", code);
6713 break;
6715 case 'e':
6717 int n;
6719 if (!CONST_INT_P (x)
6720 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6722 output_operand_lossage ("invalid operand for '%%%c'", code);
6723 return;
6726 switch (n)
6728 case 3:
6729 fputc ('b', f);
6730 break;
6731 case 4:
6732 fputc ('h', f);
6733 break;
6734 case 5:
6735 fputc ('w', f);
6736 break;
6737 default:
6738 output_operand_lossage ("invalid operand for '%%%c'", code);
6739 return;
6742 break;
6744 case 'p':
6746 int n;
6748 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6750 output_operand_lossage ("invalid operand for '%%%c'", code);
6751 return;
6754 asm_fprintf (f, "%d", n);
6756 break;
6758 case 'P':
6759 if (!CONST_INT_P (x))
6761 output_operand_lossage ("invalid operand for '%%%c'", code);
6762 return;
6765 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6766 break;
6768 case 'H':
6769 if (x == const0_rtx)
6771 asm_fprintf (f, "xzr");
6772 break;
6775 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6777 output_operand_lossage ("invalid operand for '%%%c'", code);
6778 return;
6781 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6782 break;
6784 case 'M':
6785 case 'm':
6787 int cond_code;
6788 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6789 if (x == const_true_rtx)
6791 if (code == 'M')
6792 fputs ("nv", f);
6793 return;
6796 if (!COMPARISON_P (x))
6798 output_operand_lossage ("invalid operand for '%%%c'", code);
6799 return;
6802 cond_code = aarch64_get_condition_code (x);
6803 gcc_assert (cond_code >= 0);
6804 if (code == 'M')
6805 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6806 fputs (aarch64_condition_codes[cond_code], f);
6808 break;
6810 case 'N':
6811 if (!const_vec_duplicate_p (x, &elt))
6813 output_operand_lossage ("invalid vector constant");
6814 return;
6817 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6818 asm_fprintf (f, "%wd", -INTVAL (elt));
6819 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6820 && aarch64_print_vector_float_operand (f, x, true))
6822 else
6824 output_operand_lossage ("invalid vector constant");
6825 return;
6827 break;
6829 case 'b':
6830 case 'h':
6831 case 's':
6832 case 'd':
6833 case 'q':
6834 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6836 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6837 return;
6839 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6840 break;
6842 case 'S':
6843 case 'T':
6844 case 'U':
6845 case 'V':
6846 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6848 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6849 return;
6851 asm_fprintf (f, "%c%d",
6852 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6853 REGNO (x) - V0_REGNUM + (code - 'S'));
6854 break;
6856 case 'R':
6857 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6859 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6860 return;
6862 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6863 break;
6865 case 'X':
6866 if (!CONST_INT_P (x))
6868 output_operand_lossage ("invalid operand for '%%%c'", code);
6869 return;
6871 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6872 break;
6874 case 'C':
6876 /* Print a replicated constant in hex. */
6877 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6879 output_operand_lossage ("invalid operand for '%%%c'", code);
6880 return;
6882 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6883 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6885 break;
6887 case 'D':
6889 /* Print a replicated constant in decimal, treating it as
6890 unsigned. */
6891 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6893 output_operand_lossage ("invalid operand for '%%%c'", code);
6894 return;
6896 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6897 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6899 break;
6901 case 'w':
6902 case 'x':
6903 if (x == const0_rtx
6904 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6906 asm_fprintf (f, "%czr", code);
6907 break;
6910 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6912 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6913 break;
6916 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6918 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6919 break;
6922 /* Fall through */
6924 case 0:
6925 if (x == NULL)
6927 output_operand_lossage ("missing operand");
6928 return;
6931 switch (GET_CODE (x))
6933 case REG:
6934 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6936 if (REG_NREGS (x) == 1)
6937 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6938 else
6940 char suffix
6941 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6942 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6943 REGNO (x) - V0_REGNUM, suffix,
6944 END_REGNO (x) - V0_REGNUM - 1, suffix);
6947 else
6948 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6949 break;
6951 case MEM:
6952 output_address (GET_MODE (x), XEXP (x, 0));
6953 break;
6955 case LABEL_REF:
6956 case SYMBOL_REF:
6957 output_addr_const (asm_out_file, x);
6958 break;
6960 case CONST_INT:
6961 asm_fprintf (f, "%wd", INTVAL (x));
6962 break;
6964 case CONST:
6965 if (!VECTOR_MODE_P (GET_MODE (x)))
6967 output_addr_const (asm_out_file, x);
6968 break;
6970 /* fall through */
6972 case CONST_VECTOR:
6973 if (!const_vec_duplicate_p (x, &elt))
6975 output_operand_lossage ("invalid vector constant");
6976 return;
6979 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6980 asm_fprintf (f, "%wd", INTVAL (elt));
6981 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6982 && aarch64_print_vector_float_operand (f, x, false))
6984 else
6986 output_operand_lossage ("invalid vector constant");
6987 return;
6989 break;
6991 case CONST_DOUBLE:
6992 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6993 be getting CONST_DOUBLEs holding integers. */
6994 gcc_assert (GET_MODE (x) != VOIDmode);
6995 if (aarch64_float_const_zero_rtx_p (x))
6997 fputc ('0', f);
6998 break;
7000 else if (aarch64_float_const_representable_p (x))
7002 #define buf_size 20
7003 char float_buf[buf_size] = {'\0'};
7004 real_to_decimal_for_mode (float_buf,
7005 CONST_DOUBLE_REAL_VALUE (x),
7006 buf_size, buf_size,
7007 1, GET_MODE (x));
7008 asm_fprintf (asm_out_file, "%s", float_buf);
7009 break;
7010 #undef buf_size
7012 output_operand_lossage ("invalid constant");
7013 return;
7014 default:
7015 output_operand_lossage ("invalid operand");
7016 return;
7018 break;
7020 case 'A':
7021 if (GET_CODE (x) == HIGH)
7022 x = XEXP (x, 0);
7024 switch (aarch64_classify_symbolic_expression (x))
7026 case SYMBOL_SMALL_GOT_4G:
7027 asm_fprintf (asm_out_file, ":got:");
7028 break;
7030 case SYMBOL_SMALL_TLSGD:
7031 asm_fprintf (asm_out_file, ":tlsgd:");
7032 break;
7034 case SYMBOL_SMALL_TLSDESC:
7035 asm_fprintf (asm_out_file, ":tlsdesc:");
7036 break;
7038 case SYMBOL_SMALL_TLSIE:
7039 asm_fprintf (asm_out_file, ":gottprel:");
7040 break;
7042 case SYMBOL_TLSLE24:
7043 asm_fprintf (asm_out_file, ":tprel:");
7044 break;
7046 case SYMBOL_TINY_GOT:
7047 gcc_unreachable ();
7048 break;
7050 default:
7051 break;
7053 output_addr_const (asm_out_file, x);
7054 break;
7056 case 'L':
7057 switch (aarch64_classify_symbolic_expression (x))
7059 case SYMBOL_SMALL_GOT_4G:
7060 asm_fprintf (asm_out_file, ":lo12:");
7061 break;
7063 case SYMBOL_SMALL_TLSGD:
7064 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7065 break;
7067 case SYMBOL_SMALL_TLSDESC:
7068 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7069 break;
7071 case SYMBOL_SMALL_TLSIE:
7072 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7073 break;
7075 case SYMBOL_TLSLE12:
7076 asm_fprintf (asm_out_file, ":tprel_lo12:");
7077 break;
7079 case SYMBOL_TLSLE24:
7080 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7081 break;
7083 case SYMBOL_TINY_GOT:
7084 asm_fprintf (asm_out_file, ":got:");
7085 break;
7087 case SYMBOL_TINY_TLSIE:
7088 asm_fprintf (asm_out_file, ":gottprel:");
7089 break;
7091 default:
7092 break;
7094 output_addr_const (asm_out_file, x);
7095 break;
7097 case 'G':
7098 switch (aarch64_classify_symbolic_expression (x))
7100 case SYMBOL_TLSLE24:
7101 asm_fprintf (asm_out_file, ":tprel_hi12:");
7102 break;
7103 default:
7104 break;
7106 output_addr_const (asm_out_file, x);
7107 break;
7109 case 'k':
7111 HOST_WIDE_INT cond_code;
7113 if (!CONST_INT_P (x))
7115 output_operand_lossage ("invalid operand for '%%%c'", code);
7116 return;
7119 cond_code = INTVAL (x);
7120 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7121 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7123 break;
7125 case 'y':
7126 case 'z':
7128 machine_mode mode = GET_MODE (x);
7130 if (GET_CODE (x) != MEM
7131 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7133 output_operand_lossage ("invalid operand for '%%%c'", code);
7134 return;
7137 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7138 code == 'y'
7139 ? ADDR_QUERY_LDP_STP_N
7140 : ADDR_QUERY_LDP_STP))
7141 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7143 break;
7145 default:
7146 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7147 return;
7151 /* Print address 'x' of a memory access with mode 'mode'.
7152 'type' is the aarch64_addr_query_type context required by
7153 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access. */
7154 static bool
7155 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7156 aarch64_addr_query_type type)
7158 struct aarch64_address_info addr;
7159 unsigned int size;
7161 /* Check all addresses are Pmode - including ILP32. */
7162 if (GET_MODE (x) != Pmode)
7163 output_operand_lossage ("invalid address mode");
7165 if (aarch64_classify_address (&addr, x, mode, true, type))
7166 switch (addr.type)
7168 case ADDRESS_REG_IMM:
7169 if (known_eq (addr.const_offset, 0))
7170 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7171 else if (aarch64_sve_data_mode_p (mode))
7173 HOST_WIDE_INT vnum
7174 = exact_div (addr.const_offset,
7175 BYTES_PER_SVE_VECTOR).to_constant ();
7176 asm_fprintf (f, "[%s, #%wd, mul vl]",
7177 reg_names[REGNO (addr.base)], vnum);
7179 else if (aarch64_sve_pred_mode_p (mode))
7181 HOST_WIDE_INT vnum
7182 = exact_div (addr.const_offset,
7183 BYTES_PER_SVE_PRED).to_constant ();
7184 asm_fprintf (f, "[%s, #%wd, mul vl]",
7185 reg_names[REGNO (addr.base)], vnum);
7187 else
7188 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7189 INTVAL (addr.offset));
7190 return true;
7192 case ADDRESS_REG_REG:
7193 if (addr.shift == 0)
7194 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7195 reg_names [REGNO (addr.offset)]);
7196 else
7197 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7198 reg_names [REGNO (addr.offset)], addr.shift);
7199 return true;
7201 case ADDRESS_REG_UXTW:
7202 if (addr.shift == 0)
7203 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7204 REGNO (addr.offset) - R0_REGNUM);
7205 else
7206 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7207 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7208 return true;
7210 case ADDRESS_REG_SXTW:
7211 if (addr.shift == 0)
7212 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7213 REGNO (addr.offset) - R0_REGNUM);
7214 else
7215 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7216 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7217 return true;
7219 case ADDRESS_REG_WB:
7220 /* Writeback is only supported for fixed-width modes. */
7221 size = GET_MODE_SIZE (mode).to_constant ();
7222 switch (GET_CODE (x))
7224 case PRE_INC:
7225 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7226 return true;
7227 case POST_INC:
7228 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7229 return true;
7230 case PRE_DEC:
7231 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7232 return true;
7233 case POST_DEC:
7234 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7235 return true;
7236 case PRE_MODIFY:
7237 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7238 INTVAL (addr.offset));
7239 return true;
7240 case POST_MODIFY:
7241 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7242 INTVAL (addr.offset));
7243 return true;
7244 default:
7245 break;
7247 break;
7249 case ADDRESS_LO_SUM:
7250 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7251 output_addr_const (f, addr.offset);
7252 asm_fprintf (f, "]");
7253 return true;
7255 case ADDRESS_SYMBOLIC:
7256 output_addr_const (f, x);
7257 return true;
7260 return false;
7263 /* Print address 'x' of a memory access with mode 'mode'. */
7264 static void
7265 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7267 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7268 output_addr_const (f, x);
7271 bool
7272 aarch64_label_mentioned_p (rtx x)
7274 const char *fmt;
7275 int i;
7277 if (GET_CODE (x) == LABEL_REF)
7278 return true;
7280 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7281 referencing instruction, but they are constant offsets, not
7282 symbols. */
7283 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7284 return false;
7286 fmt = GET_RTX_FORMAT (GET_CODE (x));
7287 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7289 if (fmt[i] == 'E')
7291 int j;
7293 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7294 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7295 return 1;
7297 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7298 return 1;
7301 return 0;
7304 /* Implement REGNO_REG_CLASS. */
7306 enum reg_class
7307 aarch64_regno_regclass (unsigned regno)
7309 if (GP_REGNUM_P (regno))
7310 return GENERAL_REGS;
7312 if (regno == SP_REGNUM)
7313 return STACK_REG;
7315 if (regno == FRAME_POINTER_REGNUM
7316 || regno == ARG_POINTER_REGNUM)
7317 return POINTER_REGS;
7319 if (FP_REGNUM_P (regno))
7320 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7322 if (PR_REGNUM_P (regno))
7323 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7325 return NO_REGS;
7328 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7329 If OFFSET is out of range, return an offset of an anchor point
7330 that is in range. Return 0 otherwise. */
7332 static HOST_WIDE_INT
7333 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7334 machine_mode mode)
7336 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7337 if (size > 16)
7338 return (offset + 0x400) & ~0x7f0;
7340 /* For offsets that aren't a multiple of the access size, the limit is
7341 -256...255. */
7342 if (offset & (size - 1))
7344 /* BLKmode typically uses LDP of X-registers. */
7345 if (mode == BLKmode)
7346 return (offset + 512) & ~0x3ff;
7347 return (offset + 0x100) & ~0x1ff;
7350 /* Small negative offsets are supported. */
7351 if (IN_RANGE (offset, -256, 0))
7352 return 0;
7354 if (mode == TImode || mode == TFmode)
7355 return (offset + 0x100) & ~0x1ff;
7357 /* Use the 12-bit unsigned offset, scaled by the access size. */
7358 return offset & (~0xfff * size);
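/* Hedged worked example (added for illustration): for a 4-byte SImode
   access at offset 0x5678 the final case applies, giving the anchor
   0x5678 & ~0x3fff = 0x4000; the residual 0x1678 is a multiple of 4 and
   no larger than 0x3ffc, so it fits the unsigned scaled 12-bit range.
   An unaligned offset such as 0x1203 instead takes the -256...255 case
   and anchors at (0x1203 + 0x100) & ~0x1ff = 0x1200.  */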
7361 static rtx
7362 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7364 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7365 where mask is selected by alignment and size of the offset.
7366 We try to pick as large a range for the offset as possible to
7367 maximize the chance of a CSE. However, for aligned addresses
7368 we limit the range to 4k so that structures with different sized
7369 elements are likely to use the same base. We need to be careful
7370 not to split a CONST for some forms of address expression, otherwise
7371 it will generate sub-optimal code. */
7373 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7375 rtx base = XEXP (x, 0);
7376 rtx offset_rtx = XEXP (x, 1);
7377 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7379 if (GET_CODE (base) == PLUS)
7381 rtx op0 = XEXP (base, 0);
7382 rtx op1 = XEXP (base, 1);
7384 /* Force any scaling into a temp for CSE. */
7385 op0 = force_reg (Pmode, op0);
7386 op1 = force_reg (Pmode, op1);
7388 /* Let the pointer register be in op0. */
7389 if (REG_POINTER (op1))
7390 std::swap (op0, op1);
7392 /* If the pointer is virtual or frame related, then we know that
7393 virtual register instantiation or register elimination is going
7394 to apply a second constant. We want the two constants folded
7395 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7396 if (virt_or_elim_regno_p (REGNO (op0)))
7398 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7399 NULL_RTX, true, OPTAB_DIRECT);
7400 return gen_rtx_PLUS (Pmode, base, op1);
7403 /* Otherwise, in order to encourage CSE (and thence loop strength
7404 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7405 base = expand_binop (Pmode, add_optab, op0, op1,
7406 NULL_RTX, true, OPTAB_DIRECT);
7407 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7410 HOST_WIDE_INT size;
7411 if (GET_MODE_SIZE (mode).is_constant (&size))
7413 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7414 mode);
7415 if (base_offset != 0)
7417 base = plus_constant (Pmode, base, base_offset);
7418 base = force_operand (base, NULL_RTX);
7419 return plus_constant (Pmode, base, offset - base_offset);
7424 return x;
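/* Hedged example (added for illustration): legitimizing (plus X1 0x4010)
   for an SImode access anchors at 0x4000 (see aarch64_anchor_offset above),
   so the address is rewritten as a temporary T = X1 + 0x4000 plus the small
   residual 0x10, allowing nearby accesses to CSE the same base T.  */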
7427 static reg_class_t
7428 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7429 reg_class_t rclass,
7430 machine_mode mode,
7431 secondary_reload_info *sri)
7433 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7434 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7435 comment at the head of aarch64-sve.md for more details about the
7436 big-endian handling. */
7437 if (BYTES_BIG_ENDIAN
7438 && reg_class_subset_p (rclass, FP_REGS)
7439 && !((REG_P (x) && HARD_REGISTER_P (x))
7440 || aarch64_simd_valid_immediate (x, NULL))
7441 && aarch64_sve_data_mode_p (mode))
7443 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7444 return NO_REGS;
7447 /* If we have to disable direct literal pool loads and stores because the
7448 function is too big, then we need a scratch register. */
7449 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7450 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7451 || targetm.vector_mode_supported_p (GET_MODE (x)))
7452 && !aarch64_pcrelative_literal_loads)
7454 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
7455 return NO_REGS;
7458 /* Without the TARGET_SIMD instructions we cannot move a Q register
7459 to a Q register directly. We need a scratch. */
7460 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7461 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7462 && reg_class_subset_p (rclass, FP_REGS))
7464 sri->icode = code_for_aarch64_reload_mov (mode);
7465 return NO_REGS;
7468 /* A TFmode or TImode memory access should be handled via FP_REGS
7469 because AArch64 has richer addressing modes for LDR/STR instructions
7470 than for LDP/STP instructions. */
7471 if (TARGET_FLOAT && rclass == GENERAL_REGS
7472 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7473 return FP_REGS;
7475 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7476 return GENERAL_REGS;
7478 return NO_REGS;
7481 static bool
7482 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7484 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7486 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7487 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7488 if (frame_pointer_needed)
7489 return to == HARD_FRAME_POINTER_REGNUM;
7490 return true;
7493 poly_int64
7494 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7496 if (to == HARD_FRAME_POINTER_REGNUM)
7498 if (from == ARG_POINTER_REGNUM)
7499 return cfun->machine->frame.hard_fp_offset;
7501 if (from == FRAME_POINTER_REGNUM)
7502 return cfun->machine->frame.hard_fp_offset
7503 - cfun->machine->frame.locals_offset;
7506 if (to == STACK_POINTER_REGNUM)
7508 if (from == FRAME_POINTER_REGNUM)
7509 return cfun->machine->frame.frame_size
7510 - cfun->machine->frame.locals_offset;
7513 return cfun->machine->frame.frame_size;
7516 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7517 previous frame. */
7520 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7522 if (count != 0)
7523 return const0_rtx;
7524 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7528 static void
7529 aarch64_asm_trampoline_template (FILE *f)
7531 if (TARGET_ILP32)
7533 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7534 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7536 else
7538 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7539 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7541 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7542 assemble_aligned_integer (4, const0_rtx);
7543 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7544 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7547 static void
7548 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7550 rtx fnaddr, mem, a_tramp;
7551 const int tramp_code_sz = 16;
7553 /* Don't need to copy the trailing D-words; we fill those in below. */
7554 emit_block_move (m_tramp, assemble_trampoline_template (),
7555 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7556 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7557 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7558 if (GET_MODE (fnaddr) != ptr_mode)
7559 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7560 emit_move_insn (mem, fnaddr);
7562 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7563 emit_move_insn (mem, chain_value);
7565 /* XXX We should really define a "clear_cache" pattern and use
7566 gen_clear_cache(). */
7567 a_tramp = XEXP (m_tramp, 0);
7568 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7569 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7570 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7571 ptr_mode);
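/* For illustration only: assuming the default register choices (IP1 is
   x17 and the static chain register is x18), the LP64 trampoline built
   from the template above and filled in by aarch64_trampoline_init is
   laid out roughly as:

	offset  0:  ldr   x17, .+16	// load target address from offset 16
	offset  4:  ldr   x18, .+20	// load static chain from offset 24
	offset  8:  br    x17
	offset 12:  .word 0		// padding, keeps the code block at 16 bytes
	offset 16:  <function address>	// stored by aarch64_trampoline_init
	offset 24:  <static chain>	// stored by aarch64_trampoline_init

   The ILP32 variant is analogous but uses 4-byte pointer slots, which is
   why both of its loads share the same ".+16" displacement.  */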
7574 static unsigned char
7575 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7577 /* ??? Logically we should only need to provide a value when
7578 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7579 can hold MODE, but at the moment we need to handle all modes.
7580 Just ignore any runtime parts for registers that can't store them. */
7581 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7582 unsigned int nregs;
7583 switch (regclass)
7585 case TAILCALL_ADDR_REGS:
7586 case POINTER_REGS:
7587 case GENERAL_REGS:
7588 case ALL_REGS:
7589 case POINTER_AND_FP_REGS:
7590 case FP_REGS:
7591 case FP_LO_REGS:
7592 if (aarch64_sve_data_mode_p (mode)
7593 && constant_multiple_p (GET_MODE_SIZE (mode),
7594 BYTES_PER_SVE_VECTOR, &nregs))
7595 return nregs;
7596 return (aarch64_vector_data_mode_p (mode)
7597 ? CEIL (lowest_size, UNITS_PER_VREG)
7598 : CEIL (lowest_size, UNITS_PER_WORD));
7599 case STACK_REG:
7600 case PR_REGS:
7601 case PR_LO_REGS:
7602 case PR_HI_REGS:
7603 return 1;
7605 case NO_REGS:
7606 return 0;
7608 default:
7609 break;
7611 gcc_unreachable ();
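/* Worked examples for the function above, assuming the usual LP64
   parameters (UNITS_PER_WORD == 8, UNITS_PER_VREG == 16):
     - TImode in GENERAL_REGS:   CEIL (16, 8)  == 2 registers;
     - V4SImode in FP_REGS:      CEIL (16, 16) == 1 register;
     - any mode in PR_REGS etc.: always 1.
   SVE data modes instead return the constant multiple of
   BYTES_PER_SVE_VECTOR, independently of the runtime vector length.  */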
7614 static reg_class_t
7615 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7617 if (regclass == POINTER_REGS)
7618 return GENERAL_REGS;
7620 if (regclass == STACK_REG)
7622 if (REG_P(x)
7623 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7624 return regclass;
7626 return NO_REGS;
7629 /* Register elimination can result in a request for
7630 SP+constant->FP_REGS. We cannot support such operations, which
7631 use SP as the source and an FP_REG as the destination, so reject
7632 them outright. */
7633 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7635 rtx lhs = XEXP (x, 0);
7637 /* Look through a possible SUBREG introduced by ILP32. */
7638 if (GET_CODE (lhs) == SUBREG)
7639 lhs = SUBREG_REG (lhs);
7641 gcc_assert (REG_P (lhs));
7642 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7643 POINTER_REGS));
7644 return NO_REGS;
7647 return regclass;
7650 void
7651 aarch64_asm_output_labelref (FILE* f, const char *name)
7653 asm_fprintf (f, "%U%s", name);
7656 static void
7657 aarch64_elf_asm_constructor (rtx symbol, int priority)
7659 if (priority == DEFAULT_INIT_PRIORITY)
7660 default_ctor_section_asm_out_constructor (symbol, priority);
7661 else
7663 section *s;
7664 /* Although priority is known to be in the range [0, 65535], meaning
7665 18 bytes would be enough, the compiler might not know that. To avoid
7666 a -Wformat-truncation false positive, use a larger size. */
7667 char buf[23];
7668 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7669 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7670 switch_to_section (s);
7671 assemble_align (POINTER_SIZE);
7672 assemble_aligned_integer (POINTER_BYTES, symbol);
7676 static void
7677 aarch64_elf_asm_destructor (rtx symbol, int priority)
7679 if (priority == DEFAULT_INIT_PRIORITY)
7680 default_dtor_section_asm_out_destructor (symbol, priority);
7681 else
7683 section *s;
7684 /* Although priority is known to be in the range [0, 65535], meaning
7685 18 bytes would be enough, the compiler might not know that. To avoid
7686 a -Wformat-truncation false positive, use a larger size. */
7687 char buf[23];
7688 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7689 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7690 switch_to_section (s);
7691 assemble_align (POINTER_SIZE);
7692 assemble_aligned_integer (POINTER_BYTES, symbol);
7696 const char*
7697 aarch64_output_casesi (rtx *operands)
7699 char buf[100];
7700 char label[100];
7701 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7702 int index;
7703 static const char *const patterns[4][2] =
7706 "ldrb\t%w3, [%0,%w1,uxtw]",
7707 "add\t%3, %4, %w3, sxtb #2"
7710 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7711 "add\t%3, %4, %w3, sxth #2"
7714 "ldr\t%w3, [%0,%w1,uxtw #2]",
7715 "add\t%3, %4, %w3, sxtw #2"
7717 /* We assume that DImode is only generated when not optimizing and
7718 that we don't really need 64-bit address offsets. That would
7719 imply an object file with 8GB of code in a single function! */
7721 "ldr\t%w3, [%0,%w1,uxtw #2]",
7722 "add\t%3, %4, %w3, sxtw #2"
7726 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7728 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7729 index = exact_log2 (GET_MODE_SIZE (mode));
7731 gcc_assert (index >= 0 && index <= 3);
7733 /* Need to implement table size reduction by changing the code below. */
7734 output_asm_insn (patterns[index][0], operands);
7735 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7736 snprintf (buf, sizeof (buf),
7737 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7738 output_asm_insn (buf, operands);
7739 output_asm_insn (patterns[index][1], operands);
7740 output_asm_insn ("br\t%3", operands);
7741 assemble_label (asm_out_file, label);
7742 return "";
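/* Purely as an illustration of the output above: for a HImode dispatch
   table the emitted sequence is along the lines of

	ldrh	w3, [x0, w1, uxtw #1]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxth #2
	br	x3

   i.e. load the scaled table entry, materialise the address of the
   label emitted just after the BR (the start of the jump table), add
   the sign-extended entry shifted left by 2, and branch there.  */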
7746 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7747 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7748 operator. */
7751 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7753 if (shift >= 0 && shift <= 3)
7755 int size;
7756 for (size = 8; size <= 32; size *= 2)
7758 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7759 if (mask == bits << shift)
7760 return size;
7763 return 0;
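/* Two concrete inputs for the check above:
     aarch64_uxt_size (1, 0x1fe)   == 8   (0xff   << 1, a UXTB-style operand)
     aarch64_uxt_size (2, 0x3fffc) == 16  (0xffff << 2, a UXTH-style operand)
   while any SHIFT outside 0..3, or a mask that is not a contiguous
   8/16/32-bit field starting at SHIFT, yields 0.  */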
7766 /* Constant pools are per-function only when PC-relative literal
7767 loads are enabled or we are in the large memory
7768 model. */
7770 static inline bool
7771 aarch64_can_use_per_function_literal_pools_p (void)
7773 return (aarch64_pcrelative_literal_loads
7774 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7777 static bool
7778 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7780 /* We can't use blocks for constants when we're using a per-function
7781 constant pool. */
7782 return !aarch64_can_use_per_function_literal_pools_p ();
7785 /* Select appropriate section for constants depending
7786 on where we place literal pools. */
7788 static section *
7789 aarch64_select_rtx_section (machine_mode mode,
7790 rtx x,
7791 unsigned HOST_WIDE_INT align)
7793 if (aarch64_can_use_per_function_literal_pools_p ())
7794 return function_section (current_function_decl);
7796 return default_elf_select_rtx_section (mode, x, align);
7799 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7800 void
7801 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7802 HOST_WIDE_INT offset)
7804 /* When using per-function literal pools, we must ensure that any code
7805 section is aligned to the minimal instruction length, lest we get
7806 errors from the assembler re "unaligned instructions". */
7807 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7808 ASM_OUTPUT_ALIGN (f, 2);
7811 /* Costs. */
7813 /* Helper function for rtx cost calculation. Strip a shift expression
7814 from X. Returns the inner operand if successful, or the original
7815 expression on failure. */
7816 static rtx
7817 aarch64_strip_shift (rtx x)
7819 rtx op = x;
7821 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7822 we can convert both to ROR during final output. */
7823 if ((GET_CODE (op) == ASHIFT
7824 || GET_CODE (op) == ASHIFTRT
7825 || GET_CODE (op) == LSHIFTRT
7826 || GET_CODE (op) == ROTATERT
7827 || GET_CODE (op) == ROTATE)
7828 && CONST_INT_P (XEXP (op, 1)))
7829 return XEXP (op, 0);
7831 if (GET_CODE (op) == MULT
7832 && CONST_INT_P (XEXP (op, 1))
7833 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7834 return XEXP (op, 0);
7836 return x;
7839 /* Helper function for rtx cost calculation. Strip an extend
7840 expression from X. Returns the inner operand if successful, or the
7841 original expression on failure. We deal with a number of possible
7842 canonicalization variations here. If STRIP_SHIFT is true, then
7843 we can strip off a shift also. */
7844 static rtx
7845 aarch64_strip_extend (rtx x, bool strip_shift)
7847 scalar_int_mode mode;
7848 rtx op = x;
7850 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7851 return op;
7853 /* Zero and sign extraction of a widened value. */
7854 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7855 && XEXP (op, 2) == const0_rtx
7856 && GET_CODE (XEXP (op, 0)) == MULT
7857 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7858 XEXP (op, 1)))
7859 return XEXP (XEXP (op, 0), 0);
7861 /* It can also be represented (for zero-extend) as an AND with an
7862 immediate. */
7863 if (GET_CODE (op) == AND
7864 && GET_CODE (XEXP (op, 0)) == MULT
7865 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7866 && CONST_INT_P (XEXP (op, 1))
7867 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7868 INTVAL (XEXP (op, 1))) != 0)
7869 return XEXP (XEXP (op, 0), 0);
7871 /* Now handle extended register, as this may also have an optional
7872 left shift by 1..4. */
7873 if (strip_shift
7874 && GET_CODE (op) == ASHIFT
7875 && CONST_INT_P (XEXP (op, 1))
7876 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7877 op = XEXP (op, 0);
7879 if (GET_CODE (op) == ZERO_EXTEND
7880 || GET_CODE (op) == SIGN_EXTEND)
7881 op = XEXP (op, 0);
7883 if (op != x)
7884 return op;
7886 return x;
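/* As a small example of the stripping above: with STRIP_SHIFT true, an
   operand such as

	(ashift:DI (zero_extend:DI (reg:SI x1)) (const_int 2))

   reduces to the inner SImode register, since both the shift by 1..4
   and the zero_extend can be folded into the extended-register form of
   the enclosing arithmetic instruction.  */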
7889 /* Return true iff CODE is a shift supported in combination
7890 with arithmetic instructions. */
7892 static bool
7893 aarch64_shift_p (enum rtx_code code)
7895 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7899 /* Return true iff X is a cheap shift without a sign extend. */
7901 static bool
7902 aarch64_cheap_mult_shift_p (rtx x)
7904 rtx op0, op1;
7906 op0 = XEXP (x, 0);
7907 op1 = XEXP (x, 1);
7909 if (!(aarch64_tune_params.extra_tuning_flags
7910 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7911 return false;
7913 if (GET_CODE (op0) == SIGN_EXTEND)
7914 return false;
7916 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7917 && UINTVAL (op1) <= 4)
7918 return true;
7920 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7921 return false;
7923 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7925 if (l2 > 0 && l2 <= 4)
7926 return true;
7928 return false;
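/* For example, when AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND is set:
     (ashift (reg) (const_int 3))               -> true  (shift amount <= 4)
     (mult   (reg) (const_int 8))               -> true  (power of two, log2 == 3)
     (mult   (sign_extend (reg)) (const_int 8)) -> false (sign extension present)
   so only small-immediate shifts without a sign extension are treated
   as free when fused with an arithmetic instruction.  */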
7931 /* Helper function for rtx cost calculation. Calculate the cost of
7932 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7933 Return the calculated cost of the expression, recursing manually in to
7934 operands where needed. */
7936 static int
7937 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7939 rtx op0, op1;
7940 const struct cpu_cost_table *extra_cost
7941 = aarch64_tune_params.insn_extra_cost;
7942 int cost = 0;
7943 bool compound_p = (outer == PLUS || outer == MINUS);
7944 machine_mode mode = GET_MODE (x);
7946 gcc_checking_assert (code == MULT);
7948 op0 = XEXP (x, 0);
7949 op1 = XEXP (x, 1);
7951 if (VECTOR_MODE_P (mode))
7952 mode = GET_MODE_INNER (mode);
7954 /* Integer multiply/fma. */
7955 if (GET_MODE_CLASS (mode) == MODE_INT)
7957 /* The multiply will be canonicalized as a shift, so cost it as such. */
7958 if (aarch64_shift_p (GET_CODE (x))
7959 || (CONST_INT_P (op1)
7960 && exact_log2 (INTVAL (op1)) > 0))
7962 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7963 || GET_CODE (op0) == SIGN_EXTEND;
7964 if (speed)
7966 if (compound_p)
7968 /* If the shift is considered cheap,
7969 then don't add any cost. */
7970 if (aarch64_cheap_mult_shift_p (x))
7972 else if (REG_P (op1))
7973 /* ARITH + shift-by-register. */
7974 cost += extra_cost->alu.arith_shift_reg;
7975 else if (is_extend)
7976 /* ARITH + extended register. We don't have a cost field
7977 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7978 cost += extra_cost->alu.extend_arith;
7979 else
7980 /* ARITH + shift-by-immediate. */
7981 cost += extra_cost->alu.arith_shift;
7983 else
7984 /* LSL (immediate). */
7985 cost += extra_cost->alu.shift;
7988 /* Strip extends as we will have costed them in the case above. */
7989 if (is_extend)
7990 op0 = aarch64_strip_extend (op0, true);
7992 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7994 return cost;
7997 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7998 compound and let the below cases handle it. After all, MNEG is a
7999 special-case alias of MSUB. */
8000 if (GET_CODE (op0) == NEG)
8002 op0 = XEXP (op0, 0);
8003 compound_p = true;
8006 /* Integer multiplies or FMAs have zero/sign extending variants. */
8007 if ((GET_CODE (op0) == ZERO_EXTEND
8008 && GET_CODE (op1) == ZERO_EXTEND)
8009 || (GET_CODE (op0) == SIGN_EXTEND
8010 && GET_CODE (op1) == SIGN_EXTEND))
8012 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8013 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8015 if (speed)
8017 if (compound_p)
8018 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8019 cost += extra_cost->mult[0].extend_add;
8020 else
8021 /* MUL/SMULL/UMULL. */
8022 cost += extra_cost->mult[0].extend;
8025 return cost;
8028 /* This is either an integer multiply or a MADD. In both cases
8029 we want to recurse and cost the operands. */
8030 cost += rtx_cost (op0, mode, MULT, 0, speed);
8031 cost += rtx_cost (op1, mode, MULT, 1, speed);
8033 if (speed)
8035 if (compound_p)
8036 /* MADD/MSUB. */
8037 cost += extra_cost->mult[mode == DImode].add;
8038 else
8039 /* MUL. */
8040 cost += extra_cost->mult[mode == DImode].simple;
8043 return cost;
8045 else
8047 if (speed)
8049 /* Floating-point FMA/FMUL can also support negations of the
8050 operands, unless the rounding mode is upward or downward in
8051 which case FNMUL is different from FMUL with operand negation. */
8052 bool neg0 = GET_CODE (op0) == NEG;
8053 bool neg1 = GET_CODE (op1) == NEG;
8054 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8056 if (neg0)
8057 op0 = XEXP (op0, 0);
8058 if (neg1)
8059 op1 = XEXP (op1, 0);
8062 if (compound_p)
8063 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8064 cost += extra_cost->fp[mode == DFmode].fma;
8065 else
8066 /* FMUL/FNMUL. */
8067 cost += extra_cost->fp[mode == DFmode].mult;
8070 cost += rtx_cost (op0, mode, MULT, 0, speed);
8071 cost += rtx_cost (op1, mode, MULT, 1, speed);
8072 return cost;
8076 static int
8077 aarch64_address_cost (rtx x,
8078 machine_mode mode,
8079 addr_space_t as ATTRIBUTE_UNUSED,
8080 bool speed)
8082 enum rtx_code c = GET_CODE (x);
8083 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8084 struct aarch64_address_info info;
8085 int cost = 0;
8086 info.shift = 0;
8088 if (!aarch64_classify_address (&info, x, mode, false))
8090 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8092 /* This is a CONST or SYMBOL ref which will be split
8093 in a different way depending on the code model in use.
8094 Cost it through the generic infrastructure. */
8095 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8096 /* Divide through by the cost of one instruction to
8097 bring it to the same units as the address costs. */
8098 cost_symbol_ref /= COSTS_N_INSNS (1);
8099 /* The cost is then the cost of preparing the address,
8100 followed by an immediate (possibly 0) offset. */
8101 return cost_symbol_ref + addr_cost->imm_offset;
8103 else
8105 /* This is most likely a jump table from a case
8106 statement. */
8107 return addr_cost->register_offset;
8111 switch (info.type)
8113 case ADDRESS_LO_SUM:
8114 case ADDRESS_SYMBOLIC:
8115 case ADDRESS_REG_IMM:
8116 cost += addr_cost->imm_offset;
8117 break;
8119 case ADDRESS_REG_WB:
8120 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8121 cost += addr_cost->pre_modify;
8122 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8123 cost += addr_cost->post_modify;
8124 else
8125 gcc_unreachable ();
8127 break;
8129 case ADDRESS_REG_REG:
8130 cost += addr_cost->register_offset;
8131 break;
8133 case ADDRESS_REG_SXTW:
8134 cost += addr_cost->register_sextend;
8135 break;
8137 case ADDRESS_REG_UXTW:
8138 cost += addr_cost->register_zextend;
8139 break;
8141 default:
8142 gcc_unreachable ();
8146 if (info.shift > 0)
8148 /* For the sake of calculating the cost of the shifted register
8149 component, we can treat same sized modes in the same way. */
8150 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8151 cost += addr_cost->addr_scale_costs.hi;
8152 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8153 cost += addr_cost->addr_scale_costs.si;
8154 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8155 cost += addr_cost->addr_scale_costs.di;
8156 else
8157 /* We can't tell, or this is a 128-bit vector. */
8158 cost += addr_cost->addr_scale_costs.ti;
8161 return cost;
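/* For instance, an SImode access through an address of the form
   [x0, w1, sxtw #2] is classified as ADDRESS_REG_SXTW with a shift, so
   it is costed as register_sextend + addr_scale_costs.si, while a plain
   [x0, #imm] access only pays imm_offset.  The absolute values depend
   entirely on the current tuning's addr_cost table.  */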
8164 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8165 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8166 to be taken. */
8169 aarch64_branch_cost (bool speed_p, bool predictable_p)
8171 /* When optimizing for speed, use the cost of unpredictable branches. */
8172 const struct cpu_branch_cost *branch_costs =
8173 aarch64_tune_params.branch_costs;
8175 if (!speed_p || predictable_p)
8176 return branch_costs->predictable;
8177 else
8178 return branch_costs->unpredictable;
8181 /* Return true if the RTX X in mode MODE is a zero or sign extract
8182 usable in an ADD or SUB (extended register) instruction. */
8183 static bool
8184 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8186 /* Catch add with a sign extract.
8187 This is add_<optab><mode>_multp2. */
8188 if (GET_CODE (x) == SIGN_EXTRACT
8189 || GET_CODE (x) == ZERO_EXTRACT)
8191 rtx op0 = XEXP (x, 0);
8192 rtx op1 = XEXP (x, 1);
8193 rtx op2 = XEXP (x, 2);
8195 if (GET_CODE (op0) == MULT
8196 && CONST_INT_P (op1)
8197 && op2 == const0_rtx
8198 && CONST_INT_P (XEXP (op0, 1))
8199 && aarch64_is_extend_from_extract (mode,
8200 XEXP (op0, 1),
8201 op1))
8203 return true;
8206 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8207 No shift. */
8208 else if (GET_CODE (x) == SIGN_EXTEND
8209 || GET_CODE (x) == ZERO_EXTEND)
8210 return REG_P (XEXP (x, 0));
8212 return false;
8215 static bool
8216 aarch64_frint_unspec_p (unsigned int u)
8218 switch (u)
8220 case UNSPEC_FRINTZ:
8221 case UNSPEC_FRINTP:
8222 case UNSPEC_FRINTM:
8223 case UNSPEC_FRINTA:
8224 case UNSPEC_FRINTN:
8225 case UNSPEC_FRINTX:
8226 case UNSPEC_FRINTI:
8227 return true;
8229 default:
8230 return false;
8234 /* Return true iff X is an rtx that will match an extr instruction
8235 i.e. as described in the *extr<mode>5_insn family of patterns.
8236 OP0 and OP1 will be set to the operands of the shifts involved
8237 on success and will be NULL_RTX otherwise. */
8239 static bool
8240 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8242 rtx op0, op1;
8243 scalar_int_mode mode;
8244 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8245 return false;
8247 *res_op0 = NULL_RTX;
8248 *res_op1 = NULL_RTX;
8250 if (GET_CODE (x) != IOR)
8251 return false;
8253 op0 = XEXP (x, 0);
8254 op1 = XEXP (x, 1);
8256 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8257 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8259 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8260 if (GET_CODE (op1) == ASHIFT)
8261 std::swap (op0, op1);
8263 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8264 return false;
8266 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8267 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8269 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8270 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8272 *res_op0 = XEXP (op0, 0);
8273 *res_op1 = XEXP (op1, 0);
8274 return true;
8278 return false;
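/* An example of an rtx accepted above, in DImode:

	(ior:DI (ashift:DI (reg:DI x0) (const_int 48))
		(lshiftrt:DI (reg:DI x1) (const_int 16)))

   The shift amounts sum to 64, so *RES_OP0 and *RES_OP1 are set to the
   two inner registers and the whole expression can be emitted as a
   single EXTR (or ROR when both sources are the same register).  */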
8281 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8282 storing it in *COST. Result is true if the total cost of the operation
8283 has now been calculated. */
8284 static bool
8285 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8287 rtx inner;
8288 rtx comparator;
8289 enum rtx_code cmpcode;
8291 if (COMPARISON_P (op0))
8293 inner = XEXP (op0, 0);
8294 comparator = XEXP (op0, 1);
8295 cmpcode = GET_CODE (op0);
8297 else
8299 inner = op0;
8300 comparator = const0_rtx;
8301 cmpcode = NE;
8304 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8306 /* Conditional branch. */
8307 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8308 return true;
8309 else
8311 if (cmpcode == NE || cmpcode == EQ)
8313 if (comparator == const0_rtx)
8315 /* TBZ/TBNZ/CBZ/CBNZ. */
8316 if (GET_CODE (inner) == ZERO_EXTRACT)
8317 /* TBZ/TBNZ. */
8318 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8319 ZERO_EXTRACT, 0, speed);
8320 else
8321 /* CBZ/CBNZ. */
8322 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8324 return true;
8327 else if (cmpcode == LT || cmpcode == GE)
8329 /* TBZ/TBNZ. */
8330 if (comparator == const0_rtx)
8331 return true;
8335 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8337 /* CCMP. */
8338 if (GET_CODE (op1) == COMPARE)
8340 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8341 if (XEXP (op1, 1) == const0_rtx)
8342 *cost += 1;
8343 if (speed)
8345 machine_mode mode = GET_MODE (XEXP (op1, 0));
8346 const struct cpu_cost_table *extra_cost
8347 = aarch64_tune_params.insn_extra_cost;
8349 if (GET_MODE_CLASS (mode) == MODE_INT)
8350 *cost += extra_cost->alu.arith;
8351 else
8352 *cost += extra_cost->fp[mode == DFmode].compare;
8354 return true;
8357 /* It's a conditional operation based on the status flags,
8358 so it must be some flavor of CSEL. */
8360 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8361 if (GET_CODE (op1) == NEG
8362 || GET_CODE (op1) == NOT
8363 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8364 op1 = XEXP (op1, 0);
8365 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8367 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8368 op1 = XEXP (op1, 0);
8369 op2 = XEXP (op2, 0);
8372 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8373 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8374 return true;
8377 /* We don't know what this is, cost all operands. */
8378 return false;
8381 /* Check whether X is a bitfield operation of the form shift + extend that
8382 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8383 operand to which the bitfield operation is applied. Otherwise return
8384 NULL_RTX. */
8386 static rtx
8387 aarch64_extend_bitfield_pattern_p (rtx x)
8389 rtx_code outer_code = GET_CODE (x);
8390 machine_mode outer_mode = GET_MODE (x);
8392 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8393 && outer_mode != SImode && outer_mode != DImode)
8394 return NULL_RTX;
8396 rtx inner = XEXP (x, 0);
8397 rtx_code inner_code = GET_CODE (inner);
8398 machine_mode inner_mode = GET_MODE (inner);
8399 rtx op = NULL_RTX;
8401 switch (inner_code)
8403 case ASHIFT:
8404 if (CONST_INT_P (XEXP (inner, 1))
8405 && (inner_mode == QImode || inner_mode == HImode))
8406 op = XEXP (inner, 0);
8407 break;
8408 case LSHIFTRT:
8409 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8410 && (inner_mode == QImode || inner_mode == HImode))
8411 op = XEXP (inner, 0);
8412 break;
8413 case ASHIFTRT:
8414 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8415 && (inner_mode == QImode || inner_mode == HImode))
8416 op = XEXP (inner, 0);
8417 break;
8418 default:
8419 break;
8422 return op;
8425 /* Return true if the mask and a shift amount from an RTX of the form
8426 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8427 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8429 bool
8430 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8431 rtx shft_amnt)
8433 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8434 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8435 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8436 && (INTVAL (mask) & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
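/* A concrete SImode instance of the test above: MASK 0x00ffff00 with
   SHFT_AMNT 8 is accepted, because (0x00ffff00 >> 8) + 1 is a power of
   two and no mask bit lies below the shift amount, so the combination
   maps onto a single UBFIZ.  MASK 0x00ffff01 with the same shift fails
   the low-bits check and is rejected.  */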
8439 /* Calculate the cost of calculating X, storing it in *COST. Result
8440 is true if the total cost of the operation has now been calculated. */
8441 static bool
8442 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8443 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8445 rtx op0, op1, op2;
8446 const struct cpu_cost_table *extra_cost
8447 = aarch64_tune_params.insn_extra_cost;
8448 int code = GET_CODE (x);
8449 scalar_int_mode int_mode;
8451 /* By default, assume that everything has equivalent cost to the
8452 cheapest instruction. Any additional costs are applied as a delta
8453 above this default. */
8454 *cost = COSTS_N_INSNS (1);
8456 switch (code)
8458 case SET:
8459 /* The cost depends entirely on the operands to SET. */
8460 *cost = 0;
8461 op0 = SET_DEST (x);
8462 op1 = SET_SRC (x);
8464 switch (GET_CODE (op0))
8466 case MEM:
8467 if (speed)
8469 rtx address = XEXP (op0, 0);
8470 if (VECTOR_MODE_P (mode))
8471 *cost += extra_cost->ldst.storev;
8472 else if (GET_MODE_CLASS (mode) == MODE_INT)
8473 *cost += extra_cost->ldst.store;
8474 else if (mode == SFmode)
8475 *cost += extra_cost->ldst.storef;
8476 else if (mode == DFmode)
8477 *cost += extra_cost->ldst.stored;
8479 *cost +=
8480 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8481 0, speed));
8484 *cost += rtx_cost (op1, mode, SET, 1, speed);
8485 return true;
8487 case SUBREG:
8488 if (! REG_P (SUBREG_REG (op0)))
8489 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8491 /* Fall through. */
8492 case REG:
8493 /* The cost is one per vector-register copied. */
8494 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8496 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8497 *cost = COSTS_N_INSNS (nregs);
8499 /* const0_rtx is in general free, but we will use an
8500 instruction to set a register to 0. */
8501 else if (REG_P (op1) || op1 == const0_rtx)
8503 /* The cost is 1 per register copied. */
8504 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8505 *cost = COSTS_N_INSNS (nregs);
8507 else
8508 /* Cost is just the cost of the RHS of the set. */
8509 *cost += rtx_cost (op1, mode, SET, 1, speed);
8510 return true;
8512 case ZERO_EXTRACT:
8513 case SIGN_EXTRACT:
8514 /* Bit-field insertion. Strip any redundant widening of
8515 the RHS to meet the width of the target. */
8516 if (GET_CODE (op1) == SUBREG)
8517 op1 = SUBREG_REG (op1);
8518 if ((GET_CODE (op1) == ZERO_EXTEND
8519 || GET_CODE (op1) == SIGN_EXTEND)
8520 && CONST_INT_P (XEXP (op0, 1))
8521 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8522 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8523 op1 = XEXP (op1, 0);
8525 if (CONST_INT_P (op1))
8527 /* MOV immediate is assumed to always be cheap. */
8528 *cost = COSTS_N_INSNS (1);
8530 else
8532 /* BFM. */
8533 if (speed)
8534 *cost += extra_cost->alu.bfi;
8535 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8538 return true;
8540 default:
8541 /* We can't make sense of this, assume default cost. */
8542 *cost = COSTS_N_INSNS (1);
8543 return false;
8545 return false;
8547 case CONST_INT:
8548 /* If an instruction can incorporate a constant within the
8549 instruction, the instruction's expression avoids calling
8550 rtx_cost() on the constant. If rtx_cost() is called on a
8551 constant, then it is usually because the constant must be
8552 moved into a register by one or more instructions.
8554 The exception is constant 0, which can be expressed
8555 as XZR/WZR and is therefore free. The exception to this is
8556 if we have (set (reg) (const0_rtx)) in which case we must cost
8557 the move. However, we can catch that when we cost the SET, so
8558 we don't need to consider that here. */
8559 if (x == const0_rtx)
8560 *cost = 0;
8561 else
8563 /* To an approximation, building any other constant is
8564 proportionally expensive to the number of instructions
8565 required to build that constant. This is true whether we
8566 are compiling for SPEED or otherwise. */
8567 if (!is_a <scalar_int_mode> (mode, &int_mode))
8568 int_mode = word_mode;
8569 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8570 (NULL_RTX, x, false, int_mode));
8572 return true;
8574 case CONST_DOUBLE:
8576 /* First determine number of instructions to do the move
8577 as an integer constant. */
8578 if (!aarch64_float_const_representable_p (x)
8579 && !aarch64_can_const_movi_rtx_p (x, mode)
8580 && aarch64_float_const_rtx_p (x))
8582 unsigned HOST_WIDE_INT ival;
8583 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8584 gcc_assert (succeed);
8586 scalar_int_mode imode = (mode == HFmode
8587 ? SImode
8588 : int_mode_for_mode (mode).require ());
8589 int ncost = aarch64_internal_mov_immediate
8590 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8591 *cost += COSTS_N_INSNS (ncost);
8592 return true;
8595 if (speed)
8597 /* mov[df,sf]_aarch64. */
8598 if (aarch64_float_const_representable_p (x))
8599 /* FMOV (scalar immediate). */
8600 *cost += extra_cost->fp[mode == DFmode].fpconst;
8601 else if (!aarch64_float_const_zero_rtx_p (x))
8603 /* This will be a load from memory. */
8604 if (mode == DFmode)
8605 *cost += extra_cost->ldst.loadd;
8606 else
8607 *cost += extra_cost->ldst.loadf;
8609 else
8610 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8611 or MOV v0.s[0], wzr - neither of which is modeled by the
8612 cost tables. Just use the default cost. */
8617 return true;
8619 case MEM:
8620 if (speed)
8622 /* For loads we want the base cost of a load, plus an
8623 approximation for the additional cost of the addressing
8624 mode. */
8625 rtx address = XEXP (x, 0);
8626 if (VECTOR_MODE_P (mode))
8627 *cost += extra_cost->ldst.loadv;
8628 else if (GET_MODE_CLASS (mode) == MODE_INT)
8629 *cost += extra_cost->ldst.load;
8630 else if (mode == SFmode)
8631 *cost += extra_cost->ldst.loadf;
8632 else if (mode == DFmode)
8633 *cost += extra_cost->ldst.loadd;
8635 *cost +=
8636 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8637 0, speed));
8640 return true;
8642 case NEG:
8643 op0 = XEXP (x, 0);
8645 if (VECTOR_MODE_P (mode))
8647 if (speed)
8649 /* FNEG. */
8650 *cost += extra_cost->vect.alu;
8652 return false;
8655 if (GET_MODE_CLASS (mode) == MODE_INT)
8657 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8658 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8660 /* CSETM. */
8661 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8662 return true;
8665 /* Cost this as SUB wzr, X. */
8666 op0 = CONST0_RTX (mode);
8667 op1 = XEXP (x, 0);
8668 goto cost_minus;
8671 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8673 /* Support (neg(fma...)) as a single instruction only if
8674 sign of zeros is unimportant. This matches the decision
8675 making in aarch64.md. */
8676 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8678 /* FNMADD. */
8679 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8680 return true;
8682 if (GET_CODE (op0) == MULT)
8684 /* FNMUL. */
8685 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8686 return true;
8688 if (speed)
8689 /* FNEG. */
8690 *cost += extra_cost->fp[mode == DFmode].neg;
8691 return false;
8694 return false;
8696 case CLRSB:
8697 case CLZ:
8698 if (speed)
8700 if (VECTOR_MODE_P (mode))
8701 *cost += extra_cost->vect.alu;
8702 else
8703 *cost += extra_cost->alu.clz;
8706 return false;
8708 case COMPARE:
8709 op0 = XEXP (x, 0);
8710 op1 = XEXP (x, 1);
8712 if (op1 == const0_rtx
8713 && GET_CODE (op0) == AND)
8715 x = op0;
8716 mode = GET_MODE (op0);
8717 goto cost_logic;
8720 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8722 /* TODO: A write to the CC flags possibly costs extra, this
8723 needs encoding in the cost tables. */
8725 mode = GET_MODE (op0);
8726 /* ANDS. */
8727 if (GET_CODE (op0) == AND)
8729 x = op0;
8730 goto cost_logic;
8733 if (GET_CODE (op0) == PLUS)
8735 /* ADDS (and CMN alias). */
8736 x = op0;
8737 goto cost_plus;
8740 if (GET_CODE (op0) == MINUS)
8742 /* SUBS. */
8743 x = op0;
8744 goto cost_minus;
8747 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8748 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8749 && CONST_INT_P (XEXP (op0, 2)))
8751 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8752 Handle it here directly rather than going to cost_logic
8753 since we know the immediate generated for the TST is valid
8754 so we can avoid creating an intermediate rtx for it only
8755 for costing purposes. */
8756 if (speed)
8757 *cost += extra_cost->alu.logical;
8759 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8760 ZERO_EXTRACT, 0, speed);
8761 return true;
8764 if (GET_CODE (op1) == NEG)
8766 /* CMN. */
8767 if (speed)
8768 *cost += extra_cost->alu.arith;
8770 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8771 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8772 return true;
8775 /* CMP.
8777 Compare can freely swap the order of operands, and
8778 canonicalization puts the more complex operation first.
8779 But the integer MINUS logic expects the shift/extend
8780 operation in op1. */
8781 if (! (REG_P (op0)
8782 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8784 op0 = XEXP (x, 1);
8785 op1 = XEXP (x, 0);
8787 goto cost_minus;
8790 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8792 /* FCMP. */
8793 if (speed)
8794 *cost += extra_cost->fp[mode == DFmode].compare;
8796 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8798 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8799 /* FCMP supports constant 0.0 for no extra cost. */
8800 return true;
8802 return false;
8805 if (VECTOR_MODE_P (mode))
8807 /* Vector compare. */
8808 if (speed)
8809 *cost += extra_cost->vect.alu;
8811 if (aarch64_float_const_zero_rtx_p (op1))
8813 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8814 cost. */
8815 return true;
8817 return false;
8819 return false;
8821 case MINUS:
8823 op0 = XEXP (x, 0);
8824 op1 = XEXP (x, 1);
8826 cost_minus:
8827 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8829 /* Detect valid immediates. */
8830 if ((GET_MODE_CLASS (mode) == MODE_INT
8831 || (GET_MODE_CLASS (mode) == MODE_CC
8832 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8833 && CONST_INT_P (op1)
8834 && aarch64_uimm12_shift (INTVAL (op1)))
8836 if (speed)
8837 /* SUB(S) (immediate). */
8838 *cost += extra_cost->alu.arith;
8839 return true;
8842 /* Look for SUB (extended register). */
8843 if (is_a <scalar_int_mode> (mode, &int_mode)
8844 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8846 if (speed)
8847 *cost += extra_cost->alu.extend_arith;
8849 op1 = aarch64_strip_extend (op1, true);
8850 *cost += rtx_cost (op1, VOIDmode,
8851 (enum rtx_code) GET_CODE (op1), 0, speed);
8852 return true;
8855 rtx new_op1 = aarch64_strip_extend (op1, false);
8857 /* Cost this as an FMA-alike operation. */
8858 if ((GET_CODE (new_op1) == MULT
8859 || aarch64_shift_p (GET_CODE (new_op1)))
8860 && code != COMPARE)
8862 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8863 (enum rtx_code) code,
8864 speed);
8865 return true;
8868 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8870 if (speed)
8872 if (VECTOR_MODE_P (mode))
8874 /* Vector SUB. */
8875 *cost += extra_cost->vect.alu;
8877 else if (GET_MODE_CLASS (mode) == MODE_INT)
8879 /* SUB(S). */
8880 *cost += extra_cost->alu.arith;
8882 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8884 /* FSUB. */
8885 *cost += extra_cost->fp[mode == DFmode].addsub;
8888 return true;
8891 case PLUS:
8893 rtx new_op0;
8895 op0 = XEXP (x, 0);
8896 op1 = XEXP (x, 1);
8898 cost_plus:
8899 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8900 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8902 /* CSINC. */
8903 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8904 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8905 return true;
8908 if (GET_MODE_CLASS (mode) == MODE_INT
8909 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8910 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8912 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8914 if (speed)
8915 /* ADD (immediate). */
8916 *cost += extra_cost->alu.arith;
8917 return true;
8920 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8922 /* Look for ADD (extended register). */
8923 if (is_a <scalar_int_mode> (mode, &int_mode)
8924 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8926 if (speed)
8927 *cost += extra_cost->alu.extend_arith;
8929 op0 = aarch64_strip_extend (op0, true);
8930 *cost += rtx_cost (op0, VOIDmode,
8931 (enum rtx_code) GET_CODE (op0), 0, speed);
8932 return true;
8935 /* Strip any extend, leave shifts behind as we will
8936 cost them through mult_cost. */
8937 new_op0 = aarch64_strip_extend (op0, false);
8939 if (GET_CODE (new_op0) == MULT
8940 || aarch64_shift_p (GET_CODE (new_op0)))
8942 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8943 speed);
8944 return true;
8947 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8949 if (speed)
8951 if (VECTOR_MODE_P (mode))
8953 /* Vector ADD. */
8954 *cost += extra_cost->vect.alu;
8956 else if (GET_MODE_CLASS (mode) == MODE_INT)
8958 /* ADD. */
8959 *cost += extra_cost->alu.arith;
8961 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8963 /* FADD. */
8964 *cost += extra_cost->fp[mode == DFmode].addsub;
8967 return true;
8970 case BSWAP:
8971 *cost = COSTS_N_INSNS (1);
8973 if (speed)
8975 if (VECTOR_MODE_P (mode))
8976 *cost += extra_cost->vect.alu;
8977 else
8978 *cost += extra_cost->alu.rev;
8980 return false;
8982 case IOR:
8983 if (aarch_rev16_p (x))
8985 *cost = COSTS_N_INSNS (1);
8987 if (speed)
8989 if (VECTOR_MODE_P (mode))
8990 *cost += extra_cost->vect.alu;
8991 else
8992 *cost += extra_cost->alu.rev;
8994 return true;
8997 if (aarch64_extr_rtx_p (x, &op0, &op1))
8999 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9000 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9001 if (speed)
9002 *cost += extra_cost->alu.shift;
9004 return true;
9006 /* Fall through. */
9007 case XOR:
9008 case AND:
9009 cost_logic:
9010 op0 = XEXP (x, 0);
9011 op1 = XEXP (x, 1);
9013 if (VECTOR_MODE_P (mode))
9015 if (speed)
9016 *cost += extra_cost->vect.alu;
9017 return true;
9020 if (code == AND
9021 && GET_CODE (op0) == MULT
9022 && CONST_INT_P (XEXP (op0, 1))
9023 && CONST_INT_P (op1)
9024 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9025 INTVAL (op1)) != 0)
9027 /* This is a UBFM/SBFM. */
9028 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9029 if (speed)
9030 *cost += extra_cost->alu.bfx;
9031 return true;
9034 if (is_int_mode (mode, &int_mode))
9036 if (CONST_INT_P (op1))
9038 /* We have a mask + shift version of a UBFIZ
9039 i.e. the *andim_ashift<mode>_bfiz pattern. */
9040 if (GET_CODE (op0) == ASHIFT
9041 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9042 XEXP (op0, 1)))
9044 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9045 (enum rtx_code) code, 0, speed);
9046 if (speed)
9047 *cost += extra_cost->alu.bfx;
9049 return true;
9051 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9053 /* We possibly get the immediate for free, this is not
9054 modelled. */
9055 *cost += rtx_cost (op0, int_mode,
9056 (enum rtx_code) code, 0, speed);
9057 if (speed)
9058 *cost += extra_cost->alu.logical;
9060 return true;
9063 else
9065 rtx new_op0 = op0;
9067 /* Handle ORN, EON, or BIC. */
9068 if (GET_CODE (op0) == NOT)
9069 op0 = XEXP (op0, 0);
9071 new_op0 = aarch64_strip_shift (op0);
9073 /* If we had a shift on op0 then this is a logical-shift-
9074 by-register/immediate operation. Otherwise, this is just
9075 a logical operation. */
9076 if (speed)
9078 if (new_op0 != op0)
9080 /* Shift by immediate. */
9081 if (CONST_INT_P (XEXP (op0, 1)))
9082 *cost += extra_cost->alu.log_shift;
9083 else
9084 *cost += extra_cost->alu.log_shift_reg;
9086 else
9087 *cost += extra_cost->alu.logical;
9090 /* In both cases we want to cost both operands. */
9091 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9092 0, speed);
9093 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9094 1, speed);
9096 return true;
9099 return false;
9101 case NOT:
9102 x = XEXP (x, 0);
9103 op0 = aarch64_strip_shift (x);
9105 if (VECTOR_MODE_P (mode))
9107 /* Vector NOT. */
9108 *cost += extra_cost->vect.alu;
9109 return false;
9112 /* MVN-shifted-reg. */
9113 if (op0 != x)
9115 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9117 if (speed)
9118 *cost += extra_cost->alu.log_shift;
9120 return true;
9122 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9123 Handle the second form here taking care that 'a' in the above can
9124 be a shift. */
9125 else if (GET_CODE (op0) == XOR)
9127 rtx newop0 = XEXP (op0, 0);
9128 rtx newop1 = XEXP (op0, 1);
9129 rtx op0_stripped = aarch64_strip_shift (newop0);
9131 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9132 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9134 if (speed)
9136 if (op0_stripped != newop0)
9137 *cost += extra_cost->alu.log_shift;
9138 else
9139 *cost += extra_cost->alu.logical;
9142 return true;
9144 /* MVN. */
9145 if (speed)
9146 *cost += extra_cost->alu.logical;
9148 return false;
9150 case ZERO_EXTEND:
9152 op0 = XEXP (x, 0);
9153 /* If a value is written in SI mode, then zero extended to DI
9154 mode, the operation will in general be free as a write to
9155 a 'w' register implicitly zeroes the upper bits of an 'x'
9156 register. However, if this is
9158 (set (reg) (zero_extend (reg)))
9160 we must cost the explicit register move. */
9161 if (mode == DImode
9162 && GET_MODE (op0) == SImode
9163 && outer == SET)
9165 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9167 /* If OP_COST is non-zero, then the cost of the zero extend
9168 is effectively the cost of the inner operation. Otherwise
9169 we have a MOV instruction and we take the cost from the MOV
9170 itself. This is true independently of whether we are
9171 optimizing for space or time. */
9172 if (op_cost)
9173 *cost = op_cost;
9175 return true;
9177 else if (MEM_P (op0))
9179 /* All loads can zero extend to any size for free. */
9180 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9181 return true;
9184 op0 = aarch64_extend_bitfield_pattern_p (x);
9185 if (op0)
9187 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9188 if (speed)
9189 *cost += extra_cost->alu.bfx;
9190 return true;
9193 if (speed)
9195 if (VECTOR_MODE_P (mode))
9197 /* UMOV. */
9198 *cost += extra_cost->vect.alu;
9200 else
9202 /* We generate an AND instead of UXTB/UXTH. */
9203 *cost += extra_cost->alu.logical;
9206 return false;
9208 case SIGN_EXTEND:
9209 if (MEM_P (XEXP (x, 0)))
9211 /* LDRSH. */
9212 if (speed)
9214 rtx address = XEXP (XEXP (x, 0), 0);
9215 *cost += extra_cost->ldst.load_sign_extend;
9217 *cost +=
9218 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9219 0, speed));
9221 return true;
9224 op0 = aarch64_extend_bitfield_pattern_p (x);
9225 if (op0)
9227 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9228 if (speed)
9229 *cost += extra_cost->alu.bfx;
9230 return true;
9233 if (speed)
9235 if (VECTOR_MODE_P (mode))
9236 *cost += extra_cost->vect.alu;
9237 else
9238 *cost += extra_cost->alu.extend;
9240 return false;
9242 case ASHIFT:
9243 op0 = XEXP (x, 0);
9244 op1 = XEXP (x, 1);
9246 if (CONST_INT_P (op1))
9248 if (speed)
9250 if (VECTOR_MODE_P (mode))
9252 /* Vector shift (immediate). */
9253 *cost += extra_cost->vect.alu;
9255 else
9257 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9258 aliases. */
9259 *cost += extra_cost->alu.shift;
9263 /* We can incorporate zero/sign extend for free. */
9264 if (GET_CODE (op0) == ZERO_EXTEND
9265 || GET_CODE (op0) == SIGN_EXTEND)
9266 op0 = XEXP (op0, 0);
9268 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9269 return true;
9271 else
9273 if (VECTOR_MODE_P (mode))
9275 if (speed)
9276 /* Vector shift (register). */
9277 *cost += extra_cost->vect.alu;
9279 else
9281 if (speed)
9282 /* LSLV. */
9283 *cost += extra_cost->alu.shift_reg;
9285 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9286 && CONST_INT_P (XEXP (op1, 1))
9287 && known_eq (INTVAL (XEXP (op1, 1)),
9288 GET_MODE_BITSIZE (mode) - 1))
9290 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9291 /* We already demanded XEXP (op1, 0) to be REG_P, so
9292 don't recurse into it. */
9293 return true;
9296 return false; /* All arguments need to be in registers. */
9299 case ROTATE:
9300 case ROTATERT:
9301 case LSHIFTRT:
9302 case ASHIFTRT:
9303 op0 = XEXP (x, 0);
9304 op1 = XEXP (x, 1);
9306 if (CONST_INT_P (op1))
9308 /* ASR (immediate) and friends. */
9309 if (speed)
9311 if (VECTOR_MODE_P (mode))
9312 *cost += extra_cost->vect.alu;
9313 else
9314 *cost += extra_cost->alu.shift;
9317 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9318 return true;
9320 else
9322 if (VECTOR_MODE_P (mode))
9324 if (speed)
9325 /* Vector shift (register). */
9326 *cost += extra_cost->vect.alu;
9328 else
9330 if (speed)
9331 /* ASR (register) and friends. */
9332 *cost += extra_cost->alu.shift_reg;
9334 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9335 && CONST_INT_P (XEXP (op1, 1))
9336 && known_eq (INTVAL (XEXP (op1, 1)),
9337 GET_MODE_BITSIZE (mode) - 1))
9339 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9340 /* We already demanded XEXP (op1, 0) to be REG_P, so
9341 don't recurse into it. */
9342 return true;
9345 return false; /* All arguments need to be in registers. */
9348 case SYMBOL_REF:
9350 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9351 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9353 /* LDR. */
9354 if (speed)
9355 *cost += extra_cost->ldst.load;
9357 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9358 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9360 /* ADRP, followed by ADD. */
9361 *cost += COSTS_N_INSNS (1);
9362 if (speed)
9363 *cost += 2 * extra_cost->alu.arith;
9365 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9366 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9368 /* ADR. */
9369 if (speed)
9370 *cost += extra_cost->alu.arith;
9373 if (flag_pic)
9375 /* One extra load instruction, after accessing the GOT. */
9376 *cost += COSTS_N_INSNS (1);
9377 if (speed)
9378 *cost += extra_cost->ldst.load;
9380 return true;
9382 case HIGH:
9383 case LO_SUM:
9384 /* ADRP/ADD (immediate). */
9385 if (speed)
9386 *cost += extra_cost->alu.arith;
9387 return true;
9389 case ZERO_EXTRACT:
9390 case SIGN_EXTRACT:
9391 /* UBFX/SBFX. */
9392 if (speed)
9394 if (VECTOR_MODE_P (mode))
9395 *cost += extra_cost->vect.alu;
9396 else
9397 *cost += extra_cost->alu.bfx;
9400 /* We can trust that the immediates used will be correct (there
9401 are no by-register forms), so we need only cost op0. */
9402 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9403 return true;
9405 case MULT:
9406 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9407 /* aarch64_rtx_mult_cost always handles recursion to its
9408 operands. */
9409 return true;
9411 case MOD:
9412 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9413 ANDs and a CSNEG. Assume here that CSNEG costs the same as an
9414 unconditional negate. This case should only ever be reached through
9415 the set_smod_pow2_cheap check in expmed.c. */
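/* For reference, the four-instruction expansion being costed here has
   roughly this shape for an SImode "x % 4" (operand allocation and the
   exact condition come from the mod<mode>3 expander; this is only a
   sketch):

	negs	w1, w0
	and	w0, w0, 3
	and	w1, w1, 3
	csneg	w0, w0, w1, mi

   one NEGS, two mutually independent ANDs, and a CSNEG, matching the
   2 * logical + 2 * arith cost added below.  */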
9416 if (CONST_INT_P (XEXP (x, 1))
9417 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9418 && (mode == SImode || mode == DImode))
9420 /* We expand to 4 instructions. Reset the baseline. */
9421 *cost = COSTS_N_INSNS (4);
9423 if (speed)
9424 *cost += 2 * extra_cost->alu.logical
9425 + 2 * extra_cost->alu.arith;
9427 return true;
9430 /* Fall-through. */
9431 case UMOD:
9432 if (speed)
9434 /* Slightly prefer UMOD over SMOD. */
9435 if (VECTOR_MODE_P (mode))
9436 *cost += extra_cost->vect.alu;
9437 else if (GET_MODE_CLASS (mode) == MODE_INT)
9438 *cost += (extra_cost->mult[mode == DImode].add
9439 + extra_cost->mult[mode == DImode].idiv
9440 + (code == MOD ? 1 : 0));
9442 return false; /* All arguments need to be in registers. */
9444 case DIV:
9445 case UDIV:
9446 case SQRT:
9447 if (speed)
9449 if (VECTOR_MODE_P (mode))
9450 *cost += extra_cost->vect.alu;
9451 else if (GET_MODE_CLASS (mode) == MODE_INT)
9452 /* There is no integer SQRT, so only DIV and UDIV can get
9453 here. */
9454 *cost += (extra_cost->mult[mode == DImode].idiv
9455 /* Slightly prefer UDIV over SDIV. */
9456 + (code == DIV ? 1 : 0));
9457 else
9458 *cost += extra_cost->fp[mode == DFmode].div;
9460 return false; /* All arguments need to be in registers. */
9462 case IF_THEN_ELSE:
9463 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9464 XEXP (x, 2), cost, speed);
9466 case EQ:
9467 case NE:
9468 case GT:
9469 case GTU:
9470 case LT:
9471 case LTU:
9472 case GE:
9473 case GEU:
9474 case LE:
9475 case LEU:
9477 return false; /* All arguments must be in registers. */
9479 case FMA:
9480 op0 = XEXP (x, 0);
9481 op1 = XEXP (x, 1);
9482 op2 = XEXP (x, 2);
9484 if (speed)
9486 if (VECTOR_MODE_P (mode))
9487 *cost += extra_cost->vect.alu;
9488 else
9489 *cost += extra_cost->fp[mode == DFmode].fma;
9492 /* FMSUB, FNMADD, and FNMSUB are free. */
9493 if (GET_CODE (op0) == NEG)
9494 op0 = XEXP (op0, 0);
9496 if (GET_CODE (op2) == NEG)
9497 op2 = XEXP (op2, 0);
9499 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9500 and the by-element operand as operand 0. */
9501 if (GET_CODE (op1) == NEG)
9502 op1 = XEXP (op1, 0);
9504 /* Catch vector-by-element operations. The by-element operand can
9505 either be (vec_duplicate (vec_select (x))) or just
9506 (vec_select (x)), depending on whether we are multiplying by
9507 a vector or a scalar.
9509 Canonicalization is not very good in these cases: FMA4 will put the
9510 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9511 if (GET_CODE (op0) == VEC_DUPLICATE)
9512 op0 = XEXP (op0, 0);
9513 else if (GET_CODE (op1) == VEC_DUPLICATE)
9514 op1 = XEXP (op1, 0);
9516 if (GET_CODE (op0) == VEC_SELECT)
9517 op0 = XEXP (op0, 0);
9518 else if (GET_CODE (op1) == VEC_SELECT)
9519 op1 = XEXP (op1, 0);
9521 /* If the remaining parameters are not registers,
9522 get the cost to put them into registers. */
9523 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9524 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9525 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9526 return true;
9528 case FLOAT:
9529 case UNSIGNED_FLOAT:
9530 if (speed)
9531 *cost += extra_cost->fp[mode == DFmode].fromint;
9532 return false;
9534 case FLOAT_EXTEND:
9535 if (speed)
9537 if (VECTOR_MODE_P (mode))
9539 /* Vector widen. */
9540 *cost += extra_cost->vect.alu;
9542 else
9543 *cost += extra_cost->fp[mode == DFmode].widen;
9545 return false;
9547 case FLOAT_TRUNCATE:
9548 if (speed)
9550 if (VECTOR_MODE_P (mode))
9552 /* Vector conversion. */
9553 *cost += extra_cost->vect.alu;
9555 else
9556 *cost += extra_cost->fp[mode == DFmode].narrow;
9558 return false;
9560 case FIX:
9561 case UNSIGNED_FIX:
9562 x = XEXP (x, 0);
9563 /* Strip the rounding part. They will all be implemented
9564 by the fcvt* family of instructions anyway. */
9565 if (GET_CODE (x) == UNSPEC)
9567 unsigned int uns_code = XINT (x, 1);
9569 if (uns_code == UNSPEC_FRINTA
9570 || uns_code == UNSPEC_FRINTM
9571 || uns_code == UNSPEC_FRINTN
9572 || uns_code == UNSPEC_FRINTP
9573 || uns_code == UNSPEC_FRINTZ)
9574 x = XVECEXP (x, 0, 0);
9577 if (speed)
9579 if (VECTOR_MODE_P (mode))
9580 *cost += extra_cost->vect.alu;
9581 else
9582 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9585 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9586 fixed-point fcvt. */
9587 if (GET_CODE (x) == MULT
9588 && ((VECTOR_MODE_P (mode)
9589 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9590 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9592 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9593 0, speed);
9594 return true;
9597 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9598 return true;
9600 case ABS:
9601 if (VECTOR_MODE_P (mode))
9603 /* ABS (vector). */
9604 if (speed)
9605 *cost += extra_cost->vect.alu;
9607 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9609 op0 = XEXP (x, 0);
9611 /* FABD, which is analogous to FADD. */
9612 if (GET_CODE (op0) == MINUS)
9614 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9615 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9616 if (speed)
9617 *cost += extra_cost->fp[mode == DFmode].addsub;
9619 return true;
9621 /* Simple FABS is analogous to FNEG. */
9622 if (speed)
9623 *cost += extra_cost->fp[mode == DFmode].neg;
9625 else
9627 /* Integer ABS will either be split into
9628 two arithmetic instructions, or will be an ABS
9629 (scalar), which we don't model. */
9630 *cost = COSTS_N_INSNS (2);
9631 if (speed)
9632 *cost += 2 * extra_cost->alu.arith;
9634 return false;
9636 case SMAX:
9637 case SMIN:
9638 if (speed)
9640 if (VECTOR_MODE_P (mode))
9641 *cost += extra_cost->vect.alu;
9642 else
9644 /* FMAXNM/FMINNM/FMAX/FMIN.
9645 TODO: This may not be accurate for all implementations, but
9646 we do not model this in the cost tables. */
9647 *cost += extra_cost->fp[mode == DFmode].addsub;
9650 return false;
9652 case UNSPEC:
9653 /* The floating point round to integer frint* instructions. */
9654 if (aarch64_frint_unspec_p (XINT (x, 1)))
9656 if (speed)
9657 *cost += extra_cost->fp[mode == DFmode].roundint;
9659 return false;
9662 if (XINT (x, 1) == UNSPEC_RBIT)
9664 if (speed)
9665 *cost += extra_cost->alu.rev;
9667 return false;
9669 break;
9671 case TRUNCATE:
9673 /* Decompose <su>muldi3_highpart. */
9674 if (/* (truncate:DI */
9675 mode == DImode
9676 /* (lshiftrt:TI */
9677 && GET_MODE (XEXP (x, 0)) == TImode
9678 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9679 /* (mult:TI */
9680 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9681 /* (ANY_EXTEND:TI (reg:DI))
9682 (ANY_EXTEND:TI (reg:DI))) */
9683 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9684 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9685 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9686 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9687 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9688 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9689 /* (const_int 64) */
9690 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9691 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9693 /* UMULH/SMULH. */
9694 if (speed)
9695 *cost += extra_cost->mult[mode == DImode].extend;
9696 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9697 mode, MULT, 0, speed);
9698 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9699 mode, MULT, 1, speed);
9700 return true;
9703 /* Fall through. */
9704 default:
9705 break;
9708 if (dump_file
9709 && flag_aarch64_verbose_cost)
9710 fprintf (dump_file,
9711 "\nFailed to cost RTX. Assuming default cost.\n");
9713 return true;
9716 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9717 calculated for X. This cost is stored in *COST. Returns true
9718 if the total cost of X was calculated. */
9719 static bool
9720 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9721 int param, int *cost, bool speed)
9723 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9725 if (dump_file
9726 && flag_aarch64_verbose_cost)
9728 print_rtl_single (dump_file, x);
9729 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9730 speed ? "Hot" : "Cold",
9731 *cost, result ? "final" : "partial");
9734 return result;
9737 static int
9738 aarch64_register_move_cost (machine_mode mode,
9739 reg_class_t from_i, reg_class_t to_i)
9741 enum reg_class from = (enum reg_class) from_i;
9742 enum reg_class to = (enum reg_class) to_i;
9743 const struct cpu_regmove_cost *regmove_cost
9744 = aarch64_tune_params.regmove_cost;
9746 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9747 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9748 to = GENERAL_REGS;
9750 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9751 from = GENERAL_REGS;
9753 /* The cost of moving between GPRs and the stack is the same as GP2GP. */
9754 if ((from == GENERAL_REGS && to == STACK_REG)
9755 || (to == GENERAL_REGS && from == STACK_REG))
9756 return regmove_cost->GP2GP;
9758 /* To/From the stack register, we move via the gprs. */
9759 if (to == STACK_REG || from == STACK_REG)
9760 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9761 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9763 if (known_eq (GET_MODE_SIZE (mode), 16))
9765 /* 128-bit operations on general registers require 2 instructions. */
9766 if (from == GENERAL_REGS && to == GENERAL_REGS)
9767 return regmove_cost->GP2GP * 2;
9768 else if (from == GENERAL_REGS)
9769 return regmove_cost->GP2FP * 2;
9770 else if (to == GENERAL_REGS)
9771 return regmove_cost->FP2GP * 2;
9773 /* When AdvSIMD instructions are disabled it is not possible to move
9774 a 128-bit value directly between Q registers. This is handled in
9775 secondary reload. A general register is used as a scratch to move
9776 the upper DI value and the lower DI value is moved directly,
9777 hence the cost is the sum of three moves. */
9778 if (! TARGET_SIMD)
9779 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9781 return regmove_cost->FP2FP;
9784 if (from == GENERAL_REGS && to == GENERAL_REGS)
9785 return regmove_cost->GP2GP;
9786 else if (from == GENERAL_REGS)
9787 return regmove_cost->GP2FP;
9788 else if (to == GENERAL_REGS)
9789 return regmove_cost->FP2GP;
9791 return regmove_cost->FP2FP;
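/* Worked example (added for illustration; not in the original source):
   with the cost table of the current tuning target, moving a 16-byte
   (e.g. TImode) value from GENERAL_REGS to FP_REGS is costed as
   2 * GP2FP by the known_eq (GET_MODE_SIZE (mode), 16) branch above,
   reflecting the two 64-bit moves needed, while an 8-byte move costs
   a single GP2FP.  */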
9794 static int
9795 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9796 reg_class_t rclass ATTRIBUTE_UNUSED,
9797 bool in ATTRIBUTE_UNUSED)
9799 return aarch64_tune_params.memmov_cost;
9802 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9803 to optimize 1.0/sqrt. */
9805 static bool
9806 use_rsqrt_p (machine_mode mode)
9808 return (!flag_trapping_math
9809 && flag_unsafe_math_optimizations
9810 && ((aarch64_tune_params.approx_modes->recip_sqrt
9811 & AARCH64_APPROX_MODE (mode))
9812 || flag_mrecip_low_precision_sqrt));
9815 /* Function to decide when to use the approximate reciprocal square root
9816 builtin. */
9818 static tree
9819 aarch64_builtin_reciprocal (tree fndecl)
9821 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9823 if (!use_rsqrt_p (mode))
9824 return NULL_TREE;
9825 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9828 /* Emit instruction sequence to compute either the approximate square root
9829 or its approximate reciprocal, depending on the flag RECP, and return
9830 whether the sequence was emitted or not. */
9832 bool
9833 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9835 machine_mode mode = GET_MODE (dst);
9837 if (GET_MODE_INNER (mode) == HFmode)
9839 gcc_assert (!recp);
9840 return false;
9843 if (!recp)
9845 if (!(flag_mlow_precision_sqrt
9846 || (aarch64_tune_params.approx_modes->sqrt
9847 & AARCH64_APPROX_MODE (mode))))
9848 return false;
9850 if (flag_finite_math_only
9851 || flag_trapping_math
9852 || !flag_unsafe_math_optimizations
9853 || optimize_function_for_size_p (cfun))
9854 return false;
9856 else
9857 /* Caller assumes we cannot fail. */
9858 gcc_assert (use_rsqrt_p (mode));
9860 machine_mode mmsk = mode_for_int_vector (mode).require ();
9861 rtx xmsk = gen_reg_rtx (mmsk);
9862 if (!recp)
9863 /* When calculating the approximate square root, compare the
9864 argument with 0.0 and create a mask. */
9865 emit_insn (gen_rtx_SET (xmsk,
9866 gen_rtx_NEG (mmsk,
9867 gen_rtx_EQ (mmsk, src,
9868 CONST0_RTX (mode)))));
9870 /* Estimate the approximate reciprocal square root. */
9871 rtx xdst = gen_reg_rtx (mode);
9872 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
9874 /* Iterate over the series twice for SF and thrice for DF. */
9875 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9877 /* Optionally iterate over the series once less for faster performance
9878 while sacrificing some accuracy. */
9879 if ((recp && flag_mrecip_low_precision_sqrt)
9880 || (!recp && flag_mlow_precision_sqrt))
9881 iterations--;
9883 /* Iterate over the series to calculate the approximate reciprocal square
9884 root. */
9885 rtx x1 = gen_reg_rtx (mode);
9886 while (iterations--)
9888 rtx x2 = gen_reg_rtx (mode);
9889 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9891 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
9893 if (iterations > 0)
9894 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9897 if (!recp)
9899 /* Qualify the approximate reciprocal square root when the argument is
9900 0.0 by squashing the intermediary result to 0.0. */
9901 rtx xtmp = gen_reg_rtx (mmsk);
9902 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9903 gen_rtx_SUBREG (mmsk, xdst, 0)));
9904 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9906 /* Calculate the approximate square root. */
9907 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9910 /* Finalize the approximation. */
9911 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9913 return true;
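/* Illustrative sketch (not part of the original file): a scalar model of
   the refinement loop emitted above.  FRSQRTE supplies a rough estimate
   of 1/sqrt(d) and each FRSQRTS step computes (3 - a*b)/2, so one pass
   performs the Newton-Raphson update x' = x * (3 - d*x*x)/2.  The seed
   parameter stands in for the hardware estimate.  */
#if 0
static double
example_rsqrt_refine (double seed, double d, int iterations)
{
  double x = seed;                       /* FRSQRTE-style initial estimate.  */
  while (iterations-- > 0)
    x = x * ((3.0 - d * x * x) / 2.0);   /* One FRSQRTS step.  */
  return x;                              /* Approximates 1/sqrt(d).  */
}
#endif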
9916 /* Emit the instruction sequence to compute the approximation for the division
9917 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9919 bool
9920 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9922 machine_mode mode = GET_MODE (quo);
9924 if (GET_MODE_INNER (mode) == HFmode)
9925 return false;
9927 bool use_approx_division_p = (flag_mlow_precision_div
9928 || (aarch64_tune_params.approx_modes->division
9929 & AARCH64_APPROX_MODE (mode)));
9931 if (!flag_finite_math_only
9932 || flag_trapping_math
9933 || !flag_unsafe_math_optimizations
9934 || optimize_function_for_size_p (cfun)
9935 || !use_approx_division_p)
9936 return false;
9938 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9939 return false;
9941 /* Estimate the approximate reciprocal. */
9942 rtx xrcp = gen_reg_rtx (mode);
9943 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
9945 /* Iterate over the series twice for SF and thrice for DF. */
9946 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9948 /* Optionally iterate over the series once less for faster performance,
9949 while sacrificing some accuracy. */
9950 if (flag_mlow_precision_div)
9951 iterations--;
9953 /* Iterate over the series to calculate the approximate reciprocal. */
9954 rtx xtmp = gen_reg_rtx (mode);
9955 while (iterations--)
9957 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
9959 if (iterations > 0)
9960 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9963 if (num != CONST1_RTX (mode))
9965 /* As the approximate reciprocal of DEN is already calculated, only
9966 calculate the approximate division when NUM is not 1.0. */
9967 rtx xnum = force_reg (mode, num);
9968 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
9971 /* Finalize the approximation. */
9972 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
9973 return true;
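/* Illustrative sketch (not part of the original file): a scalar model of
   the reciprocal refinement emitted above.  FRECPE gives a rough estimate
   of 1/d and each FRECPS step computes 2 - a*b, so one pass performs the
   Newton-Raphson update x' = x * (2 - d*x); the final quotient is then
   formed as num * (1/den), as in the code above.  */
#if 0
static double
example_recip_refine (double seed, double d, int iterations)
{
  double x = seed;                       /* FRECPE-style initial estimate.  */
  while (iterations-- > 0)
    x = x * (2.0 - d * x);               /* One FRECPS step.  */
  return x;                              /* Approximates 1/d.  */
}
#endif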
9976 /* Return the number of instructions that can be issued per cycle. */
9977 static int
9978 aarch64_sched_issue_rate (void)
9980 return aarch64_tune_params.issue_rate;
9983 static int
9984 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9986 int issue_rate = aarch64_sched_issue_rate ();
9988 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
9992 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9993 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9994 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9996 static int
9997 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
9998 int ready_index)
10000 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10004 /* Vectorizer cost model target hooks. */
10006 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10007 static int
10008 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10009 tree vectype,
10010 int misalign ATTRIBUTE_UNUSED)
10012 unsigned elements;
10013 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10014 bool fp = false;
10016 if (vectype != NULL)
10017 fp = FLOAT_TYPE_P (vectype);
10019 switch (type_of_cost)
10021 case scalar_stmt:
10022 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10024 case scalar_load:
10025 return costs->scalar_load_cost;
10027 case scalar_store:
10028 return costs->scalar_store_cost;
10030 case vector_stmt:
10031 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10033 case vector_load:
10034 return costs->vec_align_load_cost;
10036 case vector_store:
10037 return costs->vec_store_cost;
10039 case vec_to_scalar:
10040 return costs->vec_to_scalar_cost;
10042 case scalar_to_vec:
10043 return costs->scalar_to_vec_cost;
10045 case unaligned_load:
10046 case vector_gather_load:
10047 return costs->vec_unalign_load_cost;
10049 case unaligned_store:
10050 case vector_scatter_store:
10051 return costs->vec_unalign_store_cost;
10053 case cond_branch_taken:
10054 return costs->cond_taken_branch_cost;
10056 case cond_branch_not_taken:
10057 return costs->cond_not_taken_branch_cost;
10059 case vec_perm:
10060 return costs->vec_permute_cost;
10062 case vec_promote_demote:
10063 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10065 case vec_construct:
10066 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10067 return elements / 2 + 1;
10069 default:
10070 gcc_unreachable ();
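/* Worked example (added for illustration): for the vec_construct case
   above, a vector with four elements is costed at 4 / 2 + 1 = 3, i.e.
   roughly one insertion per pair of elements plus one extra move.  */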
10074 /* Implement targetm.vectorize.add_stmt_cost. */
10075 static unsigned
10076 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10077 struct _stmt_vec_info *stmt_info, int misalign,
10078 enum vect_cost_model_location where)
10080 unsigned *cost = (unsigned *) data;
10081 unsigned retval = 0;
10083 if (flag_vect_cost_model)
10085 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10086 int stmt_cost =
10087 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10089 /* Statements in an inner loop relative to the loop being
10090 vectorized are weighted more heavily. The value here is
10091 arbitrary and could potentially be improved with analysis. */
10092 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10093 count *= 50; /* FIXME */
10095 retval = (unsigned) (count * stmt_cost);
10096 cost[where] += retval;
10099 return retval;
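/* Worked example (added for illustration): a vector_load statement in
   the main loop body with COUNT == 1 contributes vec_align_load_cost
   from the tuning table to cost[vect_body]; had the same statement sat
   in an inner loop relative to the loop being vectorized, the FIXME
   above would first scale its count by 50.  */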
10102 static void initialize_aarch64_code_model (struct gcc_options *);
10104 /* Parse the TO_PARSE string and put the architecture struct that it
10105 selects into RES and the architectural features into ISA_FLAGS.
10106 Return an aarch64_parse_opt_result describing the parse result.
10107 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10109 static enum aarch64_parse_opt_result
10110 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10111 unsigned long *isa_flags)
10113 char *ext;
10114 const struct processor *arch;
10115 char *str = (char *) alloca (strlen (to_parse) + 1);
10116 size_t len;
10118 strcpy (str, to_parse);
10120 ext = strchr (str, '+');
10122 if (ext != NULL)
10123 len = ext - str;
10124 else
10125 len = strlen (str);
10127 if (len == 0)
10128 return AARCH64_PARSE_MISSING_ARG;
10131 /* Loop through the list of supported ARCHes to find a match. */
10132 for (arch = all_architectures; arch->name != NULL; arch++)
10134 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10136 unsigned long isa_temp = arch->flags;
10138 if (ext != NULL)
10140 /* TO_PARSE string contains at least one extension. */
10141 enum aarch64_parse_opt_result ext_res
10142 = aarch64_parse_extension (ext, &isa_temp);
10144 if (ext_res != AARCH64_PARSE_OK)
10145 return ext_res;
10147 /* Extension parsing was successful. Confirm the result
10148 arch and ISA flags. */
10149 *res = arch;
10150 *isa_flags = isa_temp;
10151 return AARCH64_PARSE_OK;
10155 /* ARCH name not found in list. */
10156 return AARCH64_PARSE_INVALID_ARG;
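/* Example (added for illustration): for -march=armv8.2-a+fp16 the code
   above splits the string at the first '+', matches "armv8.2-a" against
   all_architectures, and hands "+fp16" to aarch64_parse_extension to
   accumulate the extra ISA flags on top of the architecture's own.  */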
10159 /* Parse the TO_PARSE string and put the result tuning in RES and the
10160 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10161 describing the parse result. If there is an error parsing, RES and
10162 ISA_FLAGS are left unchanged. */
10164 static enum aarch64_parse_opt_result
10165 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10166 unsigned long *isa_flags)
10168 char *ext;
10169 const struct processor *cpu;
10170 char *str = (char *) alloca (strlen (to_parse) + 1);
10171 size_t len;
10173 strcpy (str, to_parse);
10175 ext = strchr (str, '+');
10177 if (ext != NULL)
10178 len = ext - str;
10179 else
10180 len = strlen (str);
10182 if (len == 0)
10183 return AARCH64_PARSE_MISSING_ARG;
10186 /* Loop through the list of supported CPUs to find a match. */
10187 for (cpu = all_cores; cpu->name != NULL; cpu++)
10189 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10191 unsigned long isa_temp = cpu->flags;
10194 if (ext != NULL)
10196 /* TO_PARSE string contains at least one extension. */
10197 enum aarch64_parse_opt_result ext_res
10198 = aarch64_parse_extension (ext, &isa_temp);
10200 if (ext_res != AARCH64_PARSE_OK)
10201 return ext_res;
10203 /* Extension parsing was successful. Confirm the result
10204 cpu and ISA flags. */
10205 *res = cpu;
10206 *isa_flags = isa_temp;
10207 return AARCH64_PARSE_OK;
10211 /* CPU name not found in list. */
10212 return AARCH64_PARSE_INVALID_ARG;
10215 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10216 Return an aarch64_parse_opt_result describing the parse result.
10217 If the parsing fails, RES does not change. */
10219 static enum aarch64_parse_opt_result
10220 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10222 const struct processor *cpu;
10223 char *str = (char *) alloca (strlen (to_parse) + 1);
10225 strcpy (str, to_parse);
10227 /* Loop through the list of supported CPUs to find a match. */
10228 for (cpu = all_cores; cpu->name != NULL; cpu++)
10230 if (strcmp (cpu->name, str) == 0)
10232 *res = cpu;
10233 return AARCH64_PARSE_OK;
10237 /* CPU name not found in list. */
10238 return AARCH64_PARSE_INVALID_ARG;
10241 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10242 described in FLAG. If it is, return the index bit for that fusion type.
10243 If not, error (printing OPTION_NAME) and return zero. */
10245 static unsigned int
10246 aarch64_parse_one_option_token (const char *token,
10247 size_t length,
10248 const struct aarch64_flag_desc *flag,
10249 const char *option_name)
10251 for (; flag->name != NULL; flag++)
10253 if (length == strlen (flag->name)
10254 && !strncmp (flag->name, token, length))
10255 return flag->flag;
10258 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10259 return 0;
10262 /* Parse OPTION which is a comma-separated list of flags to enable.
10263 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10264 default state we inherit from the CPU tuning structures. OPTION_NAME
10265 gives the top-level option we are parsing in the -moverride string,
10266 for use in error messages. */
10268 static unsigned int
10269 aarch64_parse_boolean_options (const char *option,
10270 const struct aarch64_flag_desc *flags,
10271 unsigned int initial_state,
10272 const char *option_name)
10274 const char separator = '.';
10275 const char* specs = option;
10276 const char* ntoken = option;
10277 unsigned int found_flags = initial_state;
10279 while ((ntoken = strchr (specs, separator)))
10281 size_t token_length = ntoken - specs;
10282 unsigned token_ops = aarch64_parse_one_option_token (specs,
10283 token_length,
10284 flags,
10285 option_name);
10286 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10287 in the token stream, reset the supported operations. So:
10289 adrp+add.cmp+branch.none.adrp+add
10291 would have the result of turning on only adrp+add fusion. */
10292 if (!token_ops)
10293 found_flags = 0;
10295 found_flags |= token_ops;
10296 specs = ++ntoken;
10299 /* The string ended with a trailing separator; report an error. */
10300 if (!(*specs))
10302 error ("%s string ill-formed\n", option_name);
10303 return 0;
10306 /* We still have one more token to parse. */
10307 size_t token_length = strlen (specs);
10308 unsigned token_ops = aarch64_parse_one_option_token (specs,
10309 token_length,
10310 flags,
10311 option_name);
10312 if (!token_ops)
10313 found_flags = 0;
10315 found_flags |= token_ops;
10316 return found_flags;
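/* Example (added for illustration): given the string
   "adrp+add.cmp+branch", the loop above splits it at each '.' and ORs
   the bit for each recognised fusion pair into FOUND_FLAGS on top of
   INITIAL_STATE; a "none" token (or an unrecognised one) resets the
   accumulated set, as described in the comment above.  */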
10319 /* Support for overriding instruction fusion. */
10321 static void
10322 aarch64_parse_fuse_string (const char *fuse_string,
10323 struct tune_params *tune)
10325 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10326 aarch64_fusible_pairs,
10327 tune->fusible_ops,
10328 "fuse=");
10331 /* Support for overriding other tuning flags. */
10333 static void
10334 aarch64_parse_tune_string (const char *tune_string,
10335 struct tune_params *tune)
10337 tune->extra_tuning_flags
10338 = aarch64_parse_boolean_options (tune_string,
10339 aarch64_tuning_flags,
10340 tune->extra_tuning_flags,
10341 "tune=");
10344 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10345 we understand. If it is, extract the option string and hand it off to
10346 the appropriate function. */
10348 void
10349 aarch64_parse_one_override_token (const char* token,
10350 size_t length,
10351 struct tune_params *tune)
10353 const struct aarch64_tuning_override_function *fn
10354 = aarch64_tuning_override_functions;
10356 const char *option_part = strchr (token, '=');
10357 if (!option_part)
10359 error ("tuning string missing in option (%s)", token);
10360 return;
10363 /* Get the length of the option name. */
10364 length = option_part - token;
10365 /* Skip the '=' to get to the option string. */
10366 option_part++;
10368 for (; fn->name != NULL; fn++)
10370 if (!strncmp (fn->name, token, length))
10372 fn->parse_override (option_part, tune);
10373 return;
10377 error ("unknown tuning option (%s)", token);
10378 return;
10381 /* A checking mechanism for the implementation of the TLS size. */
10383 static void
10384 initialize_aarch64_tls_size (struct gcc_options *opts)
10386 if (aarch64_tls_size == 0)
10387 aarch64_tls_size = 24;
10389 switch (opts->x_aarch64_cmodel_var)
10391 case AARCH64_CMODEL_TINY:
10392 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10393 needs two instructions to address, so we clamp the size to 24 bits. */
10394 if (aarch64_tls_size > 24)
10395 aarch64_tls_size = 24;
10396 break;
10397 case AARCH64_CMODEL_SMALL:
10398 /* The maximum TLS size allowed under small is 4G. */
10399 if (aarch64_tls_size > 32)
10400 aarch64_tls_size = 32;
10401 break;
10402 case AARCH64_CMODEL_LARGE:
10403 /* The maximum TLS size allowed under large is 16E.
10404 FIXME: 16E should be 64-bit; we only support a 48-bit offset now. */
10405 if (aarch64_tls_size > 48)
10406 aarch64_tls_size = 48;
10407 break;
10408 default:
10409 gcc_unreachable ();
10412 return;
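/* Worked example (added for illustration): -mtls-size=32 combined with
   -mcmodel=tiny is clamped to 24 above, since the tiny model can only
   address a 1M TLS area with its two-instruction sequence.  */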
10415 /* Parse STRING looking for options in the format:
10416 string :: option:string
10417 option :: name=substring
10418 name :: {a-z}
10419 substring :: defined by option. */
10421 static void
10422 aarch64_parse_override_string (const char* input_string,
10423 struct tune_params* tune)
10425 const char separator = ':';
10426 size_t string_length = strlen (input_string) + 1;
10427 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10428 char *string = string_root;
10429 strncpy (string, input_string, string_length);
10430 string[string_length - 1] = '\0';
10432 char* ntoken = string;
10434 while ((ntoken = strchr (string, separator)))
10436 size_t token_length = ntoken - string;
10438 /* NUL-terminate this substring so it looks like a string. */
10438 *ntoken = '\0';
10439 aarch64_parse_one_override_token (string, token_length, tune);
10440 string = ++ntoken;
10443 /* One last option to parse. */
10444 aarch64_parse_one_override_token (string, strlen (string), tune);
10445 free (string_root);
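/* Example (added for illustration): an option such as
   -moverride=fuse=adrp+add.cmp+branch:tune=... is split at each ':'
   above, and each "name=substring" token is dispatched by
   aarch64_parse_one_override_token to the matching handler
   (aarch64_parse_fuse_string or aarch64_parse_tune_string).  The tune=
   substring is left elided here; the valid flag names are those listed
   in aarch64_tuning_flags.  */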
10449 static void
10450 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10452 /* PR 70044: We have to be careful about being called multiple times for the
10453 same function. This means all changes should be repeatable. */
10455 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10456 Disable the frame pointer flag so the mid-end will not use a frame
10457 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10458 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10459 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10460 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10461 if (opts->x_flag_omit_frame_pointer == 0)
10462 opts->x_flag_omit_frame_pointer = 2;
10464 /* If not optimizing for size, set the default
10465 alignment to what the target wants. */
10466 if (!opts->x_optimize_size)
10468 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10469 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10470 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10471 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10472 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10473 opts->x_str_align_functions = aarch64_tune_params.function_align;
10476 /* We default to no pc-relative literal loads. */
10478 aarch64_pcrelative_literal_loads = false;
10480 /* If -mpc-relative-literal-loads is set on the command line, this
10481 implies that the user asked for PC relative literal loads. */
10482 if (opts->x_pcrelative_literal_loads == 1)
10483 aarch64_pcrelative_literal_loads = true;
10485 /* In the tiny memory model it makes no sense to disallow PC relative
10486 literal pool loads. */
10487 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10488 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10489 aarch64_pcrelative_literal_loads = true;
10491 /* When enabling the lower precision Newton series for the square root, also
10492 enable it for the reciprocal square root, since the latter is an
10493 intermediary step for the former. */
10494 if (flag_mlow_precision_sqrt)
10495 flag_mrecip_low_precision_sqrt = true;
10498 /* 'Unpack' the internal tuning structs and update the options
10499 in OPTS. The caller must have set up selected_tune and selected_arch
10500 as all the other target-specific codegen decisions are
10501 derived from them. */
10503 void
10504 aarch64_override_options_internal (struct gcc_options *opts)
10506 aarch64_tune_flags = selected_tune->flags;
10507 aarch64_tune = selected_tune->sched_core;
10508 /* Make a copy of the tuning parameters attached to the core, which
10509 we may later overwrite. */
10510 aarch64_tune_params = *(selected_tune->tune);
10511 aarch64_architecture_version = selected_arch->architecture_version;
10513 if (opts->x_aarch64_override_tune_string)
10514 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10515 &aarch64_tune_params);
10517 /* This target defaults to strict volatile bitfields. */
10518 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10519 opts->x_flag_strict_volatile_bitfields = 1;
10521 initialize_aarch64_code_model (opts);
10522 initialize_aarch64_tls_size (opts);
10524 int queue_depth = 0;
10525 switch (aarch64_tune_params.autoprefetcher_model)
10527 case tune_params::AUTOPREFETCHER_OFF:
10528 queue_depth = -1;
10529 break;
10530 case tune_params::AUTOPREFETCHER_WEAK:
10531 queue_depth = 0;
10532 break;
10533 case tune_params::AUTOPREFETCHER_STRONG:
10534 queue_depth = max_insn_queue_index + 1;
10535 break;
10536 default:
10537 gcc_unreachable ();
10540 /* We don't mind passing in global_options_set here as we don't use
10541 the *options_set structs anyway. */
10542 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10543 queue_depth,
10544 opts->x_param_values,
10545 global_options_set.x_param_values);
10547 /* Set up parameters to be used in the prefetching algorithm. Do not
10548 override the defaults unless we are tuning for a core we have
10549 researched values for. */
10550 if (aarch64_tune_params.prefetch->num_slots > 0)
10551 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10552 aarch64_tune_params.prefetch->num_slots,
10553 opts->x_param_values,
10554 global_options_set.x_param_values);
10555 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10556 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10557 aarch64_tune_params.prefetch->l1_cache_size,
10558 opts->x_param_values,
10559 global_options_set.x_param_values);
10560 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10561 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10562 aarch64_tune_params.prefetch->l1_cache_line_size,
10563 opts->x_param_values,
10564 global_options_set.x_param_values);
10565 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10566 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10567 aarch64_tune_params.prefetch->l2_cache_size,
10568 opts->x_param_values,
10569 global_options_set.x_param_values);
10570 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10571 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10572 0,
10573 opts->x_param_values,
10574 global_options_set.x_param_values);
10575 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10576 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10577 aarch64_tune_params.prefetch->minimum_stride,
10578 opts->x_param_values,
10579 global_options_set.x_param_values);
10581 /* Use the alternative scheduling-pressure algorithm by default. */
10582 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10583 opts->x_param_values,
10584 global_options_set.x_param_values);
10586 /* Enable software prefetching at the specified optimization level for
10587 CPUs that have prefetch. Lower optimization level threshold by 1
10588 when profiling is enabled. */
10589 if (opts->x_flag_prefetch_loop_arrays < 0
10590 && !opts->x_optimize_size
10591 && aarch64_tune_params.prefetch->default_opt_level >= 0
10592 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10593 opts->x_flag_prefetch_loop_arrays = 1;
10595 if (opts->x_aarch64_arch_string == NULL)
10596 opts->x_aarch64_arch_string = selected_arch->name;
10597 if (opts->x_aarch64_cpu_string == NULL)
10598 opts->x_aarch64_cpu_string = selected_cpu->name;
10599 if (opts->x_aarch64_tune_string == NULL)
10600 opts->x_aarch64_tune_string = selected_tune->name;
10602 aarch64_override_options_after_change_1 (opts);
10605 /* Print a hint with a suggestion for a core or architecture name that
10606 most closely resembles what the user passed in STR. ARCH is true if
10607 the user is asking for an architecture name. ARCH is false if the user
10608 is asking for a core name. */
10610 static void
10611 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10613 auto_vec<const char *> candidates;
10614 const struct processor *entry = arch ? all_architectures : all_cores;
10615 for (; entry->name != NULL; entry++)
10616 candidates.safe_push (entry->name);
10618 #ifdef HAVE_LOCAL_CPU_DETECT
10619 /* Also add "native" as a possible value. */
10620 if (arch)
10621 candidates.safe_push ("native");
10622 #endif
10624 char *s;
10625 const char *hint = candidates_list_and_hint (str, s, candidates);
10626 if (hint)
10627 inform (input_location, "valid arguments are: %s;"
10628 " did you mean %qs?", s, hint);
10629 else
10630 inform (input_location, "valid arguments are: %s", s);
10632 XDELETEVEC (s);
10635 /* Print a hint with a suggestion for a core name that most closely resembles
10636 what the user passed in STR. */
10638 inline static void
10639 aarch64_print_hint_for_core (const char *str)
10641 aarch64_print_hint_for_core_or_arch (str, false);
10644 /* Print a hint with a suggestion for an architecture name that most closely
10645 resembles what the user passed in STR. */
10647 inline static void
10648 aarch64_print_hint_for_arch (const char *str)
10650 aarch64_print_hint_for_core_or_arch (str, true);
10653 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10654 specified in STR and throw errors if appropriate. Put the results if
10655 they are valid in RES and ISA_FLAGS. Return whether the option is
10656 valid. */
10658 static bool
10659 aarch64_validate_mcpu (const char *str, const struct processor **res,
10660 unsigned long *isa_flags)
10662 enum aarch64_parse_opt_result parse_res
10663 = aarch64_parse_cpu (str, res, isa_flags);
10665 if (parse_res == AARCH64_PARSE_OK)
10666 return true;
10668 switch (parse_res)
10670 case AARCH64_PARSE_MISSING_ARG:
10671 error ("missing cpu name in %<-mcpu=%s%>", str);
10672 break;
10673 case AARCH64_PARSE_INVALID_ARG:
10674 error ("unknown value %qs for -mcpu", str);
10675 aarch64_print_hint_for_core (str);
10676 break;
10677 case AARCH64_PARSE_INVALID_FEATURE:
10678 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10679 break;
10680 default:
10681 gcc_unreachable ();
10684 return false;
10687 /* Validate a command-line -march option. Parse the arch and extensions
10688 (if any) specified in STR and throw errors if appropriate. Put the
10689 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10690 option is valid. */
10692 static bool
10693 aarch64_validate_march (const char *str, const struct processor **res,
10694 unsigned long *isa_flags)
10696 enum aarch64_parse_opt_result parse_res
10697 = aarch64_parse_arch (str, res, isa_flags);
10699 if (parse_res == AARCH64_PARSE_OK)
10700 return true;
10702 switch (parse_res)
10704 case AARCH64_PARSE_MISSING_ARG:
10705 error ("missing arch name in %<-march=%s%>", str);
10706 break;
10707 case AARCH64_PARSE_INVALID_ARG:
10708 error ("unknown value %qs for -march", str);
10709 aarch64_print_hint_for_arch (str);
10710 break;
10711 case AARCH64_PARSE_INVALID_FEATURE:
10712 error ("invalid feature modifier in %<-march=%s%>", str);
10713 break;
10714 default:
10715 gcc_unreachable ();
10718 return false;
10721 /* Validate a command-line -mtune option. Parse the cpu
10722 specified in STR and throw errors if appropriate. Put the
10723 result, if it is valid, in RES. Return whether the option is
10724 valid. */
10726 static bool
10727 aarch64_validate_mtune (const char *str, const struct processor **res)
10729 enum aarch64_parse_opt_result parse_res
10730 = aarch64_parse_tune (str, res);
10732 if (parse_res == AARCH64_PARSE_OK)
10733 return true;
10735 switch (parse_res)
10737 case AARCH64_PARSE_MISSING_ARG:
10738 error ("missing cpu name in %<-mtune=%s%>", str);
10739 break;
10740 case AARCH64_PARSE_INVALID_ARG:
10741 error ("unknown value %qs for -mtune", str);
10742 aarch64_print_hint_for_core (str);
10743 break;
10744 default:
10745 gcc_unreachable ();
10747 return false;
10750 /* Return the CPU corresponding to the enum CPU.
10751 If it doesn't specify a cpu, return the default. */
10753 static const struct processor *
10754 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10756 if (cpu != aarch64_none)
10757 return &all_cores[cpu];
10759 /* The & 0x3f is to extract the bottom 6 bits that encode the
10760 default cpu as selected by the --with-cpu GCC configure option
10761 in config.gcc.
10762 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10763 flags mechanism should be reworked to make it more sane. */
10764 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10767 /* Return the architecture corresponding to the enum ARCH.
10768 If it doesn't specify a valid architecture, return the default. */
10770 static const struct processor *
10771 aarch64_get_arch (enum aarch64_arch arch)
10773 if (arch != aarch64_no_arch)
10774 return &all_architectures[arch];
10776 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10778 return &all_architectures[cpu->arch];
10781 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10783 static poly_uint16
10784 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10786 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10787 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10788 deciding which .md file patterns to use and when deciding whether
10789 something is a legitimate address or constant. */
10790 if (value == SVE_SCALABLE || value == SVE_128)
10791 return poly_uint16 (2, 2);
10792 else
10793 return (int) value / 64;
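/* Worked example (added for illustration): -msve-vector-bits=256 yields
   a VG of 256 / 64 = 4 (four 64-bit granules per vector), while both
   -msve-vector-bits=scalable and -msve-vector-bits=128 produce the
   runtime-invariant poly_uint16 (2, 2), i.e. vector-length-agnostic
   code, for the reason given in the comment above.  */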
10796 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10797 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10798 tuning structs. In particular it must set selected_tune and
10799 aarch64_isa_flags that define the available ISA features and tuning
10800 decisions. It must also set selected_arch as this will be used to
10801 output the .arch asm tags for each function. */
10803 static void
10804 aarch64_override_options (void)
10806 unsigned long cpu_isa = 0;
10807 unsigned long arch_isa = 0;
10808 aarch64_isa_flags = 0;
10810 bool valid_cpu = true;
10811 bool valid_tune = true;
10812 bool valid_arch = true;
10814 selected_cpu = NULL;
10815 selected_arch = NULL;
10816 selected_tune = NULL;
10818 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10819 If either of -march or -mtune is given, they override their
10820 respective component of -mcpu. */
10821 if (aarch64_cpu_string)
10822 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10823 &cpu_isa);
10825 if (aarch64_arch_string)
10826 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10827 &arch_isa);
10829 if (aarch64_tune_string)
10830 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10832 /* If the user did not specify a processor, choose the default
10833 one for them. This will be the CPU set during configuration using
10834 --with-cpu, otherwise it is "generic". */
10835 if (!selected_cpu)
10837 if (selected_arch)
10839 selected_cpu = &all_cores[selected_arch->ident];
10840 aarch64_isa_flags = arch_isa;
10841 explicit_arch = selected_arch->arch;
10843 else
10845 /* Get default configure-time CPU. */
10846 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10847 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10850 if (selected_tune)
10851 explicit_tune_core = selected_tune->ident;
10853 /* If both -mcpu and -march are specified, check that they are architecturally
10854 compatible; warn if they're not, and prefer the -march ISA flags. */
10855 else if (selected_arch)
10857 if (selected_arch->arch != selected_cpu->arch)
10859 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10860 all_architectures[selected_cpu->arch].name,
10861 selected_arch->name);
10863 aarch64_isa_flags = arch_isa;
10864 explicit_arch = selected_arch->arch;
10865 explicit_tune_core = selected_tune ? selected_tune->ident
10866 : selected_cpu->ident;
10868 else
10870 /* -mcpu but no -march. */
10871 aarch64_isa_flags = cpu_isa;
10872 explicit_tune_core = selected_tune ? selected_tune->ident
10873 : selected_cpu->ident;
10874 gcc_assert (selected_cpu);
10875 selected_arch = &all_architectures[selected_cpu->arch];
10876 explicit_arch = selected_arch->arch;
10879 /* Set the arch as well, as we will need it when outputting
10880 the .arch directive in assembly. */
10881 if (!selected_arch)
10883 gcc_assert (selected_cpu);
10884 selected_arch = &all_architectures[selected_cpu->arch];
10887 if (!selected_tune)
10888 selected_tune = selected_cpu;
10890 #ifndef HAVE_AS_MABI_OPTION
10891 /* The compiler may have been configured with 2.23.* binutils, which does
10892 not have support for ILP32. */
10893 if (TARGET_ILP32)
10894 error ("assembler does not support -mabi=ilp32");
10895 #endif
10897 /* Convert -msve-vector-bits to a VG count. */
10898 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10900 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10901 sorry ("return address signing is only supported for -mabi=lp64");
10903 /* Make sure we properly set up the explicit options. */
10904 if ((aarch64_cpu_string && valid_cpu)
10905 || (aarch64_tune_string && valid_tune))
10906 gcc_assert (explicit_tune_core != aarch64_none);
10908 if ((aarch64_cpu_string && valid_cpu)
10909 || (aarch64_arch_string && valid_arch))
10910 gcc_assert (explicit_arch != aarch64_no_arch);
10912 aarch64_override_options_internal (&global_options);
10914 /* Save these options as the default ones in case we push and pop them later
10915 while processing functions with potential target attributes. */
10916 target_option_default_node = target_option_current_node
10917 = build_target_option_node (&global_options);
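/* Example (added for illustration): with -mcpu=cortex-a57 -mtune=cortex-a72
   the architecture and ISA flags are taken from cortex-a57 while the
   tuning core becomes cortex-a72; with -mcpu=cortex-a57 -march=armv8.2-a
   the code above warns that the two switches conflict and prefers the
   -march ISA flags, as documented in the comments above.  */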
10920 /* Implement targetm.override_options_after_change. */
10922 static void
10923 aarch64_override_options_after_change (void)
10925 aarch64_override_options_after_change_1 (&global_options);
10928 static struct machine_function *
10929 aarch64_init_machine_status (void)
10931 struct machine_function *machine;
10932 machine = ggc_cleared_alloc<machine_function> ();
10933 return machine;
10936 void
10937 aarch64_init_expanders (void)
10939 init_machine_status = aarch64_init_machine_status;
10942 /* A checking mechanism for the implementation of the various code models. */
10943 static void
10944 initialize_aarch64_code_model (struct gcc_options *opts)
10946 if (opts->x_flag_pic)
10948 switch (opts->x_aarch64_cmodel_var)
10950 case AARCH64_CMODEL_TINY:
10951 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10952 break;
10953 case AARCH64_CMODEL_SMALL:
10954 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10955 aarch64_cmodel = (flag_pic == 2
10956 ? AARCH64_CMODEL_SMALL_PIC
10957 : AARCH64_CMODEL_SMALL_SPIC);
10958 #else
10959 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10960 #endif
10961 break;
10962 case AARCH64_CMODEL_LARGE:
10963 sorry ("code model %qs with -f%s", "large",
10964 opts->x_flag_pic > 1 ? "PIC" : "pic");
10965 break;
10966 default:
10967 gcc_unreachable ();
10970 else
10971 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10974 /* Implement TARGET_OPTION_SAVE. */
10976 static void
10977 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10979 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10982 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10983 using the information saved in PTR. */
10985 static void
10986 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
10988 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
10989 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10990 opts->x_explicit_arch = ptr->x_explicit_arch;
10991 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
10992 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
10994 aarch64_override_options_internal (opts);
10997 /* Implement TARGET_OPTION_PRINT. */
10999 static void
11000 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11002 const struct processor *cpu
11003 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11004 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11005 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11006 std::string extension
11007 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11009 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11010 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11011 arch->name, extension.c_str ());
11014 static GTY(()) tree aarch64_previous_fndecl;
11016 void
11017 aarch64_reset_previous_fndecl (void)
11019 aarch64_previous_fndecl = NULL;
11022 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11023 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11024 make sure optab availability predicates are recomputed when necessary. */
11026 void
11027 aarch64_save_restore_target_globals (tree new_tree)
11029 if (TREE_TARGET_GLOBALS (new_tree))
11030 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11031 else if (new_tree == target_option_default_node)
11032 restore_target_globals (&default_target_globals);
11033 else
11034 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11037 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11038 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11039 of the function, if such exists. This function may be called multiple
11040 times on a single function so use aarch64_previous_fndecl to avoid
11041 setting up identical state. */
11043 static void
11044 aarch64_set_current_function (tree fndecl)
11046 if (!fndecl || fndecl == aarch64_previous_fndecl)
11047 return;
11049 tree old_tree = (aarch64_previous_fndecl
11050 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11051 : NULL_TREE);
11053 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11055 /* If current function has no attributes but the previous one did,
11056 use the default node. */
11057 if (!new_tree && old_tree)
11058 new_tree = target_option_default_node;
11060 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11061 the default have been handled by aarch64_save_restore_target_globals from
11062 aarch64_pragma_target_parse. */
11063 if (old_tree == new_tree)
11064 return;
11066 aarch64_previous_fndecl = fndecl;
11068 /* First set the target options. */
11069 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11071 aarch64_save_restore_target_globals (new_tree);
11074 /* Enum describing the various ways we can handle attributes.
11075 In many cases we can reuse the generic option handling machinery. */
11077 enum aarch64_attr_opt_type
11079 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11080 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11081 aarch64_attr_enum, /* Attribute sets an enum variable. */
11082 aarch64_attr_custom /* Attribute requires a custom handling function. */
11085 /* All the information needed to handle a target attribute.
11086 NAME is the name of the attribute.
11087 ATTR_TYPE specifies the type of behavior of the attribute as described
11088 in the definition of enum aarch64_attr_opt_type.
11089 ALLOW_NEG is true if the attribute supports a "no-" form.
11090 HANDLER is the function that takes the attribute string as an argument.
11091 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11092 OPT_NUM is the enum specifying the option that the attribute modifies.
11093 This is needed for attributes that mirror the behavior of a command-line
11094 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11095 aarch64_attr_enum. */
11097 struct aarch64_attribute_info
11099 const char *name;
11100 enum aarch64_attr_opt_type attr_type;
11101 bool allow_neg;
11102 bool (*handler) (const char *);
11103 enum opt_code opt_num;
11106 /* Handle the ARCH_STR argument to the arch= target attribute. */
11108 static bool
11109 aarch64_handle_attr_arch (const char *str)
11111 const struct processor *tmp_arch = NULL;
11112 enum aarch64_parse_opt_result parse_res
11113 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11115 if (parse_res == AARCH64_PARSE_OK)
11117 gcc_assert (tmp_arch);
11118 selected_arch = tmp_arch;
11119 explicit_arch = selected_arch->arch;
11120 return true;
11123 switch (parse_res)
11125 case AARCH64_PARSE_MISSING_ARG:
11126 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11127 break;
11128 case AARCH64_PARSE_INVALID_ARG:
11129 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11130 aarch64_print_hint_for_arch (str);
11131 break;
11132 case AARCH64_PARSE_INVALID_FEATURE:
11133 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11134 break;
11135 default:
11136 gcc_unreachable ();
11139 return false;
11142 /* Handle the argument CPU_STR to the cpu= target attribute. */
11144 static bool
11145 aarch64_handle_attr_cpu (const char *str)
11147 const struct processor *tmp_cpu = NULL;
11148 enum aarch64_parse_opt_result parse_res
11149 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11151 if (parse_res == AARCH64_PARSE_OK)
11153 gcc_assert (tmp_cpu);
11154 selected_tune = tmp_cpu;
11155 explicit_tune_core = selected_tune->ident;
11157 selected_arch = &all_architectures[tmp_cpu->arch];
11158 explicit_arch = selected_arch->arch;
11159 return true;
11162 switch (parse_res)
11164 case AARCH64_PARSE_MISSING_ARG:
11165 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11166 break;
11167 case AARCH64_PARSE_INVALID_ARG:
11168 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11169 aarch64_print_hint_for_core (str);
11170 break;
11171 case AARCH64_PARSE_INVALID_FEATURE:
11172 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11173 break;
11174 default:
11175 gcc_unreachable ();
11178 return false;
11181 /* Handle the argument STR to the tune= target attribute. */
11183 static bool
11184 aarch64_handle_attr_tune (const char *str)
11186 const struct processor *tmp_tune = NULL;
11187 enum aarch64_parse_opt_result parse_res
11188 = aarch64_parse_tune (str, &tmp_tune);
11190 if (parse_res == AARCH64_PARSE_OK)
11192 gcc_assert (tmp_tune);
11193 selected_tune = tmp_tune;
11194 explicit_tune_core = selected_tune->ident;
11195 return true;
11198 switch (parse_res)
11200 case AARCH64_PARSE_INVALID_ARG:
11201 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11202 aarch64_print_hint_for_core (str);
11203 break;
11204 default:
11205 gcc_unreachable ();
11208 return false;
11211 /* Parse an architecture extensions target attribute string specified in STR.
11212 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11213 if successful. Update aarch64_isa_flags to reflect the ISA features
11214 modified. */
11216 static bool
11217 aarch64_handle_attr_isa_flags (char *str)
11219 enum aarch64_parse_opt_result parse_res;
11220 unsigned long isa_flags = aarch64_isa_flags;
11222 /* We allow "+nothing" in the beginning to clear out all architectural
11223 features if the user wants to handpick specific features. */
11224 if (strncmp ("+nothing", str, 8) == 0)
11226 isa_flags = 0;
11227 str += 8;
11230 parse_res = aarch64_parse_extension (str, &isa_flags);
11232 if (parse_res == AARCH64_PARSE_OK)
11234 aarch64_isa_flags = isa_flags;
11235 return true;
11238 switch (parse_res)
11240 case AARCH64_PARSE_MISSING_ARG:
11241 error ("missing value in %<target()%> pragma or attribute");
11242 break;
11244 case AARCH64_PARSE_INVALID_FEATURE:
11245 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11246 break;
11248 default:
11249 gcc_unreachable ();
11252 return false;
11255 /* The target attributes that we support. On top of these we also support just
11256 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11257 handled explicitly in aarch64_process_one_target_attr. */
11259 static const struct aarch64_attribute_info aarch64_attributes[] =
11261 { "general-regs-only", aarch64_attr_mask, false, NULL,
11262 OPT_mgeneral_regs_only },
11263 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11264 OPT_mfix_cortex_a53_835769 },
11265 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11266 OPT_mfix_cortex_a53_843419 },
11267 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11268 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11269 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11270 OPT_momit_leaf_frame_pointer },
11271 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11272 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11273 OPT_march_ },
11274 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11275 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11276 OPT_mtune_ },
11277 { "sign-return-address", aarch64_attr_enum, false, NULL,
11278 OPT_msign_return_address_ },
11279 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11282 /* Parse ARG_STR which contains the definition of one target attribute.
11283 Show appropriate errors if any or return true if the attribute is valid. */
11285 static bool
11286 aarch64_process_one_target_attr (char *arg_str)
11288 bool invert = false;
11290 size_t len = strlen (arg_str);
11292 if (len == 0)
11294 error ("malformed %<target()%> pragma or attribute");
11295 return false;
11298 char *str_to_check = (char *) alloca (len + 1);
11299 strcpy (str_to_check, arg_str);
11301 /* Skip leading whitespace. */
11302 while (*str_to_check == ' ' || *str_to_check == '\t')
11303 str_to_check++;
11305 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11306 It is easier to detect and handle it explicitly here rather than going
11307 through the machinery for the rest of the target attributes in this
11308 function. */
11309 if (*str_to_check == '+')
11310 return aarch64_handle_attr_isa_flags (str_to_check);
11312 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11314 invert = true;
11315 str_to_check += 3;
11317 char *arg = strchr (str_to_check, '=');
11319 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11320 and point ARG to "foo". */
11321 if (arg)
11323 *arg = '\0';
11324 arg++;
11326 const struct aarch64_attribute_info *p_attr;
11327 bool found = false;
11328 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11330 /* If the names don't match up, or the user has given an argument
11331 to an attribute that doesn't accept one, or didn't give an argument
11332 to an attribute that expects one, fail to match. */
11333 if (strcmp (str_to_check, p_attr->name) != 0)
11334 continue;
11336 found = true;
11337 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11338 || p_attr->attr_type == aarch64_attr_enum;
11340 if (attr_need_arg_p ^ (arg != NULL))
11342 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11343 return false;
11346 /* If the name matches but the attribute does not allow "no-" versions
11347 then we can't match. */
11348 if (invert && !p_attr->allow_neg)
11350 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11351 return false;
11354 switch (p_attr->attr_type)
11356 /* Has a custom handler registered.
11357 For example, cpu=, arch=, tune=. */
11358 case aarch64_attr_custom:
11359 gcc_assert (p_attr->handler);
11360 if (!p_attr->handler (arg))
11361 return false;
11362 break;
11364 /* Either set or unset a boolean option. */
11365 case aarch64_attr_bool:
11367 struct cl_decoded_option decoded;
11369 generate_option (p_attr->opt_num, NULL, !invert,
11370 CL_TARGET, &decoded);
11371 aarch64_handle_option (&global_options, &global_options_set,
11372 &decoded, input_location);
11373 break;
11375 /* Set or unset a bit in the target_flags. aarch64_handle_option
11376 should know what mask to apply given the option number. */
11377 case aarch64_attr_mask:
11379 struct cl_decoded_option decoded;
11380 /* We only need to specify the option number.
11381 aarch64_handle_option will know which mask to apply. */
11382 decoded.opt_index = p_attr->opt_num;
11383 decoded.value = !invert;
11384 aarch64_handle_option (&global_options, &global_options_set,
11385 &decoded, input_location);
11386 break;
11388 /* Use the option setting machinery to set an option to an enum. */
11389 case aarch64_attr_enum:
11391 gcc_assert (arg);
11392 bool valid;
11393 int value;
11394 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11395 &value, CL_TARGET);
11396 if (valid)
11398 set_option (&global_options, NULL, p_attr->opt_num, value,
11399 NULL, DK_UNSPECIFIED, input_location,
11400 global_dc);
11402 else
11404 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11406 break;
11408 default:
11409 gcc_unreachable ();
11413 /* If we reached here we either have found an attribute and validated
11414 it or didn't match any. If we matched an attribute but its arguments
11415 were malformed we will have returned false already. */
11416 return found;
11419 /* Count how many times the character C appears in
11420 NULL-terminated string STR. */
11422 static unsigned int
11423 num_occurences_in_str (char c, char *str)
11425 unsigned int res = 0;
11426 while (*str != '\0')
11428 if (*str == c)
11429 res++;
11431 str++;
11434 return res;
11437 /* Parse the tree in ARGS that contains the target attribute information
11438 and update the global target options space. */
11440 bool
11441 aarch64_process_target_attr (tree args)
11443 if (TREE_CODE (args) == TREE_LIST)
11447 tree head = TREE_VALUE (args);
11448 if (head)
11450 if (!aarch64_process_target_attr (head))
11451 return false;
11453 args = TREE_CHAIN (args);
11454 } while (args);
11456 return true;
11459 if (TREE_CODE (args) != STRING_CST)
11461 error ("attribute %<target%> argument not a string");
11462 return false;
11465 size_t len = strlen (TREE_STRING_POINTER (args));
11466 char *str_to_check = (char *) alloca (len + 1);
11467 strcpy (str_to_check, TREE_STRING_POINTER (args));
11469 if (len == 0)
11471 error ("malformed %<target()%> pragma or attribute");
11472 return false;
11475 /* Used to catch empty strings between commas, i.e.
11476 attribute ((target ("attr1,,attr2"))). */
11477 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11479 /* Handle multiple target attributes separated by ','. */
11480 char *token = strtok (str_to_check, ",");
11482 unsigned int num_attrs = 0;
11483 while (token)
11485 num_attrs++;
11486 if (!aarch64_process_one_target_attr (token))
11488 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11489 return false;
11492 token = strtok (NULL, ",");
11495 if (num_attrs != num_commas + 1)
11497 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11498 return false;
11501 return true;
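/* Example (added for illustration): for
   __attribute__ ((target ("arch=armv8.1-a,no-strict-align"))) the string
   is split on ',' above and each token is handled by
   aarch64_process_one_target_attr; the comma count check rejects empty
   entries such as "attr1,,attr2".  "no-strict-align" is accepted because
   the strict-align entry in aarch64_attributes allows a negated form.  */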
11504 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11505 process attribute ((target ("..."))). */
11507 static bool
11508 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11510 struct cl_target_option cur_target;
11511 bool ret;
11512 tree old_optimize;
11513 tree new_target, new_optimize;
11514 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11516 /* If what we're processing is the current pragma string then the
11517 target option node is already stored in target_option_current_node
11518 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11519 having to re-parse the string. This is especially useful to keep
11520 arm_neon.h compile times down since that header contains a lot
11521 of intrinsics enclosed in pragmas. */
11522 if (!existing_target && args == current_target_pragma)
11524 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11525 return true;
11527 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11529 old_optimize = build_optimization_node (&global_options);
11530 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11532 /* If the function changed the optimization levels as well as setting
11533 target options, start with the optimizations specified. */
11534 if (func_optimize && func_optimize != old_optimize)
11535 cl_optimization_restore (&global_options,
11536 TREE_OPTIMIZATION (func_optimize));
11538 /* Save the current target options to restore at the end. */
11539 cl_target_option_save (&cur_target, &global_options);
11541 /* If fndecl already has some target attributes applied to it, unpack
11542 them so that we add this attribute on top of them, rather than
11543 overwriting them. */
11544 if (existing_target)
11546 struct cl_target_option *existing_options
11547 = TREE_TARGET_OPTION (existing_target);
11549 if (existing_options)
11550 cl_target_option_restore (&global_options, existing_options);
11552 else
11553 cl_target_option_restore (&global_options,
11554 TREE_TARGET_OPTION (target_option_current_node));
11556 ret = aarch64_process_target_attr (args);
11558 /* Set up any additional state. */
11559 if (ret)
11561 aarch64_override_options_internal (&global_options);
11562 /* Initialize SIMD builtins if we haven't already.
11563 Set current_target_pragma to NULL for the duration so that
11564 the builtin initialization code doesn't try to tag the functions
11565 being built with the attributes specified by any current pragma, thus
11566 going into an infinite recursion. */
11567 if (TARGET_SIMD)
11569 tree saved_current_target_pragma = current_target_pragma;
11570 current_target_pragma = NULL;
11571 aarch64_init_simd_builtins ();
11572 current_target_pragma = saved_current_target_pragma;
11574 new_target = build_target_option_node (&global_options);
11576 else
11577 new_target = NULL;
11579 new_optimize = build_optimization_node (&global_options);
11581 if (fndecl && ret)
11583 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11585 if (old_optimize != new_optimize)
11586 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11589 cl_target_option_restore (&global_options, &cur_target);
11591 if (old_optimize != new_optimize)
11592 cl_optimization_restore (&global_options,
11593 TREE_OPTIMIZATION (old_optimize));
11594 return ret;
11597 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11598 tri-bool options (yes, no, don't care) and the default value is
11599 DEF, determine whether to reject inlining. */
11601 static bool
11602 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11603 int dont_care, int def)
11605 /* If the callee doesn't care, always allow inlining. */
11606 if (callee == dont_care)
11607 return true;
11609 /* If the caller doesn't care, always allow inlining. */
11610 if (caller == dont_care)
11611 return true;
11613 /* Otherwise, allow inlining if either the callee and caller values
11614 agree, or if the callee is using the default value. */
11615 return (callee == caller || callee == def);
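/* A minimal worked example of the rule above, assuming the usual encoding
   of 0 = no, 1 = yes, 2 = don't care:

     caller = 1, callee = 2, def = 0  -> allow (callee doesn't care)
     caller = 2, callee = 0, def = 1  -> allow (caller doesn't care)
     caller = 0, callee = 0, def = 1  -> allow (values agree)
     caller = 0, callee = 1, def = 1  -> allow (callee uses the default)
     caller = 0, callee = 1, def = 0  -> reject  */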
11618 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11619 to inline CALLEE into CALLER based on target-specific info.
11620 Make sure that the caller and callee have compatible architectural
11621 features. Then go through the other possible target attributes
11622 and see if they can block inlining. Try not to reject always_inline
11623 callees unless they are incompatible architecturally. */
11625 static bool
11626 aarch64_can_inline_p (tree caller, tree callee)
11628 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11629 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11631 struct cl_target_option *caller_opts
11632 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11633 : target_option_default_node);
11635 struct cl_target_option *callee_opts
11636 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11637 : target_option_default_node);
11639 /* Callee's ISA flags should be a subset of the caller's. */
11640 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11641 != callee_opts->x_aarch64_isa_flags)
11642 return false;
11644 /* Allow non-strict-aligned functions to be inlined into
11645 strict-aligned ones. */
11646 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11647 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11648 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11649 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11650 return false;
11652 bool always_inline = lookup_attribute ("always_inline",
11653 DECL_ATTRIBUTES (callee));
11655 /* If the architectural features match up and the callee is always_inline
11656 then the other attributes don't matter. */
11657 if (always_inline)
11658 return true;
11660 if (caller_opts->x_aarch64_cmodel_var
11661 != callee_opts->x_aarch64_cmodel_var)
11662 return false;
11664 if (caller_opts->x_aarch64_tls_dialect
11665 != callee_opts->x_aarch64_tls_dialect)
11666 return false;
11668 /* Honour explicit requests to workaround errata. */
11669 if (!aarch64_tribools_ok_for_inlining_p (
11670 caller_opts->x_aarch64_fix_a53_err835769,
11671 callee_opts->x_aarch64_fix_a53_err835769,
11672 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11673 return false;
11675 if (!aarch64_tribools_ok_for_inlining_p (
11676 caller_opts->x_aarch64_fix_a53_err843419,
11677 callee_opts->x_aarch64_fix_a53_err843419,
11678 2, TARGET_FIX_ERR_A53_843419))
11679 return false;
11681 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11682 caller and callee and they don't match up, reject inlining. */
11683 if (!aarch64_tribools_ok_for_inlining_p (
11684 caller_opts->x_flag_omit_leaf_frame_pointer,
11685 callee_opts->x_flag_omit_leaf_frame_pointer,
11686 2, 1))
11687 return false;
11689 /* If the callee has specific tuning overrides, respect them. */
11690 if (callee_opts->x_aarch64_override_tune_string != NULL
11691 && caller_opts->x_aarch64_override_tune_string == NULL)
11692 return false;
11694 /* If the user specified tuning override strings for the
11695 caller and callee and they don't match up, reject inlining.
11696 We just do a string compare here, we don't analyze the meaning
11697 of the string, as it would be too costly for little gain. */
11698 if (callee_opts->x_aarch64_override_tune_string
11699 && caller_opts->x_aarch64_override_tune_string
11700 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11701 caller_opts->x_aarch64_override_tune_string) != 0))
11702 return false;
11704 return true;
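/* For example (an illustrative sketch, not code from this file), the ISA
   subset check above stops a callee that needs more features than its
   caller from being inlined:

     __attribute__ ((target ("arch=armv8-a+sve")))
     static inline int callee (int *p) { return p[0] + p[1]; }

     int caller (int *p) { return callee (p); }   // compiled without +sve

   Here the callee's ISA flags are not a subset of the caller's, so
   aarch64_can_inline_p returns false even if the callee is marked
   always_inline.  */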
11707 /* Return true if SYMBOL_REF X binds locally. */
11709 static bool
11710 aarch64_symbol_binds_local_p (const_rtx x)
11712 return (SYMBOL_REF_DECL (x)
11713 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11714 : SYMBOL_REF_LOCAL_P (x));
11717 /* Return true if SYMBOL_REF X is thread local */
11718 static bool
11719 aarch64_tls_symbol_p (rtx x)
11721 if (! TARGET_HAVE_TLS)
11722 return false;
11724 if (GET_CODE (x) != SYMBOL_REF)
11725 return false;
11727 return SYMBOL_REF_TLS_MODEL (x) != 0;
11730 /* Classify a TLS symbol into one of the TLS kinds. */
11731 enum aarch64_symbol_type
11732 aarch64_classify_tls_symbol (rtx x)
11734 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11736 switch (tls_kind)
11738 case TLS_MODEL_GLOBAL_DYNAMIC:
11739 case TLS_MODEL_LOCAL_DYNAMIC:
11740 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11742 case TLS_MODEL_INITIAL_EXEC:
11743 switch (aarch64_cmodel)
11745 case AARCH64_CMODEL_TINY:
11746 case AARCH64_CMODEL_TINY_PIC:
11747 return SYMBOL_TINY_TLSIE;
11748 default:
11749 return SYMBOL_SMALL_TLSIE;
11752 case TLS_MODEL_LOCAL_EXEC:
11753 if (aarch64_tls_size == 12)
11754 return SYMBOL_TLSLE12;
11755 else if (aarch64_tls_size == 24)
11756 return SYMBOL_TLSLE24;
11757 else if (aarch64_tls_size == 32)
11758 return SYMBOL_TLSLE32;
11759 else if (aarch64_tls_size == 48)
11760 return SYMBOL_TLSLE48;
11761 else
11762 gcc_unreachable ();
11764 case TLS_MODEL_EMULATED:
11765 case TLS_MODEL_NONE:
11766 return SYMBOL_FORCE_TO_MEM;
11768 default:
11769 gcc_unreachable ();
11773 /* Return the correct method for accessing X + OFFSET, where X is either
11774 a SYMBOL_REF or LABEL_REF. */
11776 enum aarch64_symbol_type
11777 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11779 if (GET_CODE (x) == LABEL_REF)
11781 switch (aarch64_cmodel)
11783 case AARCH64_CMODEL_LARGE:
11784 return SYMBOL_FORCE_TO_MEM;
11786 case AARCH64_CMODEL_TINY_PIC:
11787 case AARCH64_CMODEL_TINY:
11788 return SYMBOL_TINY_ABSOLUTE;
11790 case AARCH64_CMODEL_SMALL_SPIC:
11791 case AARCH64_CMODEL_SMALL_PIC:
11792 case AARCH64_CMODEL_SMALL:
11793 return SYMBOL_SMALL_ABSOLUTE;
11795 default:
11796 gcc_unreachable ();
11800 if (GET_CODE (x) == SYMBOL_REF)
11802 if (aarch64_tls_symbol_p (x))
11803 return aarch64_classify_tls_symbol (x);
11805 switch (aarch64_cmodel)
11807 case AARCH64_CMODEL_TINY:
11808 /* When we retrieve symbol + offset address, we have to make sure
11809 the offset does not cause overflow of the final address. But
11810 we have no way of knowing the address of symbol at compile time
11811 so we can't accurately say if the distance between the PC and
11812 symbol + offset is outside the addressable range of +/-1M in the
11813 TINY code model. So we rely on images not being greater than
11814 1M, cap the offset at 1M, and require anything beyond 1M to be
11815 loaded using an alternative mechanism. Furthermore, if the
11816 symbol is a weak reference to something that isn't known to
11817 resolve to a symbol in this module, then force to memory. */
11818 if ((SYMBOL_REF_WEAK (x)
11819 && !aarch64_symbol_binds_local_p (x))
11820 || !IN_RANGE (offset, -1048575, 1048575))
11821 return SYMBOL_FORCE_TO_MEM;
11822 return SYMBOL_TINY_ABSOLUTE;
11824 case AARCH64_CMODEL_SMALL:
11825 /* Same reasoning as the tiny code model, but the offset cap here is
11826 4G. */
11827 if ((SYMBOL_REF_WEAK (x)
11828 && !aarch64_symbol_binds_local_p (x))
11829 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11830 HOST_WIDE_INT_C (4294967264)))
11831 return SYMBOL_FORCE_TO_MEM;
11832 return SYMBOL_SMALL_ABSOLUTE;
11834 case AARCH64_CMODEL_TINY_PIC:
11835 if (!aarch64_symbol_binds_local_p (x))
11836 return SYMBOL_TINY_GOT;
11837 return SYMBOL_TINY_ABSOLUTE;
11839 case AARCH64_CMODEL_SMALL_SPIC:
11840 case AARCH64_CMODEL_SMALL_PIC:
11841 if (!aarch64_symbol_binds_local_p (x))
11842 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11843 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11844 return SYMBOL_SMALL_ABSOLUTE;
11846 case AARCH64_CMODEL_LARGE:
11847 /* This is alright even in PIC code as the constant
11848 pool reference is always PC relative and within
11849 the same translation unit. */
11850 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11851 return SYMBOL_SMALL_ABSOLUTE;
11852 else
11853 return SYMBOL_FORCE_TO_MEM;
11855 default:
11856 gcc_unreachable ();
11860 /* By default push everything into the constant pool. */
11861 return SYMBOL_FORCE_TO_MEM;
11864 bool
11865 aarch64_constant_address_p (rtx x)
11867 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11870 bool
11871 aarch64_legitimate_pic_operand_p (rtx x)
11873 if (GET_CODE (x) == SYMBOL_REF
11874 || (GET_CODE (x) == CONST
11875 && GET_CODE (XEXP (x, 0)) == PLUS
11876 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11877 return false;
11879 return true;
11882 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11883 that should be rematerialized rather than spilled. */
11885 static bool
11886 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11888 /* Support CSE and rematerialization of common constants. */
11889 if (CONST_INT_P (x)
11890 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11891 || GET_CODE (x) == CONST_VECTOR)
11892 return true;
11894 /* Do not allow vector struct mode constants for Advanced SIMD.
11895 We could support 0 and -1 easily, but they need support in
11896 aarch64-simd.md. */
11897 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11898 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11899 return false;
11901 /* Only accept variable-length vector constants if they can be
11902 handled directly.
11904 ??? It would be possible to handle rematerialization of other
11905 constants via secondary reloads. */
11906 if (vec_flags & VEC_ANY_SVE)
11907 return aarch64_simd_valid_immediate (x, NULL);
11909 if (GET_CODE (x) == HIGH)
11910 x = XEXP (x, 0);
11912 /* Accept polynomial constants that can be calculated by using the
11913 destination of a move as the sole temporary. Constants that
11914 require a second temporary cannot be rematerialized (they can't be
11915 forced to memory and also aren't legitimate constants). */
11916 poly_int64 offset;
11917 if (poly_int_rtx_p (x, &offset))
11918 return aarch64_offset_temporaries (false, offset) <= 1;
11920 /* If an offset is being added to something else, we need to allow the
11921 base to be moved into the destination register, meaning that there
11922 are no free temporaries for the offset. */
11923 x = strip_offset (x, &offset);
11924 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11925 return false;
11927 /* Do not allow const (plus (anchor_symbol, const_int)). */
11928 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11929 return false;
11931 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11932 so spilling them is better than rematerialization. */
11933 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11934 return true;
11936 /* Label references are always constant. */
11937 if (GET_CODE (x) == LABEL_REF)
11938 return true;
11940 return false;
11944 aarch64_load_tp (rtx target)
11946 if (!target
11947 || GET_MODE (target) != Pmode
11948 || !register_operand (target, Pmode))
11949 target = gen_reg_rtx (Pmode);
11951 /* Can return in any reg. */
11952 emit_insn (gen_aarch64_load_tp_hard (target));
11953 return target;
11956 /* On AAPCS systems, this is the "struct __va_list". */
11957 static GTY(()) tree va_list_type;
11959 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11960 Return the type to use as __builtin_va_list.
11962 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11964 struct __va_list
11966 void *__stack;
11967 void *__gr_top;
11968 void *__vr_top;
11969 int __gr_offs;
11970 int __vr_offs;
11971 }; */
11973 static tree
11974 aarch64_build_builtin_va_list (void)
11976 tree va_list_name;
11977 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11979 /* Create the type. */
11980 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11981 /* Give it the required name. */
11982 va_list_name = build_decl (BUILTINS_LOCATION,
11983 TYPE_DECL,
11984 get_identifier ("__va_list"),
11985 va_list_type);
11986 DECL_ARTIFICIAL (va_list_name) = 1;
11987 TYPE_NAME (va_list_type) = va_list_name;
11988 TYPE_STUB_DECL (va_list_type) = va_list_name;
11990 /* Create the fields. */
11991 f_stack = build_decl (BUILTINS_LOCATION,
11992 FIELD_DECL, get_identifier ("__stack"),
11993 ptr_type_node);
11994 f_grtop = build_decl (BUILTINS_LOCATION,
11995 FIELD_DECL, get_identifier ("__gr_top"),
11996 ptr_type_node);
11997 f_vrtop = build_decl (BUILTINS_LOCATION,
11998 FIELD_DECL, get_identifier ("__vr_top"),
11999 ptr_type_node);
12000 f_groff = build_decl (BUILTINS_LOCATION,
12001 FIELD_DECL, get_identifier ("__gr_offs"),
12002 integer_type_node);
12003 f_vroff = build_decl (BUILTINS_LOCATION,
12004 FIELD_DECL, get_identifier ("__vr_offs"),
12005 integer_type_node);
12007 /* Tell tree-stdarg pass about our internal offset fields.
12008 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12009 purposes, to identify whether the code is updating va_list internal
12010 offset fields in an irregular way. */
12011 va_list_gpr_counter_field = f_groff;
12012 va_list_fpr_counter_field = f_vroff;
12014 DECL_ARTIFICIAL (f_stack) = 1;
12015 DECL_ARTIFICIAL (f_grtop) = 1;
12016 DECL_ARTIFICIAL (f_vrtop) = 1;
12017 DECL_ARTIFICIAL (f_groff) = 1;
12018 DECL_ARTIFICIAL (f_vroff) = 1;
12020 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12021 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12022 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12023 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12024 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12026 TYPE_FIELDS (va_list_type) = f_stack;
12027 DECL_CHAIN (f_stack) = f_grtop;
12028 DECL_CHAIN (f_grtop) = f_vrtop;
12029 DECL_CHAIN (f_vrtop) = f_groff;
12030 DECL_CHAIN (f_groff) = f_vroff;
12032 /* Compute its layout. */
12033 layout_type (va_list_type);
12035 return va_list_type;
12038 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12039 static void
12040 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12042 const CUMULATIVE_ARGS *cum;
12043 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12044 tree stack, grtop, vrtop, groff, vroff;
12045 tree t;
12046 int gr_save_area_size = cfun->va_list_gpr_size;
12047 int vr_save_area_size = cfun->va_list_fpr_size;
12048 int vr_offset;
12050 cum = &crtl->args.info;
12051 if (cfun->va_list_gpr_size)
12052 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12053 cfun->va_list_gpr_size);
12054 if (cfun->va_list_fpr_size)
12055 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12056 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12058 if (!TARGET_FLOAT)
12060 gcc_assert (cum->aapcs_nvrn == 0);
12061 vr_save_area_size = 0;
12064 f_stack = TYPE_FIELDS (va_list_type_node);
12065 f_grtop = DECL_CHAIN (f_stack);
12066 f_vrtop = DECL_CHAIN (f_grtop);
12067 f_groff = DECL_CHAIN (f_vrtop);
12068 f_vroff = DECL_CHAIN (f_groff);
12070 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12071 NULL_TREE);
12072 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12073 NULL_TREE);
12074 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12075 NULL_TREE);
12076 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12077 NULL_TREE);
12078 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12079 NULL_TREE);
12081 /* Emit code to initialize STACK, which points to the next varargs stack
12082 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12083 by named arguments. STACK is 8-byte aligned. */
12084 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12085 if (cum->aapcs_stack_size > 0)
12086 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12087 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12088 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12090 /* Emit code to initialize GRTOP, the top of the GR save area.
12091 virtual_incoming_args_rtx should have been 16 byte aligned. */
12092 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12093 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12094 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12096 /* Emit code to initialize VRTOP, the top of the VR save area.
12097 This address is gr_save_area_bytes below GRTOP, rounded
12098 down to the next 16-byte boundary. */
12099 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12100 vr_offset = ROUND_UP (gr_save_area_size,
12101 STACK_BOUNDARY / BITS_PER_UNIT);
12103 if (vr_offset)
12104 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12105 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12106 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12108 /* Emit code to initialize GROFF, the offset from GRTOP of the
12109 next GPR argument. */
12110 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12111 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12112 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12114 /* Likewise emit code to initialize VROFF, the offset from VRTOP
12115 of the next VR argument. */
12116 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12117 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12118 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
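/* As a rough C-level sketch of the effect of the expansion above, using the
   field names from the __va_list definition earlier in this file and
   writing "incoming_args" for virtual_incoming_args_rtx:

     ap.__stack   = incoming_args + cum->aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;  */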
12121 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12123 static tree
12124 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12125 gimple_seq *post_p ATTRIBUTE_UNUSED)
12127 tree addr;
12128 bool indirect_p;
12129 bool is_ha; /* is HFA or HVA. */
12130 bool dw_align; /* double-word align. */
12131 machine_mode ag_mode = VOIDmode;
12132 int nregs;
12133 machine_mode mode;
12135 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12136 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12137 HOST_WIDE_INT size, rsize, adjust, align;
12138 tree t, u, cond1, cond2;
12140 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12141 if (indirect_p)
12142 type = build_pointer_type (type);
12144 mode = TYPE_MODE (type);
12146 f_stack = TYPE_FIELDS (va_list_type_node);
12147 f_grtop = DECL_CHAIN (f_stack);
12148 f_vrtop = DECL_CHAIN (f_grtop);
12149 f_groff = DECL_CHAIN (f_vrtop);
12150 f_vroff = DECL_CHAIN (f_groff);
12152 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12153 f_stack, NULL_TREE);
12154 size = int_size_in_bytes (type);
12155 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12157 dw_align = false;
12158 adjust = 0;
12159 if (aarch64_vfp_is_call_or_return_candidate (mode,
12160 type,
12161 &ag_mode,
12162 &nregs,
12163 &is_ha))
12165 /* No frontends can create types with variable-sized modes, so we
12166 shouldn't be asked to pass or return them. */
12167 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12169 /* TYPE passed in fp/simd registers. */
12170 if (!TARGET_FLOAT)
12171 aarch64_err_no_fpadvsimd (mode);
12173 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12174 unshare_expr (valist), f_vrtop, NULL_TREE);
12175 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12176 unshare_expr (valist), f_vroff, NULL_TREE);
12178 rsize = nregs * UNITS_PER_VREG;
12180 if (is_ha)
12182 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12183 adjust = UNITS_PER_VREG - ag_size;
12185 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12186 && size < UNITS_PER_VREG)
12188 adjust = UNITS_PER_VREG - size;
12191 else
12193 /* TYPE passed in general registers. */
12194 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12195 unshare_expr (valist), f_grtop, NULL_TREE);
12196 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12197 unshare_expr (valist), f_groff, NULL_TREE);
12198 rsize = ROUND_UP (size, UNITS_PER_WORD);
12199 nregs = rsize / UNITS_PER_WORD;
12201 if (align > 8)
12202 dw_align = true;
12204 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12205 && size < UNITS_PER_WORD)
12207 adjust = UNITS_PER_WORD - size;
12211 /* Get a local temporary for the field value. */
12212 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12214 /* Emit code to branch if off >= 0. */
12215 t = build2 (GE_EXPR, boolean_type_node, off,
12216 build_int_cst (TREE_TYPE (off), 0));
12217 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12219 if (dw_align)
12221 /* Emit: offs = (offs + 15) & -16. */
12222 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12223 build_int_cst (TREE_TYPE (off), 15));
12224 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12225 build_int_cst (TREE_TYPE (off), -16));
12226 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12228 else
12229 roundup = NULL;
12231 /* Update ap.__[g|v]r_offs */
12232 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12233 build_int_cst (TREE_TYPE (off), rsize));
12234 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12236 /* String up. */
12237 if (roundup)
12238 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12240 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12241 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12242 build_int_cst (TREE_TYPE (f_off), 0));
12243 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12245 /* String up: make sure the assignment happens before the use. */
12246 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12247 COND_EXPR_ELSE (cond1) = t;
12249 /* Prepare the trees handling the argument that is passed on the stack;
12250 the top-level node will be stored in ON_STACK. */
12251 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12252 if (align > 8)
12254 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12255 t = fold_build_pointer_plus_hwi (arg, 15);
12256 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12257 build_int_cst (TREE_TYPE (t), -16));
12258 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12260 else
12261 roundup = NULL;
12262 /* Advance ap.__stack */
12263 t = fold_build_pointer_plus_hwi (arg, size + 7);
12264 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12265 build_int_cst (TREE_TYPE (t), -8));
12266 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12267 /* String up roundup and advance. */
12268 if (roundup)
12269 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12270 /* String up with arg */
12271 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12272 /* Big-endianness related address adjustment. */
12273 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12274 && size < UNITS_PER_WORD)
12276 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12277 size_int (UNITS_PER_WORD - size));
12278 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12281 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12282 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12284 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12285 t = off;
12286 if (adjust)
12287 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12288 build_int_cst (TREE_TYPE (off), adjust));
12290 t = fold_convert (sizetype, t);
12291 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12293 if (is_ha)
12295 /* type ha; // treat as "struct {ftype field[n];}"
12296 ... [computing offs]
12297 for (i = 0; i <nregs; ++i, offs += 16)
12298 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12299 return ha; */
12300 int i;
12301 tree tmp_ha, field_t, field_ptr_t;
12303 /* Declare a local variable. */
12304 tmp_ha = create_tmp_var_raw (type, "ha");
12305 gimple_add_tmp_var (tmp_ha);
12307 /* Establish the base type. */
12308 switch (ag_mode)
12310 case E_SFmode:
12311 field_t = float_type_node;
12312 field_ptr_t = float_ptr_type_node;
12313 break;
12314 case E_DFmode:
12315 field_t = double_type_node;
12316 field_ptr_t = double_ptr_type_node;
12317 break;
12318 case E_TFmode:
12319 field_t = long_double_type_node;
12320 field_ptr_t = long_double_ptr_type_node;
12321 break;
12322 case E_HFmode:
12323 field_t = aarch64_fp16_type_node;
12324 field_ptr_t = aarch64_fp16_ptr_type_node;
12325 break;
12326 case E_V2SImode:
12327 case E_V4SImode:
12329 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12330 field_t = build_vector_type_for_mode (innertype, ag_mode);
12331 field_ptr_t = build_pointer_type (field_t);
12333 break;
12334 default:
12335 gcc_assert (0);
12338 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12339 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12340 addr = t;
12341 t = fold_convert (field_ptr_t, addr);
12342 t = build2 (MODIFY_EXPR, field_t,
12343 build1 (INDIRECT_REF, field_t, tmp_ha),
12344 build1 (INDIRECT_REF, field_t, t));
12346 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12347 for (i = 1; i < nregs; ++i)
12349 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12350 u = fold_convert (field_ptr_t, addr);
12351 u = build2 (MODIFY_EXPR, field_t,
12352 build2 (MEM_REF, field_t, tmp_ha,
12353 build_int_cst (field_ptr_t,
12354 (i *
12355 int_size_in_bytes (field_t)))),
12356 build1 (INDIRECT_REF, field_t, u));
12357 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12360 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12361 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12364 COND_EXPR_ELSE (cond2) = t;
12365 addr = fold_convert (build_pointer_type (type), cond1);
12366 addr = build_va_arg_indirect_ref (addr);
12368 if (indirect_p)
12369 addr = build_va_arg_indirect_ref (addr);
12371 return addr;
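/* The tree built above corresponds roughly to the AAPCS64 va_arg algorithm.
   As a hedged pseudo-C sketch for an argument that is a general-register
   candidate (the SIMD/FP case uses the __vr_* fields instead, and the
   homogeneous-aggregate case additionally copies the fields one by one
   into a local temporary):

     offs = ap.__gr_offs;
     if (offs >= 0)                    // register save area already used up
       addr = on_stack;
     else
       {
         if (alignof (type) > 8)
           offs = (offs + 15) & -16;
         ap.__gr_offs = offs + rsize;
         if (ap.__gr_offs > 0)         // this argument overflows the area
           addr = on_stack;
         else
           addr = ap.__gr_top + offs + adjust;
       }  */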
12374 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12376 static void
12377 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12378 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12379 int no_rtl)
12381 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12382 CUMULATIVE_ARGS local_cum;
12383 int gr_saved = cfun->va_list_gpr_size;
12384 int vr_saved = cfun->va_list_fpr_size;
12386 /* The caller has advanced CUM up to, but not beyond, the last named
12387 argument. Advance a local copy of CUM past the last "real" named
12388 argument, to find out how many registers are left over. */
12389 local_cum = *cum;
12390 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12392 /* Find out how many registers we need to save.
12393 Honor the tree-stdarg analysis results. */
12394 if (cfun->va_list_gpr_size)
12395 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12396 cfun->va_list_gpr_size / UNITS_PER_WORD);
12397 if (cfun->va_list_fpr_size)
12398 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12399 cfun->va_list_fpr_size / UNITS_PER_VREG);
12401 if (!TARGET_FLOAT)
12403 gcc_assert (local_cum.aapcs_nvrn == 0);
12404 vr_saved = 0;
12407 if (!no_rtl)
12409 if (gr_saved > 0)
12411 rtx ptr, mem;
12413 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12414 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12415 - gr_saved * UNITS_PER_WORD);
12416 mem = gen_frame_mem (BLKmode, ptr);
12417 set_mem_alias_set (mem, get_varargs_alias_set ());
12419 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12420 mem, gr_saved);
12422 if (vr_saved > 0)
12424 /* We can't use move_block_from_reg, because it will use
12425 the wrong mode, storing D regs only. */
12426 machine_mode mode = TImode;
12427 int off, i, vr_start;
12429 /* Set OFF to the offset from virtual_incoming_args_rtx of
12430 the first vector register. The VR save area lies below
12431 the GR one, and is aligned to 16 bytes. */
12432 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12433 STACK_BOUNDARY / BITS_PER_UNIT);
12434 off -= vr_saved * UNITS_PER_VREG;
12436 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12437 for (i = 0; i < vr_saved; ++i)
12439 rtx ptr, mem;
12441 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12442 mem = gen_frame_mem (mode, ptr);
12443 set_mem_alias_set (mem, get_varargs_alias_set ());
12444 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12445 off += UNITS_PER_VREG;
12450 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12451 any complication of having crtl->args.pretend_args_size changed. */
12452 cfun->machine->frame.saved_varargs_size
12453 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12454 STACK_BOUNDARY / BITS_PER_UNIT)
12455 + vr_saved * UNITS_PER_VREG);
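/* Illustrative layout of the save area established above, relative to
   virtual_incoming_args_rtx and growing towards lower addresses:

        named stack arguments
      ------------------------------ <- virtual_incoming_args_rtx (__gr_top)
        GR save area: gr_saved * 8 bytes (x registers)
      ------------------------------ <- 16-byte aligned boundary (__vr_top)
        VR save area: vr_saved * 16 bytes (q registers, saved as TImode)
      ------------------------------

   cfun->machine->frame.saved_varargs_size records the total size of the
   two save areas shown here.  */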
12458 static void
12459 aarch64_conditional_register_usage (void)
12461 int i;
12462 if (!TARGET_FLOAT)
12464 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12466 fixed_regs[i] = 1;
12467 call_used_regs[i] = 1;
12470 if (!TARGET_SVE)
12471 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12473 fixed_regs[i] = 1;
12474 call_used_regs[i] = 1;
12477 /* When tracking speculation, we need a couple of call-clobbered registers
12478 to track the speculation state. It would be nice to just use
12479 IP0 and IP1, but currently there are numerous places that just
12480 assume these registers are free for other uses (eg pointer
12481 authentication). */
12482 if (aarch64_track_speculation)
12484 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
12485 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
12486 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12487 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12491 /* Walk down the type tree of TYPE counting consecutive base elements.
12492 If *MODEP is VOIDmode, then set it to the first valid floating point
12493 type. If a non-floating point type is found, or if a floating point
12494 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12495 otherwise return the count in the sub-tree. */
12496 static int
12497 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12499 machine_mode mode;
12500 HOST_WIDE_INT size;
12502 switch (TREE_CODE (type))
12504 case REAL_TYPE:
12505 mode = TYPE_MODE (type);
12506 if (mode != DFmode && mode != SFmode
12507 && mode != TFmode && mode != HFmode)
12508 return -1;
12510 if (*modep == VOIDmode)
12511 *modep = mode;
12513 if (*modep == mode)
12514 return 1;
12516 break;
12518 case COMPLEX_TYPE:
12519 mode = TYPE_MODE (TREE_TYPE (type));
12520 if (mode != DFmode && mode != SFmode
12521 && mode != TFmode && mode != HFmode)
12522 return -1;
12524 if (*modep == VOIDmode)
12525 *modep = mode;
12527 if (*modep == mode)
12528 return 2;
12530 break;
12532 case VECTOR_TYPE:
12533 /* Use V2SImode and V4SImode as representatives of all 64-bit
12534 and 128-bit vector types. */
12535 size = int_size_in_bytes (type);
12536 switch (size)
12538 case 8:
12539 mode = V2SImode;
12540 break;
12541 case 16:
12542 mode = V4SImode;
12543 break;
12544 default:
12545 return -1;
12548 if (*modep == VOIDmode)
12549 *modep = mode;
12551 /* Vector modes are considered to be opaque: two vectors are
12552 equivalent for the purposes of being homogeneous aggregates
12553 if they are the same size. */
12554 if (*modep == mode)
12555 return 1;
12557 break;
12559 case ARRAY_TYPE:
12561 int count;
12562 tree index = TYPE_DOMAIN (type);
12564 /* Can't handle incomplete types nor sizes that are not
12565 fixed. */
12566 if (!COMPLETE_TYPE_P (type)
12567 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12568 return -1;
12570 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12571 if (count == -1
12572 || !index
12573 || !TYPE_MAX_VALUE (index)
12574 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12575 || !TYPE_MIN_VALUE (index)
12576 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12577 || count < 0)
12578 return -1;
12580 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12581 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12583 /* There must be no padding. */
12584 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12585 count * GET_MODE_BITSIZE (*modep)))
12586 return -1;
12588 return count;
12591 case RECORD_TYPE:
12593 int count = 0;
12594 int sub_count;
12595 tree field;
12597 /* Can't handle incomplete types nor sizes that are not
12598 fixed. */
12599 if (!COMPLETE_TYPE_P (type)
12600 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12601 return -1;
12603 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12605 if (TREE_CODE (field) != FIELD_DECL)
12606 continue;
12608 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12609 if (sub_count < 0)
12610 return -1;
12611 count += sub_count;
12614 /* There must be no padding. */
12615 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12616 count * GET_MODE_BITSIZE (*modep)))
12617 return -1;
12619 return count;
12622 case UNION_TYPE:
12623 case QUAL_UNION_TYPE:
12625 /* These aren't very interesting except in a degenerate case. */
12626 int count = 0;
12627 int sub_count;
12628 tree field;
12630 /* Can't handle incomplete types nor sizes that are not
12631 fixed. */
12632 if (!COMPLETE_TYPE_P (type)
12633 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12634 return -1;
12636 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12638 if (TREE_CODE (field) != FIELD_DECL)
12639 continue;
12641 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12642 if (sub_count < 0)
12643 return -1;
12644 count = count > sub_count ? count : sub_count;
12647 /* There must be no padding. */
12648 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12649 count * GET_MODE_BITSIZE (*modep)))
12650 return -1;
12652 return count;
12655 default:
12656 break;
12659 return -1;
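/* A few illustrative examples (sketches, not code from this file) of how
   the walk above classifies types:

     typedef float v4sf __attribute__ ((vector_size (16)));

     struct hfa { double x, y, z; };     // returns 3, *modep == DFmode
     struct hva { v4sf a, b; };          // returns 2, *modep == V4SImode
     struct mix { float f; double d; };  // returns -1, base types differ  */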
12662 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12663 type as described in AAPCS64 \S 4.1.2.
12665 See the comment above aarch64_composite_type_p for the notes on MODE. */
12667 static bool
12668 aarch64_short_vector_p (const_tree type,
12669 machine_mode mode)
12671 poly_int64 size = -1;
12673 if (type && TREE_CODE (type) == VECTOR_TYPE)
12674 size = int_size_in_bytes (type);
12675 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12676 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12677 size = GET_MODE_SIZE (mode);
12679 return known_eq (size, 8) || known_eq (size, 16);
12682 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12683 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12684 array types. The C99 floating-point complex types are also considered
12685 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12686 types, which are GCC extensions and out of the scope of AAPCS64, are
12687 treated as composite types here as well.
12689 Note that MODE itself is not sufficient in determining whether a type
12690 is such a composite type or not. This is because
12691 stor-layout.c:compute_record_mode may have already changed the MODE
12692 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12693 structure with only one field may have its MODE set to the mode of the
12694 field. Also an integer mode whose size matches the size of the
12695 RECORD_TYPE type may be used to substitute the original mode
12696 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12697 solely relied on. */
12699 static bool
12700 aarch64_composite_type_p (const_tree type,
12701 machine_mode mode)
12703 if (aarch64_short_vector_p (type, mode))
12704 return false;
12706 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12707 return true;
12709 if (mode == BLKmode
12710 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12711 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12712 return true;
12714 return false;
12717 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12718 shall be passed or returned in simd/fp register(s) (providing these
12719 parameter passing registers are available).
12721 Upon successful return, *COUNT returns the number of needed registers,
12722 *BASE_MODE returns the mode of the individual register and when IS_HAF
12723 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12724 floating-point aggregate or a homogeneous short-vector aggregate. */
12726 static bool
12727 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12728 const_tree type,
12729 machine_mode *base_mode,
12730 int *count,
12731 bool *is_ha)
12733 machine_mode new_mode = VOIDmode;
12734 bool composite_p = aarch64_composite_type_p (type, mode);
12736 if (is_ha != NULL) *is_ha = false;
12738 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12739 || aarch64_short_vector_p (type, mode))
12741 *count = 1;
12742 new_mode = mode;
12744 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12746 if (is_ha != NULL) *is_ha = true;
12747 *count = 2;
12748 new_mode = GET_MODE_INNER (mode);
12750 else if (type && composite_p)
12752 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12754 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12756 if (is_ha != NULL) *is_ha = true;
12757 *count = ag_count;
12759 else
12760 return false;
12762 else
12763 return false;
12765 *base_mode = new_mode;
12766 return true;
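/* Illustrative outcomes of the candidate check above (a sketch, assuming
   the FP/SIMD argument registers are available):

     double                       -> true, *count = 1, *base_mode = DFmode
     _Complex float               -> true, *count = 2, *base_mode = SFmode, HA
     struct { double d[4]; }      -> true, *count = 4, *base_mode = DFmode, HA
     struct { double d[5]; }      -> false (more than HA_MAX_NUM_FLDS members)
     struct { float f; int i; }   -> false (not a homogeneous FP aggregate)  */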
12769 /* Implement TARGET_STRUCT_VALUE_RTX. */
12771 static rtx
12772 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12773 int incoming ATTRIBUTE_UNUSED)
12775 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12778 /* Implements target hook vector_mode_supported_p. */
12779 static bool
12780 aarch64_vector_mode_supported_p (machine_mode mode)
12782 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12783 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12786 /* Return appropriate SIMD container
12787 for MODE within a vector of WIDTH bits. */
12788 static machine_mode
12789 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12791 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12792 switch (mode)
12794 case E_DFmode:
12795 return VNx2DFmode;
12796 case E_SFmode:
12797 return VNx4SFmode;
12798 case E_HFmode:
12799 return VNx8HFmode;
12800 case E_DImode:
12801 return VNx2DImode;
12802 case E_SImode:
12803 return VNx4SImode;
12804 case E_HImode:
12805 return VNx8HImode;
12806 case E_QImode:
12807 return VNx16QImode;
12808 default:
12809 return word_mode;
12812 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12813 if (TARGET_SIMD)
12815 if (known_eq (width, 128))
12816 switch (mode)
12818 case E_DFmode:
12819 return V2DFmode;
12820 case E_SFmode:
12821 return V4SFmode;
12822 case E_HFmode:
12823 return V8HFmode;
12824 case E_SImode:
12825 return V4SImode;
12826 case E_HImode:
12827 return V8HImode;
12828 case E_QImode:
12829 return V16QImode;
12830 case E_DImode:
12831 return V2DImode;
12832 default:
12833 break;
12835 else
12836 switch (mode)
12838 case E_SFmode:
12839 return V2SFmode;
12840 case E_HFmode:
12841 return V4HFmode;
12842 case E_SImode:
12843 return V2SImode;
12844 case E_HImode:
12845 return V4HImode;
12846 case E_QImode:
12847 return V8QImode;
12848 default:
12849 break;
12852 return word_mode;
12855 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12856 static machine_mode
12857 aarch64_preferred_simd_mode (scalar_mode mode)
12859 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12860 return aarch64_simd_container_mode (mode, bits);
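/* Some illustrative results of the container selection above:

     aarch64_simd_container_mode (SFmode, 128) -> V4SFmode  (with +simd)
     aarch64_simd_container_mode (HImode, 64)  -> V4HImode  (with +simd)
     aarch64_simd_container_mode (SFmode, BITS_PER_SVE_VECTOR)
                                               -> VNx4SFmode (with +sve)

   so aarch64_preferred_simd_mode (SFmode) is VNx4SFmode when SVE is enabled
   and V4SFmode otherwise (falling back to word_mode without SIMD).  */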
12863 /* Return a list of possible vector sizes for the vectorizer
12864 to iterate over. */
12865 static void
12866 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12868 if (TARGET_SVE)
12869 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12870 sizes->safe_push (16);
12871 sizes->safe_push (8);
12874 /* Implement TARGET_MANGLE_TYPE. */
12876 static const char *
12877 aarch64_mangle_type (const_tree type)
12879 /* The AArch64 ABI documents say that "__va_list" has to be
12880 mangled as if it were in the "std" namespace.
12881 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12882 return "St9__va_list";
12884 /* Half-precision float. */
12885 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12886 return "Dh";
12888 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12889 builtin types. */
12890 if (TYPE_NAME (type) != NULL)
12891 return aarch64_mangle_builtin_type (type);
12893 /* Use the default mangling. */
12894 return NULL;
12897 /* Find the first rtx_insn before insn that will generate an assembly
12898 instruction. */
12900 static rtx_insn *
12901 aarch64_prev_real_insn (rtx_insn *insn)
12903 if (!insn)
12904 return NULL;
12908 insn = prev_real_insn (insn);
12910 while (insn && recog_memoized (insn) < 0);
12912 return insn;
12915 static bool
12916 is_madd_op (enum attr_type t1)
12918 unsigned int i;
12919 /* A number of these may be AArch32 only. */
12920 enum attr_type mlatypes[] = {
12921 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12922 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12923 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12926 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12928 if (t1 == mlatypes[i])
12929 return true;
12932 return false;
12935 /* Check if there is a register dependency between a load and the insn
12936 for which we hold recog_data. */
12938 static bool
12939 dep_between_memop_and_curr (rtx memop)
12941 rtx load_reg;
12942 int opno;
12944 gcc_assert (GET_CODE (memop) == SET);
12946 if (!REG_P (SET_DEST (memop)))
12947 return false;
12949 load_reg = SET_DEST (memop);
12950 for (opno = 1; opno < recog_data.n_operands; opno++)
12952 rtx operand = recog_data.operand[opno];
12953 if (REG_P (operand)
12954 && reg_overlap_mentioned_p (load_reg, operand))
12955 return true;
12958 return false;
12962 /* When working around the Cortex-A53 erratum 835769,
12963 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12964 instruction and has a preceding memory instruction such that a NOP
12965 should be inserted between them. */
12967 bool
12968 aarch64_madd_needs_nop (rtx_insn* insn)
12970 enum attr_type attr_type;
12971 rtx_insn *prev;
12972 rtx body;
12974 if (!TARGET_FIX_ERR_A53_835769)
12975 return false;
12977 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12978 return false;
12980 attr_type = get_attr_type (insn);
12981 if (!is_madd_op (attr_type))
12982 return false;
12984 prev = aarch64_prev_real_insn (insn);
12985 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12986 Restore recog state to INSN to avoid state corruption. */
12987 extract_constrain_insn_cached (insn);
12989 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12990 return false;
12992 body = single_set (prev);
12994 /* If the previous insn is a memory op and there is no dependency between
12995 it and the DImode madd, emit a NOP between them. If body is NULL then we
12996 have a complex memory operation, probably a load/store pair.
12997 Be conservative for now and emit a NOP. */
12998 if (GET_MODE (recog_data.operand[0]) == DImode
12999 && (!body || !dep_between_memop_and_curr (body)))
13000 return true;
13002 return false;
13007 /* Implement FINAL_PRESCAN_INSN. */
13009 void
13010 aarch64_final_prescan_insn (rtx_insn *insn)
13012 if (aarch64_madd_needs_nop (insn))
13013 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
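/* As an illustrative sketch of the workaround: for a sequence such as

     ldr  x1, [x2]
     madd x0, x3, x4, x0

   compiled with -mfix-cortex-a53-835769, the madd is a 64-bit
   multiply-accumulate with no register dependency on the preceding memory
   operation, so aarch64_madd_needs_nop returns true and the output becomes

     ldr  x1, [x2]
     nop // between mem op and mult-accumulate
     madd x0, x3, x4, x0  */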
13017 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13018 instruction. */
13020 bool
13021 aarch64_sve_index_immediate_p (rtx base_or_step)
13023 return (CONST_INT_P (base_or_step)
13024 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13027 /* Return true if X is a valid immediate for the SVE ADD and SUB
13028 instructions. Negate X first if NEGATE_P is true. */
13030 bool
13031 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13033 rtx elt;
13035 if (!const_vec_duplicate_p (x, &elt)
13036 || !CONST_INT_P (elt))
13037 return false;
13039 HOST_WIDE_INT val = INTVAL (elt);
13040 if (negate_p)
13041 val = -val;
13042 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13044 if (val & 0xff)
13045 return IN_RANGE (val, 0, 0xff);
13046 return IN_RANGE (val, 0, 0xff00);
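/* In other words (an illustrative summary, where "dup N" stands for a
   const_vec_duplicate of N): after optional negation and truncation to the
   element width, the value must either be in the range 0-255 or be a
   multiple of 256 in the range 0-65280, for example:

     dup 17,     negate_p = false  -> true   (17)
     dup 0x1200, negate_p = false  -> true   (18 << 8)
     dup 0x101,  negate_p = false  -> false  (low byte and high byte both set)
     dup -1,     negate_p = true   -> true   (negated to 1)  */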
13049 /* Return true if X is a valid immediate operand for an SVE logical
13050 instruction such as AND. */
13052 bool
13053 aarch64_sve_bitmask_immediate_p (rtx x)
13055 rtx elt;
13057 return (const_vec_duplicate_p (x, &elt)
13058 && CONST_INT_P (elt)
13059 && aarch64_bitmask_imm (INTVAL (elt),
13060 GET_MODE_INNER (GET_MODE (x))));
13063 /* Return true if X is a valid immediate for the SVE DUP and CPY
13064 instructions. */
13066 bool
13067 aarch64_sve_dup_immediate_p (rtx x)
13069 rtx elt;
13071 if (!const_vec_duplicate_p (x, &elt)
13072 || !CONST_INT_P (elt))
13073 return false;
13075 HOST_WIDE_INT val = INTVAL (elt);
13076 if (val & 0xff)
13077 return IN_RANGE (val, -0x80, 0x7f);
13078 return IN_RANGE (val, -0x8000, 0x7f00);
13081 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13082 SIGNED_P says whether the operand is signed rather than unsigned. */
13084 bool
13085 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13087 rtx elt;
13089 return (const_vec_duplicate_p (x, &elt)
13090 && CONST_INT_P (elt)
13091 && (signed_p
13092 ? IN_RANGE (INTVAL (elt), -16, 15)
13093 : IN_RANGE (INTVAL (elt), 0, 127)));
13096 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13097 instruction. Negate X first if NEGATE_P is true. */
13099 bool
13100 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13102 rtx elt;
13103 REAL_VALUE_TYPE r;
13105 if (!const_vec_duplicate_p (x, &elt)
13106 || GET_CODE (elt) != CONST_DOUBLE)
13107 return false;
13109 r = *CONST_DOUBLE_REAL_VALUE (elt);
13111 if (negate_p)
13112 r = real_value_negate (&r);
13114 if (real_equal (&r, &dconst1))
13115 return true;
13116 if (real_equal (&r, &dconsthalf))
13117 return true;
13118 return false;
13121 /* Return true if X is a valid immediate operand for an SVE FMUL
13122 instruction. */
13124 bool
13125 aarch64_sve_float_mul_immediate_p (rtx x)
13127 rtx elt;
13129 /* GCC will never generate a multiply with an immediate of 2, so there is no
13130 point testing for it (even though it is a valid constant). */
13131 return (const_vec_duplicate_p (x, &elt)
13132 && GET_CODE (elt) == CONST_DOUBLE
13133 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13136 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13137 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13138 is nonnull, use it to describe valid immediates. */
13139 static bool
13140 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13141 simd_immediate_info *info,
13142 enum simd_immediate_check which,
13143 simd_immediate_info::insn_type insn)
13145 /* Try a 4-byte immediate with LSL. */
13146 for (unsigned int shift = 0; shift < 32; shift += 8)
13147 if ((val32 & (0xff << shift)) == val32)
13149 if (info)
13150 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13151 simd_immediate_info::LSL, shift);
13152 return true;
13155 /* Try a 2-byte immediate with LSL. */
13156 unsigned int imm16 = val32 & 0xffff;
13157 if (imm16 == (val32 >> 16))
13158 for (unsigned int shift = 0; shift < 16; shift += 8)
13159 if ((imm16 & (0xff << shift)) == imm16)
13161 if (info)
13162 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13163 simd_immediate_info::LSL, shift);
13164 return true;
13167 /* Try a 4-byte immediate with MSL, except for cases that MVN
13168 can handle. */
13169 if (which == AARCH64_CHECK_MOV)
13170 for (unsigned int shift = 8; shift < 24; shift += 8)
13172 unsigned int low = (1 << shift) - 1;
13173 if (((val32 & (0xff << shift)) | low) == val32)
13175 if (info)
13176 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13177 simd_immediate_info::MSL, shift);
13178 return true;
13182 return false;
13185 /* Return true if replicating VAL64 is a valid immediate for the
13186 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13187 use it to describe valid immediates. */
13188 static bool
13189 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13190 simd_immediate_info *info,
13191 enum simd_immediate_check which)
13193 unsigned int val32 = val64 & 0xffffffff;
13194 unsigned int val16 = val64 & 0xffff;
13195 unsigned int val8 = val64 & 0xff;
13197 if (val32 == (val64 >> 32))
13199 if ((which & AARCH64_CHECK_ORR) != 0
13200 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13201 simd_immediate_info::MOV))
13202 return true;
13204 if ((which & AARCH64_CHECK_BIC) != 0
13205 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13206 simd_immediate_info::MVN))
13207 return true;
13209 /* Try using a replicated byte. */
13210 if (which == AARCH64_CHECK_MOV
13211 && val16 == (val32 >> 16)
13212 && val8 == (val16 >> 8))
13214 if (info)
13215 *info = simd_immediate_info (QImode, val8);
13216 return true;
13220 /* Try using a bit-to-bytemask. */
13221 if (which == AARCH64_CHECK_MOV)
13223 unsigned int i;
13224 for (i = 0; i < 64; i += 8)
13226 unsigned char byte = (val64 >> i) & 0xff;
13227 if (byte != 0 && byte != 0xff)
13228 break;
13230 if (i == 64)
13232 if (info)
13233 *info = simd_immediate_info (DImode, val64);
13234 return true;
13237 return false;
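/* A few illustrative 64-bit replicated values and how the checks above
   classify them for AARCH64_CHECK_MOV:

     0x0000005600000056  -> valid: SImode 0x56 with LSL #0
     0x0012ffff0012ffff  -> valid: SImode 0x12 with MSL #16
     0x4242424242424242  -> valid: replicated QImode byte 0x42
     0xff0000ff00ffff00  -> valid: DImode "bit-to-bytemask" immediate
     0x0123456789abcdef  -> not a valid Advanced SIMD immediate  */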
13240 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13241 instruction. If INFO is nonnull, use it to describe valid immediates. */
13243 static bool
13244 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13245 simd_immediate_info *info)
13247 scalar_int_mode mode = DImode;
13248 unsigned int val32 = val64 & 0xffffffff;
13249 if (val32 == (val64 >> 32))
13251 mode = SImode;
13252 unsigned int val16 = val32 & 0xffff;
13253 if (val16 == (val32 >> 16))
13255 mode = HImode;
13256 unsigned int val8 = val16 & 0xff;
13257 if (val8 == (val16 >> 8))
13258 mode = QImode;
13261 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13262 if (IN_RANGE (val, -0x80, 0x7f))
13264 /* DUP with no shift. */
13265 if (info)
13266 *info = simd_immediate_info (mode, val);
13267 return true;
13269 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13271 /* DUP with LSL #8. */
13272 if (info)
13273 *info = simd_immediate_info (mode, val);
13274 return true;
13276 if (aarch64_bitmask_imm (val64, mode))
13278 /* DUPM. */
13279 if (info)
13280 *info = simd_immediate_info (mode, val);
13281 return true;
13283 return false;
13286 /* Return true if OP is a valid SIMD immediate for the operation
13287 described by WHICH. If INFO is nonnull, use it to describe valid
13288 immediates. */
13289 bool
13290 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13291 enum simd_immediate_check which)
13293 machine_mode mode = GET_MODE (op);
13294 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13295 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13296 return false;
13298 scalar_mode elt_mode = GET_MODE_INNER (mode);
13299 rtx base, step;
13300 unsigned int n_elts;
13301 if (GET_CODE (op) == CONST_VECTOR
13302 && CONST_VECTOR_DUPLICATE_P (op))
13303 n_elts = CONST_VECTOR_NPATTERNS (op);
13304 else if ((vec_flags & VEC_SVE_DATA)
13305 && const_vec_series_p (op, &base, &step))
13307 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13308 if (!aarch64_sve_index_immediate_p (base)
13309 || !aarch64_sve_index_immediate_p (step))
13310 return false;
13312 if (info)
13313 *info = simd_immediate_info (elt_mode, base, step);
13314 return true;
13316 else if (GET_CODE (op) == CONST_VECTOR
13317 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13318 /* N_ELTS set above. */;
13319 else
13320 return false;
13322 /* Handle PFALSE and PTRUE. */
13323 if (vec_flags & VEC_SVE_PRED)
13324 return (op == CONST0_RTX (mode)
13325 || op == CONSTM1_RTX (mode));
13327 scalar_float_mode elt_float_mode;
13328 if (n_elts == 1
13329 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13331 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13332 if (aarch64_float_const_zero_rtx_p (elt)
13333 || aarch64_float_const_representable_p (elt))
13335 if (info)
13336 *info = simd_immediate_info (elt_float_mode, elt);
13337 return true;
13341 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13342 if (elt_size > 8)
13343 return false;
13345 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13347 /* Expand the vector constant out into a byte vector, with the least
13348 significant byte of the register first. */
13349 auto_vec<unsigned char, 16> bytes;
13350 bytes.reserve (n_elts * elt_size);
13351 for (unsigned int i = 0; i < n_elts; i++)
13353 /* The vector is provided in gcc endian-neutral fashion.
13354 For aarch64_be Advanced SIMD, it must be laid out in the vector
13355 register in reverse order. */
13356 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13357 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13359 if (elt_mode != elt_int_mode)
13360 elt = gen_lowpart (elt_int_mode, elt);
13362 if (!CONST_INT_P (elt))
13363 return false;
13365 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13366 for (unsigned int byte = 0; byte < elt_size; byte++)
13368 bytes.quick_push (elt_val & 0xff);
13369 elt_val >>= BITS_PER_UNIT;
13373 /* The immediate must repeat every eight bytes. */
13374 unsigned int nbytes = bytes.length ();
13375 for (unsigned i = 8; i < nbytes; ++i)
13376 if (bytes[i] != bytes[i - 8])
13377 return false;
13379 /* Get the repeating 8-byte value as an integer. No endian correction
13380 is needed here because bytes is already in lsb-first order. */
13381 unsigned HOST_WIDE_INT val64 = 0;
13382 for (unsigned int i = 0; i < 8; i++)
13383 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13384 << (i * BITS_PER_UNIT));
13386 if (vec_flags & VEC_SVE_DATA)
13387 return aarch64_sve_valid_immediate (val64, info);
13388 else
13389 return aarch64_advsimd_valid_immediate (val64, info, which);
13392 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13393 has a step in the range of INDEX. Return the index expression if so,
13394 otherwise return null. */
13396 aarch64_check_zero_based_sve_index_immediate (rtx x)
13398 rtx base, step;
13399 if (const_vec_series_p (x, &base, &step)
13400 && base == const0_rtx
13401 && aarch64_sve_index_immediate_p (step))
13402 return step;
13403 return NULL_RTX;
13406 /* Check whether immediate shift constants are within range. */
13407 bool
13408 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13410 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13411 if (left)
13412 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13413 else
13414 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13417 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13418 operation of width WIDTH at bit position POS. */
13421 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13423 gcc_assert (CONST_INT_P (width));
13424 gcc_assert (CONST_INT_P (pos));
13426 unsigned HOST_WIDE_INT mask
13427 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13428 return GEN_INT (mask << UINTVAL (pos));
13431 bool
13432 aarch64_mov_operand_p (rtx x, machine_mode mode)
13434 if (GET_CODE (x) == HIGH
13435 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13436 return true;
13438 if (CONST_INT_P (x))
13439 return true;
13441 if (VECTOR_MODE_P (GET_MODE (x)))
13442 return aarch64_simd_valid_immediate (x, NULL);
13444 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13445 return true;
13447 if (aarch64_sve_cnt_immediate_p (x))
13448 return true;
13450 return aarch64_classify_symbolic_expression (x)
13451 == SYMBOL_TINY_ABSOLUTE;
13454 /* Return a const_int vector of VAL. */
13456 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13458 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13459 return gen_const_vec_duplicate (mode, c);
13462 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13464 bool
13465 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13467 machine_mode vmode;
13469 vmode = aarch64_simd_container_mode (mode, 64);
13470 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13471 return aarch64_simd_valid_immediate (op_v, NULL);
13474 /* Construct and return a PARALLEL RTX vector with elements numbering the
13475 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13476 the vector - from the perspective of the architecture. This does not
13477 line up with GCC's perspective on lane numbers, so we end up with
13478 different masks depending on our target endian-ness. The diagram
13479 below may help. We must draw the distinction when building masks
13480 which select one half of the vector. An instruction selecting
13481 architectural low-lanes for a big-endian target must be described using
13482 a mask selecting GCC high-lanes.
13484 Big-Endian Little-Endian
13486 GCC 0 1 2 3 3 2 1 0
13487 | x | x | x | x | | x | x | x | x |
13488 Architecture 3 2 1 0 3 2 1 0
13490 Low Mask: { 2, 3 } { 0, 1 }
13491 High Mask: { 0, 1 } { 2, 3 }
13493 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13496 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13498 rtvec v = rtvec_alloc (nunits / 2);
13499 int high_base = nunits / 2;
13500 int low_base = 0;
13501 int base;
13502 rtx t1;
13503 int i;
13505 if (BYTES_BIG_ENDIAN)
13506 base = high ? low_base : high_base;
13507 else
13508 base = high ? high_base : low_base;
13510 for (i = 0; i < nunits / 2; i++)
13511 RTVEC_ELT (v, i) = GEN_INT (base + i);
13513 t1 = gen_rtx_PARALLEL (mode, v);
13514 return t1;
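/* As an illustration of the diagram above (a sketch, assuming V4SImode with
   NUNITS == 4): on a little-endian target

     aarch64_simd_vect_par_cnst_half (V4SImode, 4, true)

   returns (parallel [(const_int 2) (const_int 3)]), while on a big-endian
   target the same call returns (parallel [(const_int 0) (const_int 1)]),
   i.e. the GCC lane numbers that correspond to the architectural high
   half.  */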
13517 /* Check OP for validity as a PARALLEL RTX vector with elements
13518 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13519 from the perspective of the architecture. See the diagram above
13520 aarch64_simd_vect_par_cnst_half for more details. */
13522 bool
13523 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13524 bool high)
13526 int nelts;
13527 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13528 return false;
13530 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13531 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13532 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13533 int i = 0;
13535 if (count_op != count_ideal)
13536 return false;
13538 for (i = 0; i < count_ideal; i++)
13540 rtx elt_op = XVECEXP (op, 0, i);
13541 rtx elt_ideal = XVECEXP (ideal, 0, i);
13543 if (!CONST_INT_P (elt_op)
13544 || INTVAL (elt_ideal) != INTVAL (elt_op))
13545 return false;
13547 return true;
13550 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13551 HIGH (exclusive). */
13552 void
13553 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13554 const_tree exp)
13556 HOST_WIDE_INT lane;
13557 gcc_assert (CONST_INT_P (operand));
13558 lane = INTVAL (operand);
13560 if (lane < low || lane >= high)
13562 if (exp)
13563 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13564 else
13565 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
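/* For example (an illustrative sketch), a call such as

     aarch64_simd_lane_bounds (GEN_INT (4), 0, 4, NULL)

   reports "lane 4 out of range 0 - 3", since HIGH is exclusive.  */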
13569 /* Perform endian correction on lane number N, which indexes a vector
13570 of mode MODE, and return the result as an SImode rtx. */
13573 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13575 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13578 /* Return TRUE if OP is a valid vector addressing mode. */
13580 bool
13581 aarch64_simd_mem_operand_p (rtx op)
13583 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13584 || REG_P (XEXP (op, 0)));
13587 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13589 bool
13590 aarch64_sve_ld1r_operand_p (rtx op)
13592 struct aarch64_address_info addr;
13593 scalar_mode mode;
13595 return (MEM_P (op)
13596 && is_a <scalar_mode> (GET_MODE (op), &mode)
13597 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13598 && addr.type == ADDRESS_REG_IMM
13599 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13602 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13603 The conditions for STR are the same. */
13604 bool
13605 aarch64_sve_ldr_operand_p (rtx op)
13607 struct aarch64_address_info addr;
13609 return (MEM_P (op)
13610 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13611 false, ADDR_QUERY_ANY)
13612 && addr.type == ADDRESS_REG_IMM);
13615 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13616 We need to be able to access the individual pieces, so the range
13617 is different from LD[234] and ST[234]. */
13618 bool
13619 aarch64_sve_struct_memory_operand_p (rtx op)
13621 if (!MEM_P (op))
13622 return false;
13624 machine_mode mode = GET_MODE (op);
13625 struct aarch64_address_info addr;
13626 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13627 ADDR_QUERY_ANY)
13628 || addr.type != ADDRESS_REG_IMM)
13629 return false;
13631 poly_int64 first = addr.const_offset;
13632 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13633 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13634 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13637 /* Emit a register copy from operand to operand, taking care not to
13638 early-clobber source registers in the process.
13640 COUNT is the number of components into which the copy needs to be
13641 decomposed. */
13642 void
13643 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13644 unsigned int count)
13646 unsigned int i;
13647 int rdest = REGNO (operands[0]);
13648 int rsrc = REGNO (operands[1]);
13650 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13651 || rdest < rsrc)
13652 for (i = 0; i < count; i++)
13653 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13654 gen_rtx_REG (mode, rsrc + i));
13655 else
13656 for (i = 0; i < count; i++)
13657 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13658 gen_rtx_REG (mode, rsrc + count - i - 1));
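/* For instance (purely descriptive of the loops above), with COUNT == 2 and
   an overlapping destination that starts one register above the source, the
   second component is moved before the first, so no source register is
   clobbered before it has been read.  */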
13661 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13662 one of VSTRUCT modes: OI, CI, or XI. */
13664 aarch64_simd_attr_length_rglist (machine_mode mode)
13666 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13667 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13670 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13671 alignment of a vector to 128 bits. SVE predicates have an alignment of
13672 16 bits. */
13673 static HOST_WIDE_INT
13674 aarch64_simd_vector_alignment (const_tree type)
13676 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13677 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13678 be set for non-predicate vectors of booleans. Modes are the most
13679 direct way we have of identifying real SVE predicate types. */
13680 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13681 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13682 return MIN (align, 128);
13685 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13686 static HOST_WIDE_INT
13687 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13689 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13691 /* If the length of the vector is fixed, try to align to that length,
13692 otherwise don't try to align at all. */
13693 HOST_WIDE_INT result;
13694 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13695 result = TYPE_ALIGN (TREE_TYPE (type));
13696 return result;
13698 return TYPE_ALIGN (type);
13701 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13702 static bool
13703 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13705 if (is_packed)
13706 return false;
13708 /* For fixed-length vectors, check that the vectorizer will aim for
13709 full-vector alignment. This isn't true for generic GCC vectors
13710 that are wider than the ABI maximum of 128 bits. */
13711 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13712 && (wi::to_widest (TYPE_SIZE (type))
13713 != aarch64_vectorize_preferred_vector_alignment (type)))
13714 return false;
13716 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13717 return true;
13720 /* Return true if the vector misalignment factor is supported by the
13721 target. */
13722 static bool
13723 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13724 const_tree type, int misalignment,
13725 bool is_packed)
13727 if (TARGET_SIMD && STRICT_ALIGNMENT)
13730 /* Return false if the movmisalign pattern is not supported for this mode. */
13730 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13731 return false;
13733 /* Misalignment factor is unknown at compile time. */
13734 if (misalignment == -1)
13735 return false;
13737 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13738 is_packed);
13741 /* If VALS is a vector constant that can be loaded into a register
13742 using DUP, generate instructions to do so and return an RTX to
13743 assign to the register. Otherwise return NULL_RTX. */
13744 static rtx
13745 aarch64_simd_dup_constant (rtx vals)
13747 machine_mode mode = GET_MODE (vals);
13748 machine_mode inner_mode = GET_MODE_INNER (mode);
13749 rtx x;
13751 if (!const_vec_duplicate_p (vals, &x))
13752 return NULL_RTX;
13754 /* We can load this constant by using DUP and a constant in a
13755 single ARM register. This will be cheaper than a vector
13756 load. */
13757 x = copy_to_mode_reg (inner_mode, x);
13758 return gen_vec_duplicate (mode, x);
13762 /* Generate code to load VALS, which is a PARALLEL containing only
13763 constants (for vec_init) or CONST_VECTOR, efficiently into a
13764 register. Returns an RTX to copy into the register, or NULL_RTX
13765 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13766 static rtx
13767 aarch64_simd_make_constant (rtx vals)
13769 machine_mode mode = GET_MODE (vals);
13770 rtx const_dup;
13771 rtx const_vec = NULL_RTX;
13772 int n_const = 0;
13773 int i;
13775 if (GET_CODE (vals) == CONST_VECTOR)
13776 const_vec = vals;
13777 else if (GET_CODE (vals) == PARALLEL)
13779 /* A CONST_VECTOR must contain only CONST_INTs and
13780 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13781 Only store valid constants in a CONST_VECTOR. */
13782 int n_elts = XVECLEN (vals, 0);
13783 for (i = 0; i < n_elts; ++i)
13785 rtx x = XVECEXP (vals, 0, i);
13786 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13787 n_const++;
13789 if (n_const == n_elts)
13790 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13792 else
13793 gcc_unreachable ();
13795 if (const_vec != NULL_RTX
13796 && aarch64_simd_valid_immediate (const_vec, NULL))
13797 /* Load using MOVI/MVNI. */
13798 return const_vec;
13799 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13800 /* Loaded using DUP. */
13801 return const_dup;
13802 else if (const_vec != NULL_RTX)
13803 /* Load from constant pool. We cannot take advantage of single-cycle
13804 LD1 because we need a PC-relative addressing mode. */
13805 return const_vec;
13806 else
13807 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13808 We cannot construct an initializer. */
13809 return NULL_RTX;
13812 /* Expand a vector initialisation sequence, such that TARGET is
13813 initialised to contain VALS. */
13815 void
13816 aarch64_expand_vector_init (rtx target, rtx vals)
13818 machine_mode mode = GET_MODE (target);
13819 scalar_mode inner_mode = GET_MODE_INNER (mode);
13820 /* The number of vector elements. */
13821 int n_elts = XVECLEN (vals, 0);
13822 /* The number of vector elements which are not constant. */
13823 int n_var = 0;
13824 rtx any_const = NULL_RTX;
13825 /* The first element of vals. */
13826 rtx v0 = XVECEXP (vals, 0, 0);
13827 bool all_same = true;
13829 /* Count the number of variable elements to initialise. */
13830 for (int i = 0; i < n_elts; ++i)
13832 rtx x = XVECEXP (vals, 0, i);
13833 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13834 ++n_var;
13835 else
13836 any_const = x;
13838 all_same &= rtx_equal_p (x, v0);
13841 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13842 how best to handle this. */
13843 if (n_var == 0)
13845 rtx constant = aarch64_simd_make_constant (vals);
13846 if (constant != NULL_RTX)
13848 emit_move_insn (target, constant);
13849 return;
13853 /* Splat a single non-constant element if we can. */
13854 if (all_same)
13856 rtx x = copy_to_mode_reg (inner_mode, v0);
13857 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13858 return;
13861 enum insn_code icode = optab_handler (vec_set_optab, mode);
13862 gcc_assert (icode != CODE_FOR_nothing);
13864 /* If there are only variable elements, try to optimize
13865 the insertion using dup for the most common element
13866 followed by insertions. */
13868 /* The algorithm will fill matches[*][0] with the earliest matching element,
13869 and matches[X][1] with the count of duplicate elements (if X is the
13870 earliest element which has duplicates). */
13872 if (n_var == n_elts && n_elts <= 16)
13874 int matches[16][2] = {0};
13875 for (int i = 0; i < n_elts; i++)
13877 for (int j = 0; j <= i; j++)
13879 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13881 matches[i][0] = j;
13882 matches[j][1]++;
13883 break;
13887 int maxelement = 0;
13888 int maxv = 0;
13889 for (int i = 0; i < n_elts; i++)
13890 if (matches[i][1] > maxv)
13892 maxelement = i;
13893 maxv = matches[i][1];
13896 /* Create a duplicate of the most common element, unless all elements
13897 are equally useless to us, in which case just immediately set the
13898 vector register using the first element. */
13900 if (maxv == 1)
13902 /* For vectors of two 64-bit elements, we can do even better. */
13903 if (n_elts == 2
13904 && (inner_mode == E_DImode
13905 || inner_mode == E_DFmode))
13908 rtx x0 = XVECEXP (vals, 0, 0);
13909 rtx x1 = XVECEXP (vals, 0, 1);
13910 /* Combine can pick up this case, but handling it directly
13911 here leaves clearer RTL.
13913 This is load_pair_lanes<mode>, and also gives us a clean-up
13914 for store_pair_lanes<mode>. */
13915 if (memory_operand (x0, inner_mode)
13916 && memory_operand (x1, inner_mode)
13917 && !STRICT_ALIGNMENT
13918 && rtx_equal_p (XEXP (x1, 0),
13919 plus_constant (Pmode,
13920 XEXP (x0, 0),
13921 GET_MODE_SIZE (inner_mode))))
13923 rtx t;
13924 if (inner_mode == DFmode)
13925 t = gen_load_pair_lanesdf (target, x0, x1);
13926 else
13927 t = gen_load_pair_lanesdi (target, x0, x1);
13928 emit_insn (t);
13929 return;
13932 /* The subreg-move sequence below will move into lane zero of the
13933 vector register. For big-endian we want that position to hold
13934 the last element of VALS. */
13935 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13936 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13937 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13939 else
13941 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13942 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13945 /* Insert the rest. */
13946 for (int i = 0; i < n_elts; i++)
13948 rtx x = XVECEXP (vals, 0, i);
13949 if (matches[i][0] == maxelement)
13950 continue;
13951 x = copy_to_mode_reg (inner_mode, x);
13952 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13954 return;
13957 /* Initialise a vector which is part-variable. We want to first try
13958 to build those lanes which are constant in the most efficient way we
13959 can. */
13960 if (n_var != n_elts)
13962 rtx copy = copy_rtx (vals);
13964 /* Load constant part of vector. We really don't care what goes into the
13965 parts we will overwrite, but we're more likely to be able to load the
13966 constant efficiently if it has fewer, larger, repeating parts
13967 (see aarch64_simd_valid_immediate). */
13968 for (int i = 0; i < n_elts; i++)
13970 rtx x = XVECEXP (vals, 0, i);
13971 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13972 continue;
13973 rtx subst = any_const;
13974 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13976 /* Look in the copied vector, as more elements are const. */
13977 rtx test = XVECEXP (copy, 0, i ^ bit);
13978 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13980 subst = test;
13981 break;
13984 XVECEXP (copy, 0, i) = subst;
13986 aarch64_expand_vector_init (target, copy);
13989 /* Insert the variable lanes directly. */
13990 for (int i = 0; i < n_elts; i++)
13992 rtx x = XVECEXP (vals, 0, i);
13993 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13994 continue;
13995 x = copy_to_mode_reg (inner_mode, x);
13996 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
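/* A small worked example of the strategy above (illustrative only): for a
   V4SImode initialiser { x, y, x, x } where x and y are registers, element 0
   is the most common (matches[0][1] == 3), so the code emits one DUP of x
   into all four lanes followed by a single vec_set inserting y into lane 1,
   rather than four separate inserts.  */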
14000 static unsigned HOST_WIDE_INT
14001 aarch64_shift_truncation_mask (machine_mode mode)
14003 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14004 return 0;
14005 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14008 /* Select a format to encode pointers in exception handling data. */
14010 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14012 int type;
14013 switch (aarch64_cmodel)
14015 case AARCH64_CMODEL_TINY:
14016 case AARCH64_CMODEL_TINY_PIC:
14017 case AARCH64_CMODEL_SMALL:
14018 case AARCH64_CMODEL_SMALL_PIC:
14019 case AARCH64_CMODEL_SMALL_SPIC:
14020 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14021 for everything. */
14022 type = DW_EH_PE_sdata4;
14023 break;
14024 default:
14025 /* No assumptions here. 8-byte relocs required. */
14026 type = DW_EH_PE_sdata8;
14027 break;
14029 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
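/* For example, under the default small code model a global symbol ends up
   encoded as (DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4), while
   the default case (e.g. the large code model) falls back to sdata8.  This
   only restates two arms of the switch above.  */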
14032 /* The last .arch and .tune assembly strings that we printed. */
14033 static std::string aarch64_last_printed_arch_string;
14034 static std::string aarch64_last_printed_tune_string;
14036 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14037 by the function fndecl. */
14039 void
14040 aarch64_declare_function_name (FILE *stream, const char* name,
14041 tree fndecl)
14043 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14045 struct cl_target_option *targ_options;
14046 if (target_parts)
14047 targ_options = TREE_TARGET_OPTION (target_parts);
14048 else
14049 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14050 gcc_assert (targ_options);
14052 const struct processor *this_arch
14053 = aarch64_get_arch (targ_options->x_explicit_arch);
14055 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14056 std::string extension
14057 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14058 this_arch->flags);
14059 /* Only update the assembler .arch string if it is distinct from the last
14060 such string we printed. */
14061 std::string to_print = this_arch->name + extension;
14062 if (to_print != aarch64_last_printed_arch_string)
14064 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14065 aarch64_last_printed_arch_string = to_print;
14068 /* Print the cpu name we're tuning for in the comments; it might be
14069 useful to readers of the generated asm. Do it only when it changes
14070 from function to function and verbose assembly is requested. */
14071 const struct processor *this_tune
14072 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14074 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14076 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14077 this_tune->name);
14078 aarch64_last_printed_tune_string = this_tune->name;
14081 /* Don't forget the type directive for ELF. */
14082 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14083 ASM_OUTPUT_LABEL (stream, name);
14086 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14088 static void
14089 aarch64_start_file (void)
14091 struct cl_target_option *default_options
14092 = TREE_TARGET_OPTION (target_option_default_node);
14094 const struct processor *default_arch
14095 = aarch64_get_arch (default_options->x_explicit_arch);
14096 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14097 std::string extension
14098 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14099 default_arch->flags);
14101 aarch64_last_printed_arch_string = default_arch->name + extension;
14102 aarch64_last_printed_tune_string = "";
14103 asm_fprintf (asm_out_file, "\t.arch %s\n",
14104 aarch64_last_printed_arch_string.c_str ());
14106 default_file_start ();
14109 /* Emit load exclusive. */
14111 static void
14112 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14113 rtx mem, rtx model_rtx)
14115 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
14118 /* Emit store exclusive. */
14120 static void
14121 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14122 rtx rval, rtx mem, rtx model_rtx)
14124 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
14127 /* Mark the previous jump instruction as unlikely. */
14129 static void
14130 aarch64_emit_unlikely_jump (rtx insn)
14132 rtx_insn *jump = emit_jump_insn (insn);
14133 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14136 /* Expand a compare and swap pattern. */
14138 void
14139 aarch64_expand_compare_and_swap (rtx operands[])
14141 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14142 machine_mode mode, cmp_mode;
14144 bval = operands[0];
14145 rval = operands[1];
14146 mem = operands[2];
14147 oldval = operands[3];
14148 newval = operands[4];
14149 is_weak = operands[5];
14150 mod_s = operands[6];
14151 mod_f = operands[7];
14152 mode = GET_MODE (mem);
14153 cmp_mode = mode;
14155 /* Normally the succ memory model must be stronger than fail, but in the
14156 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14157 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14159 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14160 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14161 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14163 switch (mode)
14165 case E_QImode:
14166 case E_HImode:
14167 /* For short modes, we're going to perform the comparison in SImode,
14168 so do the zero-extension now. */
14169 cmp_mode = SImode;
14170 rval = gen_reg_rtx (SImode);
14171 oldval = convert_modes (SImode, mode, oldval, true);
14172 /* Fall through. */
14174 case E_SImode:
14175 case E_DImode:
14176 /* Force the value into a register if needed. */
14177 if (!aarch64_plus_operand (oldval, mode))
14178 oldval = force_reg (cmp_mode, oldval);
14179 break;
14181 default:
14182 gcc_unreachable ();
14185 if (TARGET_LSE)
14186 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
14187 newval, is_weak, mod_s,
14188 mod_f));
14189 else
14190 emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
14191 is_weak, mod_s, mod_f));
14194 if (mode == QImode || mode == HImode)
14195 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14197 x = gen_rtx_REG (CCmode, CC_REGNUM);
14198 x = gen_rtx_EQ (SImode, x, const0_rtx);
14199 emit_insn (gen_rtx_SET (bval, x));
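/* As a rough illustration of how this expander is reached (a sketch, not
   target code): a source-level call such as

     bool ok = __atomic_compare_exchange_n (&v, &expected, desired, 0,
                                            __ATOMIC_SEQ_CST,
                                            __ATOMIC_SEQ_CST);

   arrives here with MEM = &v, OLDVAL = expected and NEWVAL = desired.  With
   LSE enabled a CAS-based pattern is emitted directly; otherwise the generic
   pattern is emitted and later split by aarch64_split_compare_and_swap
   below.  */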
14202 /* Test whether the target supports using an atomic load-operate instruction.
14203 CODE is the operation. Returns FALSE if the operation isn't supported by the
14206 architecture. */
14208 bool
14209 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14211 if (!TARGET_LSE)
14212 return false;
14214 switch (code)
14216 case SET:
14217 case AND:
14218 case IOR:
14219 case XOR:
14220 case MINUS:
14221 case PLUS:
14222 return true;
14223 default:
14224 return false;
14228 /* Emit a barrier that is appropriate for memory model MODEL, at the end of a
14229 sequence implementing an atomic operation. */
14231 static void
14232 aarch64_emit_post_barrier (enum memmodel model)
14234 const enum memmodel base_model = memmodel_base (model);
14236 if (is_mm_sync (model)
14237 && (base_model == MEMMODEL_ACQUIRE
14238 || base_model == MEMMODEL_ACQ_REL
14239 || base_model == MEMMODEL_SEQ_CST))
14241 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14245 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14246 for the data in memory. EXPECTED is the value expected to be in memory.
14247 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14248 is the memory ordering to use. */
14250 void
14251 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14252 rtx expected, rtx desired,
14253 rtx model)
14255 machine_mode mode;
14257 mode = GET_MODE (mem);
14259 /* Move the expected value into the CAS destination register. */
14260 emit_insn (gen_rtx_SET (rval, expected));
14262 /* Emit the CAS. */
14263 emit_insn (gen_aarch64_atomic_cas (mode, rval, mem, desired, model));
14265 /* Compare the expected value with the value loaded by the CAS, to establish
14266 whether the swap was made. */
14267 aarch64_gen_compare_reg (EQ, rval, expected);
14270 /* Split a compare and swap pattern. */
14272 void
14273 aarch64_split_compare_and_swap (rtx operands[])
14275 rtx rval, mem, oldval, newval, scratch;
14276 machine_mode mode;
14277 bool is_weak;
14278 rtx_code_label *label1, *label2;
14279 rtx x, cond;
14280 enum memmodel model;
14281 rtx model_rtx;
14283 rval = operands[0];
14284 mem = operands[1];
14285 oldval = operands[2];
14286 newval = operands[3];
14287 is_weak = (operands[4] != const0_rtx);
14288 model_rtx = operands[5];
14289 scratch = operands[7];
14290 mode = GET_MODE (mem);
14291 model = memmodel_from_int (INTVAL (model_rtx));
14293 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14294 loop:
14295 .label1:
14296 LD[A]XR rval, [mem]
14297 CBNZ rval, .label2
14298 ST[L]XR scratch, newval, [mem]
14299 CBNZ scratch, .label1
14300 .label2:
14301 CMP rval, 0. */
14302 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14304 label1 = NULL;
14305 if (!is_weak)
14307 label1 = gen_label_rtx ();
14308 emit_label (label1);
14310 label2 = gen_label_rtx ();
14312 /* The initial load can be relaxed for a __sync operation since a final
14313 barrier will be emitted to stop code hoisting. */
14314 if (is_mm_sync (model))
14315 aarch64_emit_load_exclusive (mode, rval, mem,
14316 GEN_INT (MEMMODEL_RELAXED));
14317 else
14318 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14320 if (strong_zero_p)
14322 if (aarch64_track_speculation)
14324 /* Emit an explicit compare instruction, so that we can correctly
14325 track the condition codes. */
14326 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
14327 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14329 else
14330 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14332 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14333 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14334 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14336 else
14338 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14339 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14340 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14341 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14342 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14345 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14347 if (!is_weak)
14349 if (aarch64_track_speculation)
14351 /* Emit an explicit compare instruction, so that we can correctly
14352 track the condition codes. */
14353 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
14354 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14356 else
14357 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14359 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14360 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14361 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14363 else
14365 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14366 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14367 emit_insn (gen_rtx_SET (cond, x));
14370 emit_label (label2);
14371 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14372 to set the condition flags. If this is not used it will be removed by
14373 later passes. */
14374 if (strong_zero_p)
14376 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14377 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14378 emit_insn (gen_rtx_SET (cond, x));
14380 /* Emit any final barrier needed for a __sync operation. */
14381 if (is_mm_sync (model))
14382 aarch64_emit_post_barrier (model);
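/* For the general strong case (OLDVAL not known to be zero), the split
   above produces a loop of roughly the following shape (illustrative,
   modulo the memory-model dependent acquire/release variants):

     .label1:
        LD[A]XR  rval, [mem]
        CMP      rval, oldval
        B.NE     .label2
        ST[L]XR  scratch, newval, [mem]
        CBNZ     scratch, .label1
     .label2:                                                            */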
14385 /* Emit a BIC instruction. */
14387 static void
14388 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14390 rtx shift_rtx = GEN_INT (shift);
14391 rtx (*gen) (rtx, rtx, rtx, rtx);
14393 switch (mode)
14395 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14396 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14397 default:
14398 gcc_unreachable ();
14401 emit_insn (gen (dst, s2, shift_rtx, s1));
14404 /* Emit an atomic swap. */
14406 static void
14407 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14408 rtx mem, rtx model)
14410 emit_insn (gen_aarch64_atomic_swp (mode, dst, mem, value, model));
14413 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14414 location to store the data read from memory. OUT_RESULT is the location to
14415 store the result of the operation. MEM is the memory location to read and
14416 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14417 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14418 be NULL. */
14420 void
14421 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14422 rtx mem, rtx value, rtx model_rtx)
14424 machine_mode mode = GET_MODE (mem);
14425 machine_mode wmode = (mode == DImode ? DImode : SImode);
14426 const bool short_mode = (mode < SImode);
14427 int ldop_code;
14428 rtx src;
14429 rtx x;
14431 if (out_data)
14432 out_data = gen_lowpart (mode, out_data);
14434 if (out_result)
14435 out_result = gen_lowpart (mode, out_result);
14437 /* Make sure the value is in a register, putting it into a destination
14438 register if it needs to be manipulated. */
14439 if (!register_operand (value, mode)
14440 || code == AND || code == MINUS)
14442 src = out_result ? out_result : out_data;
14443 emit_move_insn (src, gen_lowpart (mode, value));
14445 else
14446 src = value;
14447 gcc_assert (register_operand (src, mode));
14449 /* Preprocess the data for the operation as necessary. If the operation is
14450 a SET then emit a swap instruction and finish. */
14451 switch (code)
14453 case SET:
14454 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14455 return;
14457 case MINUS:
14458 /* Negate the value and treat it as a PLUS. */
14460 rtx neg_src;
14462 /* Resize the value if necessary. */
14463 if (short_mode)
14464 src = gen_lowpart (wmode, src);
14466 neg_src = gen_rtx_NEG (wmode, src);
14467 emit_insn (gen_rtx_SET (src, neg_src));
14469 if (short_mode)
14470 src = gen_lowpart (mode, src);
14472 /* Fall-through. */
14473 case PLUS:
14474 ldop_code = UNSPECV_ATOMIC_LDOP_PLUS;
14475 break;
14477 case IOR:
14478 ldop_code = UNSPECV_ATOMIC_LDOP_OR;
14479 break;
14481 case XOR:
14482 ldop_code = UNSPECV_ATOMIC_LDOP_XOR;
14483 break;
14485 case AND:
14487 rtx not_src;
14489 /* Resize the value if necessary. */
14490 if (short_mode)
14491 src = gen_lowpart (wmode, src);
14493 not_src = gen_rtx_NOT (wmode, src);
14494 emit_insn (gen_rtx_SET (src, not_src));
14496 if (short_mode)
14497 src = gen_lowpart (mode, src);
14499 ldop_code = UNSPECV_ATOMIC_LDOP_BIC;
14500 break;
14502 default:
14503 /* The operation can't be done with atomic instructions. */
14504 gcc_unreachable ();
14507 emit_insn (gen_aarch64_atomic_load (ldop_code, mode,
14508 out_data, mem, src, model_rtx));
14510 /* If necessary, calculate the data in memory after the update by redoing the
14511 operation from values in registers. */
14512 if (!out_result)
14513 return;
14515 if (short_mode)
14517 src = gen_lowpart (wmode, src);
14518 out_data = gen_lowpart (wmode, out_data);
14519 out_result = gen_lowpart (wmode, out_result);
14522 x = NULL_RTX;
14524 switch (code)
14526 case MINUS:
14527 case PLUS:
14528 x = gen_rtx_PLUS (wmode, out_data, src);
14529 break;
14530 case IOR:
14531 x = gen_rtx_IOR (wmode, out_data, src);
14532 break;
14533 case XOR:
14534 x = gen_rtx_XOR (wmode, out_data, src);
14535 break;
14536 case AND:
14537 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14538 return;
14539 default:
14540 gcc_unreachable ();
14543 emit_set_insn (out_result, x);
14545 return;
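/* A sketch of the flow above: a fetch-and-sub with only OUT_DATA requested
   negates SRC and then emits a single LDADD-style atomic load-operate, while
   a fetch-and-and inverts SRC and uses the BIC form; if OUT_RESULT is also
   wanted, it is recomputed afterwards from OUT_DATA and SRC (via
   aarch64_emit_bic for AND, or the plain RTL operation otherwise).  */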
14548 /* Split an atomic operation. */
14550 void
14551 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14552 rtx value, rtx model_rtx, rtx cond)
14554 machine_mode mode = GET_MODE (mem);
14555 machine_mode wmode = (mode == DImode ? DImode : SImode);
14556 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14557 const bool is_sync = is_mm_sync (model);
14558 rtx_code_label *label;
14559 rtx x;
14561 /* Split the atomic operation into a sequence. */
14562 label = gen_label_rtx ();
14563 emit_label (label);
14565 if (new_out)
14566 new_out = gen_lowpart (wmode, new_out);
14567 if (old_out)
14568 old_out = gen_lowpart (wmode, old_out);
14569 else
14570 old_out = new_out;
14571 value = simplify_gen_subreg (wmode, value, mode, 0);
14573 /* The initial load can be relaxed for a __sync operation since a final
14574 barrier will be emitted to stop code hoisting. */
14575 if (is_sync)
14576 aarch64_emit_load_exclusive (mode, old_out, mem,
14577 GEN_INT (MEMMODEL_RELAXED));
14578 else
14579 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14581 switch (code)
14583 case SET:
14584 new_out = value;
14585 break;
14587 case NOT:
14588 x = gen_rtx_AND (wmode, old_out, value);
14589 emit_insn (gen_rtx_SET (new_out, x));
14590 x = gen_rtx_NOT (wmode, new_out);
14591 emit_insn (gen_rtx_SET (new_out, x));
14592 break;
14594 case MINUS:
14595 if (CONST_INT_P (value))
14597 value = GEN_INT (-INTVAL (value));
14598 code = PLUS;
14600 /* Fall through. */
14602 default:
14603 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14604 emit_insn (gen_rtx_SET (new_out, x));
14605 break;
14608 aarch64_emit_store_exclusive (mode, cond, mem,
14609 gen_lowpart (mode, new_out), model_rtx);
14611 if (aarch64_track_speculation)
14613 /* Emit an explicit compare instruction, so that we can correctly
14614 track the condition codes. */
14615 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
14616 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14618 else
14619 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14621 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14622 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14623 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14625 /* Emit any final barrier needed for a __sync operation. */
14626 if (is_sync)
14627 aarch64_emit_post_barrier (model);
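/* When the LSE load-operate forms cannot be used, an operation such as a
   fetch-and-add on an SImode object is split by the function above into a
   load/store-exclusive loop of roughly this shape (illustrative only,
   modulo the memory-model dependent acquire/release variants):

     .label:
        LDXR   old, [mem]
        ADD    new, old, value
        STXR   cond, new, [mem]
        CBNZ   cond, .label                                              */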
14630 static void
14631 aarch64_init_libfuncs (void)
14633 /* Half-precision float operations. The compiler handles all operations
14634 with NULL libfuncs by converting to SFmode. */
14636 /* Conversions. */
14637 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14638 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14640 /* Arithmetic. */
14641 set_optab_libfunc (add_optab, HFmode, NULL);
14642 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14643 set_optab_libfunc (smul_optab, HFmode, NULL);
14644 set_optab_libfunc (neg_optab, HFmode, NULL);
14645 set_optab_libfunc (sub_optab, HFmode, NULL);
14647 /* Comparisons. */
14648 set_optab_libfunc (eq_optab, HFmode, NULL);
14649 set_optab_libfunc (ne_optab, HFmode, NULL);
14650 set_optab_libfunc (lt_optab, HFmode, NULL);
14651 set_optab_libfunc (le_optab, HFmode, NULL);
14652 set_optab_libfunc (ge_optab, HFmode, NULL);
14653 set_optab_libfunc (gt_optab, HFmode, NULL);
14654 set_optab_libfunc (unord_optab, HFmode, NULL);
14657 /* Target hook for c_mode_for_suffix. */
14658 static machine_mode
14659 aarch64_c_mode_for_suffix (char suffix)
14661 if (suffix == 'q')
14662 return TFmode;
14664 return VOIDmode;
14667 /* We can only represent floating point constants which will fit in
14668 "quarter-precision" values. These values are characterised by
14669 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14672 (-1)^s * (n/16) * 2^r
14674 Where:
14675 's' is the sign bit.
14676 'n' is an integer in the range 16 <= n <= 31.
14677 'r' is an integer in the range -3 <= r <= 4. */
14679 /* Return true iff X can be represented by a quarter-precision
14680 floating point immediate operand. Note, we cannot represent 0.0. */
14681 bool
14682 aarch64_float_const_representable_p (rtx x)
14684 /* This represents our current view of how many bits
14685 make up the mantissa. */
14686 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14687 int exponent;
14688 unsigned HOST_WIDE_INT mantissa, mask;
14689 REAL_VALUE_TYPE r, m;
14690 bool fail;
14692 if (!CONST_DOUBLE_P (x))
14693 return false;
14695 if (GET_MODE (x) == VOIDmode
14696 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
14697 return false;
14699 r = *CONST_DOUBLE_REAL_VALUE (x);
14701 /* We cannot represent infinities, NaNs or +/-zero. We won't
14702 know if we have +zero until we analyse the mantissa, but we
14703 can reject the other invalid values. */
14704 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14705 || REAL_VALUE_MINUS_ZERO (r))
14706 return false;
14708 /* Extract exponent. */
14709 r = real_value_abs (&r);
14710 exponent = REAL_EXP (&r);
14712 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14713 highest (sign) bit, with a fixed binary point at bit point_pos.
14714 m1 holds the low part of the mantissa, m2 the high part.
14715 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14716 bits for the mantissa, this can fail (low bits will be lost). */
14717 real_ldexp (&m, &r, point_pos - exponent);
14718 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14720 /* If the low part of the mantissa has bits set we cannot represent
14721 the value. */
14722 if (w.ulow () != 0)
14723 return false;
14724 /* We have rejected the lower HOST_WIDE_INT, so update our
14725 understanding of how many bits lie in the mantissa and
14726 look only at the high HOST_WIDE_INT. */
14727 mantissa = w.elt (1);
14728 point_pos -= HOST_BITS_PER_WIDE_INT;
14730 /* We can only represent values with a mantissa of the form 1.xxxx. */
14731 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14732 if ((mantissa & mask) != 0)
14733 return false;
14735 /* Having filtered unrepresentable values, we may now remove all
14736 but the highest 5 bits. */
14737 mantissa >>= point_pos - 5;
14739 /* We cannot represent the value 0.0, so reject it. This is handled
14740 elsewhere. */
14741 if (mantissa == 0)
14742 return false;
14744 /* Then, as bit 4 is always set, we can mask it off, leaving
14745 the mantissa in the range [0, 15]. */
14746 mantissa &= ~(1 << 4);
14747 gcc_assert (mantissa <= 15);
14749 /* GCC internally does not use IEEE754-like encoding (where normalized
14750 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14751 Our mantissa values are shifted 4 places to the left relative to
14752 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14753 by 5 places to correct for GCC's representation. */
14754 exponent = 5 - exponent;
14756 return (exponent >= 0 && exponent <= 7);
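/* Two worked examples of the check above (illustrative): 2.5 is
   representable, since 2.5 = (-1)^0 * (20/16) * 2^1 with n = 20 and r = 1
   both inside the ranges given earlier, whereas 1/3 cannot be written as
   (n/16) * 2^r at all, and 0.0 is rejected explicitly above.  */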
14759 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14760 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14761 output MOVI/MVNI, ORR or BIC immediate. */
14762 char*
14763 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14764 enum simd_immediate_check which)
14766 bool is_valid;
14767 static char templ[40];
14768 const char *mnemonic;
14769 const char *shift_op;
14770 unsigned int lane_count = 0;
14771 char element_char;
14773 struct simd_immediate_info info;
14775 /* This will return true to show const_vector is legal for use as either
14776 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14777 It will also update INFO to show how the immediate should be generated.
14778 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14779 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14780 gcc_assert (is_valid);
14782 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14783 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14785 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14787 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14788 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14789 move immediate path. */
14790 if (aarch64_float_const_zero_rtx_p (info.value))
14791 info.value = GEN_INT (0);
14792 else
14794 const unsigned int buf_size = 20;
14795 char float_buf[buf_size] = {'\0'};
14796 real_to_decimal_for_mode (float_buf,
14797 CONST_DOUBLE_REAL_VALUE (info.value),
14798 buf_size, buf_size, 1, info.elt_mode);
14800 if (lane_count == 1)
14801 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14802 else
14803 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14804 lane_count, element_char, float_buf);
14805 return templ;
14809 gcc_assert (CONST_INT_P (info.value));
14811 if (which == AARCH64_CHECK_MOV)
14813 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14814 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14815 if (lane_count == 1)
14816 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14817 mnemonic, UINTVAL (info.value));
14818 else if (info.shift)
14819 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14820 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14821 element_char, UINTVAL (info.value), shift_op, info.shift);
14822 else
14823 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14824 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14825 element_char, UINTVAL (info.value));
14827 else
14829 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14830 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14831 if (info.shift)
14832 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14833 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14834 element_char, UINTVAL (info.value), "lsl", info.shift);
14835 else
14836 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14837 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14838 element_char, UINTVAL (info.value));
14840 return templ;
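/* As an illustration of the templates above (a sketch): a V4SImode constant
   whose lanes are all 0x0000ab00 is printed for AARCH64_CHECK_MOV as

     movi    %0.4s, 0xab, lsl 8

   and the same payload checked as AARCH64_CHECK_ORR prints

     orr     %0.4s, #171, lsl #8

   matching the MOVI/MVNI and ORR/BIC format strings above.  */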
14843 char*
14844 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14847 /* If a floating point number was passed and we desire to use it in an
14848 integer mode, do the conversion to integer. */
14849 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14851 unsigned HOST_WIDE_INT ival;
14852 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14853 gcc_unreachable ();
14854 immediate = gen_int_mode (ival, mode);
14857 machine_mode vmode;
14858 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
14859 a 128-bit vector mode. */
14860 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14862 vmode = aarch64_simd_container_mode (mode, width);
14863 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14864 return aarch64_output_simd_mov_immediate (v_op, width);
14867 /* Return the output string to use for moving immediate CONST_VECTOR
14868 into an SVE register. */
14870 char *
14871 aarch64_output_sve_mov_immediate (rtx const_vector)
14873 static char templ[40];
14874 struct simd_immediate_info info;
14875 char element_char;
14877 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14878 gcc_assert (is_valid);
14880 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14882 if (info.step)
14884 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14885 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14886 element_char, INTVAL (info.value), INTVAL (info.step));
14887 return templ;
14890 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14892 if (aarch64_float_const_zero_rtx_p (info.value))
14893 info.value = GEN_INT (0);
14894 else
14896 const int buf_size = 20;
14897 char float_buf[buf_size] = {};
14898 real_to_decimal_for_mode (float_buf,
14899 CONST_DOUBLE_REAL_VALUE (info.value),
14900 buf_size, buf_size, 1, info.elt_mode);
14902 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14903 element_char, float_buf);
14904 return templ;
14908 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14909 element_char, INTVAL (info.value));
14910 return templ;
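/* For example (an illustrative sketch): an SVE integer constant forming the
   series 0, 2, 4, ... with 32-bit elements is printed as

     index   %0.s, #0, #2

   a vector of all-5 32-bit elements as "mov %0.s, #5", and non-zero
   floating-point duplicates use the FMOV form above.  */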
14913 /* Return the asm format for a PTRUE instruction whose destination has
14914 mode MODE. SUFFIX is the element size suffix. */
14916 char *
14917 aarch64_output_ptrue (machine_mode mode, char suffix)
14919 unsigned int nunits;
14920 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14921 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
14922 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
14923 else
14924 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
14925 return buf;
14928 /* Split operands into moves from op[1] + op[2] into op[0]. */
14930 void
14931 aarch64_split_combinev16qi (rtx operands[3])
14933 unsigned int dest = REGNO (operands[0]);
14934 unsigned int src1 = REGNO (operands[1]);
14935 unsigned int src2 = REGNO (operands[2]);
14936 machine_mode halfmode = GET_MODE (operands[1]);
14937 unsigned int halfregs = REG_NREGS (operands[1]);
14938 rtx destlo, desthi;
14940 gcc_assert (halfmode == V16QImode);
14942 if (src1 == dest && src2 == dest + halfregs)
14944 /* No-op move. Can't split to nothing; emit something. */
14945 emit_note (NOTE_INSN_DELETED);
14946 return;
14949 /* Preserve register attributes for variable tracking. */
14950 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
14951 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
14952 GET_MODE_SIZE (halfmode));
14954 /* Special case of reversed high/low parts. */
14955 if (reg_overlap_mentioned_p (operands[2], destlo)
14956 && reg_overlap_mentioned_p (operands[1], desthi))
14958 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14959 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
14960 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14962 else if (!reg_overlap_mentioned_p (operands[2], destlo))
14964 /* Try to avoid unnecessary moves if part of the result
14965 is in the right place already. */
14966 if (src1 != dest)
14967 emit_move_insn (destlo, operands[1]);
14968 if (src2 != dest + halfregs)
14969 emit_move_insn (desthi, operands[2]);
14971 else
14973 if (src2 != dest + halfregs)
14974 emit_move_insn (desthi, operands[2]);
14975 if (src1 != dest)
14976 emit_move_insn (destlo, operands[1]);
14980 /* vec_perm support. */
14982 struct expand_vec_perm_d
14984 rtx target, op0, op1;
14985 vec_perm_indices perm;
14986 machine_mode vmode;
14987 unsigned int vec_flags;
14988 bool one_vector_p;
14989 bool testing_p;
14992 /* Generate a variable permutation. */
14994 static void
14995 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
14997 machine_mode vmode = GET_MODE (target);
14998 bool one_vector_p = rtx_equal_p (op0, op1);
15000 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15001 gcc_checking_assert (GET_MODE (op0) == vmode);
15002 gcc_checking_assert (GET_MODE (op1) == vmode);
15003 gcc_checking_assert (GET_MODE (sel) == vmode);
15004 gcc_checking_assert (TARGET_SIMD);
15006 if (one_vector_p)
15008 if (vmode == V8QImode)
15010 /* Expand the argument to a V16QI mode by duplicating it. */
15011 rtx pair = gen_reg_rtx (V16QImode);
15012 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15013 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15015 else
15017 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15020 else
15022 rtx pair;
15024 if (vmode == V8QImode)
15026 pair = gen_reg_rtx (V16QImode);
15027 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15028 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15030 else
15032 pair = gen_reg_rtx (OImode);
15033 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15034 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15039 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15040 NELT is the number of elements in the vector. */
15042 void
15043 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15044 unsigned int nelt)
15046 machine_mode vmode = GET_MODE (target);
15047 bool one_vector_p = rtx_equal_p (op0, op1);
15048 rtx mask;
15050 /* The TBL instruction does not use a modulo index, so we must take care
15051 of that ourselves. */
15052 mask = aarch64_simd_gen_const_vector_dup (vmode,
15053 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15054 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15056 /* For big-endian, we also need to reverse the index within the vector
15057 (but not which vector). */
15058 if (BYTES_BIG_ENDIAN)
15060 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15061 if (!one_vector_p)
15062 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15063 sel = expand_simple_binop (vmode, XOR, sel, mask,
15064 NULL, 0, OPTAB_LIB_WIDEN);
15066 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15069 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15071 static void
15072 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15074 emit_insn (gen_rtx_SET (target,
15075 gen_rtx_UNSPEC (GET_MODE (target),
15076 gen_rtvec (2, op0, op1), code)));
15079 /* Expand an SVE vec_perm with the given operands. */
15081 void
15082 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15084 machine_mode data_mode = GET_MODE (target);
15085 machine_mode sel_mode = GET_MODE (sel);
15086 /* Enforced by the pattern condition. */
15087 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15089 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15090 size of the two value vectors, i.e. the upper bits of the indices
15091 are effectively ignored. SVE TBL instead produces 0 for any
15092 out-of-range indices, so we need to modulo all the vec_perm indices
15093 to ensure they are all in range. */
15094 rtx sel_reg = force_reg (sel_mode, sel);
15096 /* Check if the sel only references the first values vector. */
15097 if (GET_CODE (sel) == CONST_VECTOR
15098 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15100 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15101 return;
15104 /* Check if the two values vectors are the same. */
15105 if (rtx_equal_p (op0, op1))
15107 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15108 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15109 NULL, 0, OPTAB_DIRECT);
15110 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15111 return;
15114 /* Run TBL on each value vector and combine the results. */
15116 rtx res0 = gen_reg_rtx (data_mode);
15117 rtx res1 = gen_reg_rtx (data_mode);
15118 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15119 if (GET_CODE (sel) != CONST_VECTOR
15120 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15122 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15123 2 * nunits - 1);
15124 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15125 NULL, 0, OPTAB_DIRECT);
15127 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15128 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15129 NULL, 0, OPTAB_DIRECT);
15130 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15131 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15132 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15133 else
15134 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15137 /* Recognize patterns suitable for the TRN instructions. */
15138 static bool
15139 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15141 HOST_WIDE_INT odd;
15142 poly_uint64 nelt = d->perm.length ();
15143 rtx out, in0, in1, x;
15144 machine_mode vmode = d->vmode;
15146 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15147 return false;
15149 /* Note that these are little-endian tests.
15150 We correct for big-endian later. */
15151 if (!d->perm[0].is_constant (&odd)
15152 || (odd != 0 && odd != 1)
15153 || !d->perm.series_p (0, 2, odd, 2)
15154 || !d->perm.series_p (1, 2, nelt + odd, 2))
15155 return false;
15157 /* Success! */
15158 if (d->testing_p)
15159 return true;
15161 in0 = d->op0;
15162 in1 = d->op1;
15163 /* We don't need a big-endian lane correction for SVE; see the comment
15164 at the head of aarch64-sve.md for details. */
15165 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15167 x = in0, in0 = in1, in1 = x;
15168 odd = !odd;
15170 out = d->target;
15172 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15173 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15174 return true;
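/* A small worked example of the test above (illustrative): for V4SImode,
   the two-operand permutation { 0, 4, 2, 6 } has odd == 0 and passes both
   series checks, so it is matched as TRN1; { 1, 5, 3, 7 } is matched as
   TRN2.  */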
15177 /* Recognize patterns suitable for the UZP instructions. */
15178 static bool
15179 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15181 HOST_WIDE_INT odd;
15182 rtx out, in0, in1, x;
15183 machine_mode vmode = d->vmode;
15185 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15186 return false;
15188 /* Note that these are little-endian tests.
15189 We correct for big-endian later. */
15190 if (!d->perm[0].is_constant (&odd)
15191 || (odd != 0 && odd != 1)
15192 || !d->perm.series_p (0, 1, odd, 2))
15193 return false;
15195 /* Success! */
15196 if (d->testing_p)
15197 return true;
15199 in0 = d->op0;
15200 in1 = d->op1;
15201 /* We don't need a big-endian lane correction for SVE; see the comment
15202 at the head of aarch64-sve.md for details. */
15203 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15205 x = in0, in0 = in1, in1 = x;
15206 odd = !odd;
15208 out = d->target;
15210 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15211 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15212 return true;
15215 /* Recognize patterns suitable for the ZIP instructions. */
15216 static bool
15217 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15219 unsigned int high;
15220 poly_uint64 nelt = d->perm.length ();
15221 rtx out, in0, in1, x;
15222 machine_mode vmode = d->vmode;
15224 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15225 return false;
15227 /* Note that these are little-endian tests.
15228 We correct for big-endian later. */
15229 poly_uint64 first = d->perm[0];
15230 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15231 || !d->perm.series_p (0, 2, first, 1)
15232 || !d->perm.series_p (1, 2, first + nelt, 1))
15233 return false;
15234 high = maybe_ne (first, 0U);
15236 /* Success! */
15237 if (d->testing_p)
15238 return true;
15240 in0 = d->op0;
15241 in1 = d->op1;
15242 /* We don't need a big-endian lane correction for SVE; see the comment
15243 at the head of aarch64-sve.md for details. */
15244 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15246 x = in0, in0 = in1, in1 = x;
15247 high = !high;
15249 out = d->target;
15251 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15252 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15253 return true;
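/* Similarly (an illustrative example), for V4SImode the permutation
   { 0, 4, 1, 5 } interleaves the low halves of the two inputs and is
   matched as ZIP1, while { 2, 6, 3, 7 } interleaves the high halves and is
   matched as ZIP2.  */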
15256 /* Recognize patterns for the EXT insn. */
15258 static bool
15259 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15261 HOST_WIDE_INT location;
15262 rtx offset;
15264 /* The first element always refers to the first vector.
15265 Check if the extracted indices are increasing by one. */
15266 if (d->vec_flags == VEC_SVE_PRED
15267 || !d->perm[0].is_constant (&location)
15268 || !d->perm.series_p (0, 1, location, 1))
15269 return false;
15271 /* Success! */
15272 if (d->testing_p)
15273 return true;
15275 /* The case where (location == 0) is a no-op for both big- and little-endian,
15276 and is removed by the mid-end at optimization levels -O1 and higher.
15278 We don't need a big-endian lane correction for SVE; see the comment
15279 at the head of aarch64-sve.md for details. */
15280 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15282 /* After setup, we want the high elements of the first vector (stored
15283 at the LSB end of the register), and the low elements of the second
15284 vector (stored at the MSB end of the register). So swap. */
15285 std::swap (d->op0, d->op1);
15286 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15287 to_constant () is safe since this is restricted to Advanced SIMD
15288 vectors. */
15289 location = d->perm.length ().to_constant () - location;
15292 offset = GEN_INT (location);
15293 emit_set_insn (d->target,
15294 gen_rtx_UNSPEC (d->vmode,
15295 gen_rtvec (3, d->op0, d->op1, offset),
15296 UNSPEC_EXT));
15297 return true;
15300 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15301 within each 64-bit, 32-bit or 16-bit granule. */
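/* For example, on V8QImode the REV32 permutation is { 3, 2, 1, 0, 7, 6, 5, 4 }:
   the bytes within each 32-bit word are reversed, but the words keep their
   order. */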
15303 static bool
15304 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15306 HOST_WIDE_INT diff;
15307 unsigned int i, size, unspec;
15308 machine_mode pred_mode;
15310 if (d->vec_flags == VEC_SVE_PRED
15311 || !d->one_vector_p
15312 || !d->perm[0].is_constant (&diff))
15313 return false;
15315 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15316 if (size == 8)
15318 unspec = UNSPEC_REV64;
15319 pred_mode = VNx2BImode;
15321 else if (size == 4)
15323 unspec = UNSPEC_REV32;
15324 pred_mode = VNx4BImode;
15326 else if (size == 2)
15328 unspec = UNSPEC_REV16;
15329 pred_mode = VNx8BImode;
15331 else
15332 return false;
15334 unsigned int step = diff + 1;
15335 for (i = 0; i < step; ++i)
15336 if (!d->perm.series_p (i, step, diff - i, step))
15337 return false;
15339 /* Success! */
15340 if (d->testing_p)
15341 return true;
15343 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15344 if (d->vec_flags == VEC_SVE_DATA)
15346 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15347 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15348 UNSPEC_MERGE_PTRUE);
15350 emit_set_insn (d->target, src);
15351 return true;
15354 /* Recognize patterns for the REV insn, which reverses elements within
15355 a full vector. */
15357 static bool
15358 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15360 poly_uint64 nelt = d->perm.length ();
15362 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15363 return false;
15365 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15366 return false;
15368 /* Success! */
15369 if (d->testing_p)
15370 return true;
15372 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15373 emit_set_insn (d->target, src);
15374 return true;
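/* Recognize patterns in which every element of the result is a copy of a
   single, constant-indexed element of the first input vector, which we can
   implement with a DUP instruction. */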
15377 static bool
15378 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15380 rtx out = d->target;
15381 rtx in0;
15382 HOST_WIDE_INT elt;
15383 machine_mode vmode = d->vmode;
15384 rtx lane;
15386 if (d->vec_flags == VEC_SVE_PRED
15387 || d->perm.encoding ().encoded_nelts () != 1
15388 || !d->perm[0].is_constant (&elt))
15389 return false;
15391 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15392 return false;
15394 /* Success! */
15395 if (d->testing_p)
15396 return true;
15398 /* The generic preparation in aarch64_expand_vec_perm_const_1
15399 swaps the operand order and the permute indices if it finds
15400 d->perm[0] to be in the second operand. Thus, we can always
15401 use d->op0 and need not do any extra arithmetic to get the
15402 correct lane number. */
15403 in0 = d->op0;
15404 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15406 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15407 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15408 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15409 return true;
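/* Try to implement D using an Advanced SIMD TBL instruction, with the
   constant permute indices loaded into a register as the selector. */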
15412 static bool
15413 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15415 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15416 machine_mode vmode = d->vmode;
15418 /* Make sure that the indices are constant. */
15419 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15420 for (unsigned int i = 0; i < encoded_nelts; ++i)
15421 if (!d->perm[i].is_constant ())
15422 return false;
15424 if (d->testing_p)
15425 return true;
15427 /* Generic code will try constant permutation twice. Once with the
15428 original mode and again with the elements lowered to QImode.
15429 So wait and don't do the selector expansion ourselves. */
15430 if (vmode != V8QImode && vmode != V16QImode)
15431 return false;
15433 /* to_constant is safe since this routine is specific to Advanced SIMD
15434 vectors. */
15435 unsigned int nelt = d->perm.length ().to_constant ();
15436 for (unsigned int i = 0; i < nelt; ++i)
15437 /* If big-endian and two vectors we end up with a weird mixed-endian
15438 mode on NEON. Reverse the index within each word but not the word
15439 itself. to_constant is safe because we checked is_constant above. */
15440 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15441 ? d->perm[i].to_constant () ^ (nelt - 1)
15442 : d->perm[i].to_constant ());
15444 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15445 sel = force_reg (vmode, sel);
15447 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15448 return true;
15451 /* Try to implement D using an SVE TBL instruction. */
15453 static bool
15454 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15456 unsigned HOST_WIDE_INT nelt;
15458 /* Permuting two variable-length vectors could overflow the
15459 index range. */
15460 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15461 return false;
15463 if (d->testing_p)
15464 return true;
15466 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15467 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15468 if (d->one_vector_p)
15469 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
15470 else
15471 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15472 return true;
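/* Try to expand the constant permute described by D, using the
   pattern-specific expanders above where possible and falling back to a
   TBL-based expansion. Return true on success. */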
15475 static bool
15476 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15478 /* The pattern matching functions above are written to look for a small
15479 number to begin the sequence (0, 1, N/2). If we begin with an index
15480 from the second operand, we can swap the operands. */
15481 poly_int64 nelt = d->perm.length ();
15482 if (known_ge (d->perm[0], nelt))
15484 d->perm.rotate_inputs (1);
15485 std::swap (d->op0, d->op1);
15488 if ((d->vec_flags == VEC_ADVSIMD
15489 || d->vec_flags == VEC_SVE_DATA
15490 || d->vec_flags == VEC_SVE_PRED)
15491 && known_gt (nelt, 1))
15493 if (aarch64_evpc_rev_local (d))
15494 return true;
15495 else if (aarch64_evpc_rev_global (d))
15496 return true;
15497 else if (aarch64_evpc_ext (d))
15498 return true;
15499 else if (aarch64_evpc_dup (d))
15500 return true;
15501 else if (aarch64_evpc_zip (d))
15502 return true;
15503 else if (aarch64_evpc_uzp (d))
15504 return true;
15505 else if (aarch64_evpc_trn (d))
15506 return true;
15507 if (d->vec_flags == VEC_SVE_DATA)
15508 return aarch64_evpc_sve_tbl (d);
15509 else if (d->vec_flags == VEC_ADVSIMD)
15510 return aarch64_evpc_tbl (d);
15512 return false;
15515 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15517 static bool
15518 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15519 rtx op1, const vec_perm_indices &sel)
15521 struct expand_vec_perm_d d;
15523 /* Check whether the mask can be applied to a single vector. */
15524 if (sel.ninputs () == 1
15525 || (op0 && rtx_equal_p (op0, op1)))
15526 d.one_vector_p = true;
15527 else if (sel.all_from_input_p (0))
15529 d.one_vector_p = true;
15530 op1 = op0;
15532 else if (sel.all_from_input_p (1))
15534 d.one_vector_p = true;
15535 op0 = op1;
15537 else
15538 d.one_vector_p = false;
15540 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15541 sel.nelts_per_input ());
15542 d.vmode = vmode;
15543 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15544 d.target = target;
15545 d.op0 = op0;
15546 d.op1 = op1;
15547 d.testing_p = !target;
15549 if (!d.testing_p)
15550 return aarch64_expand_vec_perm_const_1 (&d);
15552 rtx_insn *last = get_last_insn ();
15553 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15554 gcc_assert (last == get_last_insn ());
15556 return ret;
15559 /* Generate a byte permute mask for a register of mode MODE,
15560 which has NUNITS units. */
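/* For example, for V4SImode the mask is { 3, 2, 1, 0, 7, 6, 5, 4,
   11, 10, 9, 8, 15, 14, 13, 12 }: the bytes within each 32-bit element
   are reversed. */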
15562 rtx
15563 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15565 /* We have to reverse each vector because we don't have
15566 a permuted load that can reverse-load according to ABI rules. */
15567 rtx mask;
15568 rtvec v = rtvec_alloc (16);
15569 unsigned int i, j;
15570 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15572 gcc_assert (BYTES_BIG_ENDIAN);
15573 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15575 for (i = 0; i < nunits; i++)
15576 for (j = 0; j < usize; j++)
15577 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15578 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15579 return force_reg (V16QImode, mask);
15582 /* Return true if X is a valid second operand for the SVE instruction
15583 that implements integer comparison OP_CODE. */
15585 static bool
15586 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15588 if (register_operand (x, VOIDmode))
15589 return true;
15591 switch (op_code)
15593 case LTU:
15594 case LEU:
15595 case GEU:
15596 case GTU:
15597 return aarch64_sve_cmp_immediate_p (x, false);
15598 case LT:
15599 case LE:
15600 case GE:
15601 case GT:
15602 case NE:
15603 case EQ:
15604 return aarch64_sve_cmp_immediate_p (x, true);
15605 default:
15606 gcc_unreachable ();
15610 /* Use predicated SVE instructions to implement the equivalent of:
15612 (set TARGET OP)
15614 given that PTRUE is an all-true predicate of the appropriate mode. */
15616 static void
15617 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15619 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15620 gen_rtvec (2, ptrue, op),
15621 UNSPEC_MERGE_PTRUE);
15622 rtx_insn *insn = emit_set_insn (target, unspec);
15623 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15626 /* Likewise, but also clobber the condition codes. */
15628 static void
15629 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15631 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15632 gen_rtvec (2, ptrue, op),
15633 UNSPEC_MERGE_PTRUE);
15634 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15635 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15638 /* Return the UNSPEC_COND_* code for comparison CODE. */
15640 static unsigned int
15641 aarch64_unspec_cond_code (rtx_code code)
15643 switch (code)
15645 case NE:
15646 return UNSPEC_COND_NE;
15647 case EQ:
15648 return UNSPEC_COND_EQ;
15649 case LT:
15650 return UNSPEC_COND_LT;
15651 case GT:
15652 return UNSPEC_COND_GT;
15653 case LE:
15654 return UNSPEC_COND_LE;
15655 case GE:
15656 return UNSPEC_COND_GE;
15657 default:
15658 gcc_unreachable ();
15662 /* Emit:
15664 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15666 where <X> is the operation associated with comparison CODE. This form
15667 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15668 semantics, such as when PRED might not be all-true and when comparing
15669 inactive lanes could have side effects. */
15671 static void
15672 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15673 rtx pred, rtx op0, rtx op1)
15675 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15676 gen_rtvec (3, pred, op0, op1),
15677 aarch64_unspec_cond_code (code));
15678 emit_set_insn (target, unspec);
15681 /* Expand an SVE integer comparison using the SVE equivalent of:
15683 (set TARGET (CODE OP0 OP1)). */
15685 void
15686 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15688 machine_mode pred_mode = GET_MODE (target);
15689 machine_mode data_mode = GET_MODE (op0);
15691 if (!aarch64_sve_cmp_operand_p (code, op1))
15692 op1 = force_reg (data_mode, op1);
15694 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15695 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15696 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15699 /* Emit the SVE equivalent of:
15701 (set TMP1 (CODE1 OP0 OP1))
15702 (set TMP2 (CODE2 OP0 OP1))
15703 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15705 PTRUE is an all-true predicate with the same mode as TARGET. */
15707 static void
15708 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15709 rtx ptrue, rtx op0, rtx op1)
15711 machine_mode pred_mode = GET_MODE (ptrue);
15712 rtx tmp1 = gen_reg_rtx (pred_mode);
15713 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15714 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15715 rtx tmp2 = gen_reg_rtx (pred_mode);
15716 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15717 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15718 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15721 /* Emit the SVE equivalent of:
15723 (set TMP (CODE OP0 OP1))
15724 (set TARGET (not TMP))
15726 PTRUE is an all-true predicate with the same mode as TARGET. */
15728 static void
15729 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15730 rtx op0, rtx op1)
15732 machine_mode pred_mode = GET_MODE (ptrue);
15733 rtx tmp = gen_reg_rtx (pred_mode);
15734 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15735 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15736 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15739 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15741 (set TARGET (CODE OP0 OP1))
15743 If CAN_INVERT_P is true, the caller can also handle inverted results;
15744 return true if the result is in fact inverted. */
15746 bool
15747 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15748 rtx op0, rtx op1, bool can_invert_p)
15750 machine_mode pred_mode = GET_MODE (target);
15751 machine_mode data_mode = GET_MODE (op0);
15753 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15754 switch (code)
15756 case UNORDERED:
15757 /* UNORDERED has no immediate form. */
15758 op1 = force_reg (data_mode, op1);
15759 /* fall through */
15760 case LT:
15761 case LE:
15762 case GT:
15763 case GE:
15764 case EQ:
15765 case NE:
15767 /* There is native support for the comparison. */
15768 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15769 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15770 return false;
15773 case LTGT:
15774 /* This is a trapping operation (LT or GT). */
15775 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15776 return false;
15778 case UNEQ:
15779 if (!flag_trapping_math)
15781 /* This would trap for signaling NaNs. */
15782 op1 = force_reg (data_mode, op1);
15783 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15784 return false;
15786 /* fall through */
15787 case UNLT:
15788 case UNLE:
15789 case UNGT:
15790 case UNGE:
15791 if (flag_trapping_math)
15793 /* Work out which elements are ordered. */
15794 rtx ordered = gen_reg_rtx (pred_mode);
15795 op1 = force_reg (data_mode, op1);
15796 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15798 /* Test the opposite condition for the ordered elements,
15799 then invert the result. */
15800 if (code == UNEQ)
15801 code = NE;
15802 else
15803 code = reverse_condition_maybe_unordered (code);
15804 if (can_invert_p)
15806 aarch64_emit_sve_predicated_cond (target, code,
15807 ordered, op0, op1);
15808 return true;
15810 rtx tmp = gen_reg_rtx (pred_mode);
15811 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15812 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15813 return false;
15815 break;
15817 case ORDERED:
15818 /* ORDERED has no immediate form. */
15819 op1 = force_reg (data_mode, op1);
15820 break;
15822 default:
15823 gcc_unreachable ();
15826 /* There is native support for the inverse comparison. */
15827 code = reverse_condition_maybe_unordered (code);
15828 if (can_invert_p)
15830 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15831 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15832 return true;
15834 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15835 return false;
15838 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15839 of the data being selected and CMP_MODE is the mode of the values being
15840 compared. */
15842 void
15843 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15844 rtx *ops)
15846 machine_mode pred_mode
15847 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15848 GET_MODE_SIZE (cmp_mode)).require ();
15849 rtx pred = gen_reg_rtx (pred_mode);
15850 if (FLOAT_MODE_P (cmp_mode))
15852 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15853 ops[4], ops[5], true))
15854 std::swap (ops[1], ops[2]);
15856 else
15857 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15859 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15860 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15863 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15864 true. However, due to issues with register allocation it is preferable
15865 to avoid tying integer scalar and FP scalar modes. Executing integer
15866 operations in general registers is better than treating them as scalar
15867 vector operations. This reduces latency and avoids redundant int<->FP
15868 moves. So tie modes if they are either the same class, or vector modes
15869 with other vector modes, vector structs or any scalar mode. */
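/* For example, DImode and DFmode are not tied (different classes and
   neither is a vector mode), whereas V4SImode and V2DFmode are tied
   because both are vector data modes. */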
15871 static bool
15872 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15874 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15875 return true;
15877 /* We specifically want to allow elements of "structure" modes to
15878 be tieable to the structure. This more general condition allows
15879 other rarer situations too. The reason we don't extend this to
15880 predicate modes is that there are no predicate structure modes
15881 nor any specific instructions for extracting part of a predicate
15882 register. */
15883 if (aarch64_vector_data_mode_p (mode1)
15884 && aarch64_vector_data_mode_p (mode2))
15885 return true;
15887 /* Also allow any scalar modes with vectors. */
15888 if (aarch64_vector_mode_supported_p (mode1)
15889 || aarch64_vector_mode_supported_p (mode2))
15890 return true;
15892 return false;
15895 /* Return a new RTX holding the result of moving POINTER forward by
15896 AMOUNT bytes. */
15898 static rtx
15899 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15901 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15903 return adjust_automodify_address (pointer, GET_MODE (pointer),
15904 next, amount);
15907 /* Return a new RTX holding the result of moving POINTER forward by the
15908 size of the mode it points to. */
15910 static rtx
15911 aarch64_progress_pointer (rtx pointer)
15913 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15916 /* Copy one MODE-sized block from SRC to DST, then advance SRC and DST by
15917 the size of MODE. */
15919 static void
15920 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15921 machine_mode mode)
15923 rtx reg = gen_reg_rtx (mode);
15925 /* "Cast" the pointers to the correct mode. */
15926 *src = adjust_address (*src, mode, 0);
15927 *dst = adjust_address (*dst, mode, 0);
15928 /* Emit the memcpy. */
15929 emit_move_insn (reg, *src);
15930 emit_move_insn (*dst, reg);
15931 /* Move the pointers forward. */
15932 *src = aarch64_progress_pointer (*src);
15933 *dst = aarch64_progress_pointer (*dst);
15936 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15937 we succeed, otherwise return false. */
15939 bool
15940 aarch64_expand_movmem (rtx *operands)
15942 int n, mode_bits;
15943 rtx dst = operands[0];
15944 rtx src = operands[1];
15945 rtx base;
15946 machine_mode cur_mode = BLKmode, next_mode;
15947 bool speed_p = !optimize_function_for_size_p (cfun);
15949 /* When optimizing for size, give a better estimate of the length of a
15950 memcpy call, but use the default otherwise. Moves larger than 8 bytes
15951 will always require an even number of instructions to do now, and each
15952 operation requires both a load and a store, so divide the max number by 2. */
15953 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
15955 /* We can't do anything smart if the amount to copy is not constant. */
15956 if (!CONST_INT_P (operands[2]))
15957 return false;
15959 n = INTVAL (operands[2]);
15961 /* Try to keep the number of instructions low. For all cases we will do at
15962 most two moves for the residual amount, since we'll always overlap the
15963 remainder. */
15964 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
15965 return false;
15967 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
15968 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
15970 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
15971 src = adjust_automodify_address (src, VOIDmode, base, 0);
15973 /* Convert n to bits to make the rest of the code simpler. */
15974 n = n * BITS_PER_UNIT;
15976 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
15977 larger than TImode, but we should not use them for loads/stores here. */
15978 const int copy_limit = GET_MODE_BITSIZE (TImode);
15980 while (n > 0)
15982 /* Find the largest mode in which to do the copy without over-reading
15983 or writing. */
15984 opt_scalar_int_mode mode_iter;
15985 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
15986 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
15987 cur_mode = mode_iter.require ();
15989 gcc_assert (cur_mode != BLKmode);
15991 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
15992 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
15994 n -= mode_bits;
15996 /* Do certain trailing copies as overlapping if it's going to be
15997 cheaper, i.e. fewer instructions. For instance, for a 15 byte copy
15998 it's more efficient to do two overlapping 8 byte copies than copies
15999 of 8 + 4 + 2 + 1 bytes. */
16000 if (n > 0 && n <= 8 * BITS_PER_UNIT)
16002 next_mode = smallest_mode_for_size (n, MODE_INT);
16003 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16004 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16005 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16006 n = n_bits;
16010 return true;
16013 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16014 SImode stores. Handle the case when the constant has identical
16015 bottom and top halves. This is beneficial when the two stores can be
16016 merged into an STP and we avoid synthesising potentially expensive
16017 immediates twice. Return true if such a split is possible. */
16019 bool
16020 aarch64_split_dimode_const_store (rtx dst, rtx src)
16022 rtx lo = gen_lowpart (SImode, src);
16023 rtx hi = gen_highpart_mode (SImode, DImode, src);
16025 bool size_p = optimize_function_for_size_p (cfun);
16027 if (!rtx_equal_p (lo, hi))
16028 return false;
16030 unsigned int orig_cost
16031 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16032 unsigned int lo_cost
16033 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16035 /* We want to transform:
16036 MOV x1, 49370
16037 MOVK x1, 0x140, lsl 16
16038 MOVK x1, 0xc0da, lsl 32
16039 MOVK x1, 0x140, lsl 48
16040 STR x1, [x0]
16041 into:
16042 MOV w1, 49370
16043 MOVK w1, 0x140, lsl 16
16044 STP w1, w1, [x0]
16045 So we want to perform this only when we save two instructions
16046 or more. When optimizing for size, however, accept any code size
16047 savings we can. */
16048 if (size_p && orig_cost <= lo_cost)
16049 return false;
16051 if (!size_p
16052 && (orig_cost <= lo_cost + 1))
16053 return false;
16055 rtx mem_lo = adjust_address (dst, SImode, 0);
16056 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16057 return false;
16059 rtx tmp_reg = gen_reg_rtx (SImode);
16060 aarch64_expand_mov_immediate (tmp_reg, lo);
16061 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16062 /* Don't emit an explicit store pair as this may not always be profitable.
16063 Let the sched-fusion logic decide whether to merge them. */
16064 emit_move_insn (mem_lo, tmp_reg);
16065 emit_move_insn (mem_hi, tmp_reg);
16067 return true;
16070 /* Generate RTL for a conditional branch with rtx comparison CODE in
16071 mode CC_MODE. The destination of the unlikely conditional branch
16072 is LABEL_REF. */
16074 void
16075 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
16076 rtx label_ref)
16078 rtx x;
16079 x = gen_rtx_fmt_ee (code, VOIDmode,
16080 gen_rtx_REG (cc_mode, CC_REGNUM),
16081 const0_rtx);
16083 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16084 gen_rtx_LABEL_REF (VOIDmode, label_ref),
16085 pc_rtx);
16086 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16089 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16091 OP1 represents the TImode source operand 1
16092 OP2 represents the TImode source operand 2
16093 LOW_DEST represents the low half (DImode) of TImode operand 0
16094 LOW_IN1 represents the low half (DImode) of TImode operand 1
16095 LOW_IN2 represents the low half (DImode) of TImode operand 2
16096 HIGH_DEST represents the high half (DImode) of TImode operand 0
16097 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16098 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16100 void
16101 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16102 rtx *low_in1, rtx *low_in2,
16103 rtx *high_dest, rtx *high_in1,
16104 rtx *high_in2)
16106 *low_dest = gen_reg_rtx (DImode);
16107 *low_in1 = gen_lowpart (DImode, op1);
16108 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16109 subreg_lowpart_offset (DImode, TImode));
16110 *high_dest = gen_reg_rtx (DImode);
16111 *high_in1 = gen_highpart (DImode, op1);
16112 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16113 subreg_highpart_offset (DImode, TImode));
16116 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16118 This function differs from 'aarch64_addti_scratch_regs' in that
16119 OP1 can be an immediate constant (zero). We must call
16120 subreg_highpart_offset with DImode and TImode arguments, otherwise
16121 VOIDmode will be used for the const_int, which generates an internal
16122 error from subreg_size_highpart_offset, which does not expect a size of zero.
16124 OP1 represents the TImode source operand 1
16125 OP2 represents the TImode source operand 2
16126 LOW_DEST represents the low half (DImode) of TImode operand 0
16127 LOW_IN1 represents the low half (DImode) of TImode operand 1
16128 LOW_IN2 represents the low half (DImode) of TImode operand 2
16129 HIGH_DEST represents the high half (DImode) of TImode operand 0
16130 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16131 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16134 void
16135 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16136 rtx *low_in1, rtx *low_in2,
16137 rtx *high_dest, rtx *high_in1,
16138 rtx *high_in2)
16140 *low_dest = gen_reg_rtx (DImode);
16141 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16142 subreg_lowpart_offset (DImode, TImode));
16144 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16145 subreg_lowpart_offset (DImode, TImode));
16146 *high_dest = gen_reg_rtx (DImode);
16148 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16149 subreg_highpart_offset (DImode, TImode));
16150 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16151 subreg_highpart_offset (DImode, TImode));
16154 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16156 OP0 represents the TImode destination operand 0
16157 LOW_DEST represents the low half (DImode) of TImode operand 0
16158 LOW_IN1 represents the low half (DImode) of TImode operand 1
16159 LOW_IN2 represents the low half (DImode) of TImode operand 2
16160 HIGH_DEST represents the high half (DImode) of TImode operand 0
16161 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16162 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16164 void
16165 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
16166 rtx low_in2, rtx high_dest, rtx high_in1,
16167 rtx high_in2)
16169 if (low_in2 == const0_rtx)
16171 low_dest = low_in1;
16172 emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
16173 force_reg (DImode, high_in2)));
16175 else
16177 if (CONST_INT_P (low_in2))
16179 low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
16180 high_in2 = force_reg (DImode, high_in2);
16181 emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
16183 else
16184 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
16185 emit_insn (gen_subdi3_carryinCV (high_dest,
16186 force_reg (DImode, high_in1),
16187 high_in2));
16190 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
16191 emit_move_insn (gen_highpart (DImode, op0), high_dest);
16195 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16197 static unsigned HOST_WIDE_INT
16198 aarch64_asan_shadow_offset (void)
16200 return (HOST_WIDE_INT_1 << 36);
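/* Implement TARGET_GEN_CCMP_FIRST. Expand the first comparison of a
   conditional-compare sequence: put the preparation statements in *PREP_SEQ
   and the compare itself in *GEN_SEQ, and return the comparison of the CC
   register against zero, or NULL_RTX if the comparison cannot be handled. */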
16203 static rtx
16204 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16205 int code, tree treeop0, tree treeop1)
16207 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16208 rtx op0, op1;
16209 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16210 insn_code icode;
16211 struct expand_operand ops[4];
16213 start_sequence ();
16214 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16216 op_mode = GET_MODE (op0);
16217 if (op_mode == VOIDmode)
16218 op_mode = GET_MODE (op1);
16220 switch (op_mode)
16222 case E_QImode:
16223 case E_HImode:
16224 case E_SImode:
16225 cmp_mode = SImode;
16226 icode = CODE_FOR_cmpsi;
16227 break;
16229 case E_DImode:
16230 cmp_mode = DImode;
16231 icode = CODE_FOR_cmpdi;
16232 break;
16234 case E_SFmode:
16235 cmp_mode = SFmode;
16236 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16237 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16238 break;
16240 case E_DFmode:
16241 cmp_mode = DFmode;
16242 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16243 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16244 break;
16246 default:
16247 end_sequence ();
16248 return NULL_RTX;
16251 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16252 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16253 if (!op0 || !op1)
16255 end_sequence ();
16256 return NULL_RTX;
16258 *prep_seq = get_insns ();
16259 end_sequence ();
16261 create_fixed_operand (&ops[0], op0);
16262 create_fixed_operand (&ops[1], op1);
16264 start_sequence ();
16265 if (!maybe_expand_insn (icode, 2, ops))
16267 end_sequence ();
16268 return NULL_RTX;
16270 *gen_seq = get_insns ();
16271 end_sequence ();
16273 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16274 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
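/* Implement TARGET_GEN_CCMP_NEXT. Expand a conditional compare that chains
   the comparison CMP_CODE of TREEOP0 and TREEOP1 onto the result of the
   previous comparison PREV under the AND/IOR operation BIT_CODE, appending
   to *PREP_SEQ and *GEN_SEQ. Return the new comparison, or NULL_RTX on
   failure. */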
16277 static rtx
16278 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16279 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16281 rtx op0, op1, target;
16282 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16283 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16284 insn_code icode;
16285 struct expand_operand ops[6];
16286 int aarch64_cond;
16288 push_to_sequence (*prep_seq);
16289 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16291 op_mode = GET_MODE (op0);
16292 if (op_mode == VOIDmode)
16293 op_mode = GET_MODE (op1);
16295 switch (op_mode)
16297 case E_QImode:
16298 case E_HImode:
16299 case E_SImode:
16300 cmp_mode = SImode;
16301 icode = CODE_FOR_ccmpsi;
16302 break;
16304 case E_DImode:
16305 cmp_mode = DImode;
16306 icode = CODE_FOR_ccmpdi;
16307 break;
16309 case E_SFmode:
16310 cmp_mode = SFmode;
16311 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16312 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16313 break;
16315 case E_DFmode:
16316 cmp_mode = DFmode;
16317 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16318 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16319 break;
16321 default:
16322 end_sequence ();
16323 return NULL_RTX;
16326 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16327 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16328 if (!op0 || !op1)
16330 end_sequence ();
16331 return NULL_RTX;
16333 *prep_seq = get_insns ();
16334 end_sequence ();
16336 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16337 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16339 if (bit_code != AND)
16341 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16342 GET_MODE (XEXP (prev, 0))),
16343 VOIDmode, XEXP (prev, 0), const0_rtx);
16344 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16347 create_fixed_operand (&ops[0], XEXP (prev, 0));
16348 create_fixed_operand (&ops[1], target);
16349 create_fixed_operand (&ops[2], op0);
16350 create_fixed_operand (&ops[3], op1);
16351 create_fixed_operand (&ops[4], prev);
16352 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16354 push_to_sequence (*gen_seq);
16355 if (!maybe_expand_insn (icode, 6, ops))
16357 end_sequence ();
16358 return NULL_RTX;
16361 *gen_seq = get_insns ();
16362 end_sequence ();
16364 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16367 #undef TARGET_GEN_CCMP_FIRST
16368 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16370 #undef TARGET_GEN_CCMP_NEXT
16371 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16373 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16374 instruction fusion of some sort. */
16376 static bool
16377 aarch64_macro_fusion_p (void)
16379 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16383 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16384 should be kept together during scheduling. */
16386 static bool
16387 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16389 rtx set_dest;
16390 rtx prev_set = single_set (prev);
16391 rtx curr_set = single_set (curr);
16392 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16393 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16395 if (!aarch64_macro_fusion_p ())
16396 return false;
16398 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16400 /* We are trying to match:
16401 prev (mov) == (set (reg r0) (const_int imm16))
16402 curr (movk) == (set (zero_extract (reg r0)
16403 (const_int 16)
16404 (const_int 16))
16405 (const_int imm16_1)) */
16407 set_dest = SET_DEST (curr_set);
16409 if (GET_CODE (set_dest) == ZERO_EXTRACT
16410 && CONST_INT_P (SET_SRC (curr_set))
16411 && CONST_INT_P (SET_SRC (prev_set))
16412 && CONST_INT_P (XEXP (set_dest, 2))
16413 && INTVAL (XEXP (set_dest, 2)) == 16
16414 && REG_P (XEXP (set_dest, 0))
16415 && REG_P (SET_DEST (prev_set))
16416 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16418 return true;
16422 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16425 /* We're trying to match:
16426 prev (adrp) == (set (reg r1)
16427 (high (symbol_ref ("SYM"))))
16428 curr (add) == (set (reg r0)
16429 (lo_sum (reg r1)
16430 (symbol_ref ("SYM"))))
16431 Note that r0 need not necessarily be the same as r1, especially
16432 during pre-regalloc scheduling. */
16434 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16435 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16437 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16438 && REG_P (XEXP (SET_SRC (curr_set), 0))
16439 && REGNO (XEXP (SET_SRC (curr_set), 0))
16440 == REGNO (SET_DEST (prev_set))
16441 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16442 XEXP (SET_SRC (curr_set), 1)))
16443 return true;
16447 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16450 /* We're trying to match:
16451 prev (movk) == (set (zero_extract (reg r0)
16452 (const_int 16)
16453 (const_int 32))
16454 (const_int imm16_1))
16455 curr (movk) == (set (zero_extract (reg r0)
16456 (const_int 16)
16457 (const_int 48))
16458 (const_int imm16_2)) */
16460 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16461 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16462 && REG_P (XEXP (SET_DEST (prev_set), 0))
16463 && REG_P (XEXP (SET_DEST (curr_set), 0))
16464 && REGNO (XEXP (SET_DEST (prev_set), 0))
16465 == REGNO (XEXP (SET_DEST (curr_set), 0))
16466 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16467 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16468 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16469 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16470 && CONST_INT_P (SET_SRC (prev_set))
16471 && CONST_INT_P (SET_SRC (curr_set)))
16472 return true;
16475 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16477 /* We're trying to match:
16478 prev (adrp) == (set (reg r0)
16479 (high (symbol_ref ("SYM"))))
16480 curr (ldr) == (set (reg r1)
16481 (mem (lo_sum (reg r0)
16482 (symbol_ref ("SYM")))))
16484 curr (ldr) == (set (reg r1)
16485 (zero_extend (mem
16486 (lo_sum (reg r0)
16487 (symbol_ref ("SYM")))))) */
16488 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16489 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16491 rtx curr_src = SET_SRC (curr_set);
16493 if (GET_CODE (curr_src) == ZERO_EXTEND)
16494 curr_src = XEXP (curr_src, 0);
16496 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16497 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16498 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16499 == REGNO (SET_DEST (prev_set))
16500 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16501 XEXP (SET_SRC (prev_set), 0)))
16502 return true;
16506 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16507 && aarch_crypto_can_dual_issue (prev, curr))
16508 return true;
16510 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16511 && any_condjump_p (curr))
16513 enum attr_type prev_type = get_attr_type (prev);
16515 unsigned int condreg1, condreg2;
16516 rtx cc_reg_1;
16517 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16518 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16520 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16521 && prev
16522 && modified_in_p (cc_reg_1, prev))
16524 /* FIXME: this misses some instructions which are considered simple
16525 arithmetic for ThunderX. Simple shifts are missed here. */
16526 if (prev_type == TYPE_ALUS_SREG
16527 || prev_type == TYPE_ALUS_IMM
16528 || prev_type == TYPE_LOGICS_REG
16529 || prev_type == TYPE_LOGICS_IMM)
16530 return true;
16534 if (prev_set
16535 && curr_set
16536 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16537 && any_condjump_p (curr))
16539 /* We're trying to match:
16540 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16541 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16542 (const_int 0))
16543 (label_ref ("SYM"))
16544 (pc)) */
16545 if (SET_DEST (curr_set) == (pc_rtx)
16546 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16547 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16548 && REG_P (SET_DEST (prev_set))
16549 && REGNO (SET_DEST (prev_set))
16550 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16552 /* Fuse ALU operations followed by a conditional branch instruction. */
16553 switch (get_attr_type (prev))
16555 case TYPE_ALU_IMM:
16556 case TYPE_ALU_SREG:
16557 case TYPE_ADC_REG:
16558 case TYPE_ADC_IMM:
16559 case TYPE_ADCS_REG:
16560 case TYPE_ADCS_IMM:
16561 case TYPE_LOGIC_REG:
16562 case TYPE_LOGIC_IMM:
16563 case TYPE_CSEL:
16564 case TYPE_ADR:
16565 case TYPE_MOV_IMM:
16566 case TYPE_SHIFT_REG:
16567 case TYPE_SHIFT_IMM:
16568 case TYPE_BFM:
16569 case TYPE_RBIT:
16570 case TYPE_REV:
16571 case TYPE_EXTEND:
16572 return true;
16574 default:;
16579 return false;
16582 /* Return true iff the instruction fusion described by OP is enabled. */
16584 bool
16585 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16587 return (aarch64_tune_params.fusible_ops & op) != 0;
16590 /* If MEM is in the form of [base+offset], extract the two parts of the
16591 address and store them in BASE and OFFSET; otherwise return false
16592 after clearing BASE and OFFSET. */
16594 bool
16595 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16597 rtx addr;
16599 gcc_assert (MEM_P (mem));
16601 addr = XEXP (mem, 0);
16603 if (REG_P (addr))
16605 *base = addr;
16606 *offset = const0_rtx;
16607 return true;
16610 if (GET_CODE (addr) == PLUS
16611 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16613 *base = XEXP (addr, 0);
16614 *offset = XEXP (addr, 1);
16615 return true;
16618 *base = NULL_RTX;
16619 *offset = NULL_RTX;
16621 return false;
16624 /* Types for scheduling fusion. */
16625 enum sched_fusion_type
16627 SCHED_FUSION_NONE = 0,
16628 SCHED_FUSION_LD_SIGN_EXTEND,
16629 SCHED_FUSION_LD_ZERO_EXTEND,
16630 SCHED_FUSION_LD,
16631 SCHED_FUSION_ST,
16632 SCHED_FUSION_NUM
16635 /* If INSN is a load or store whose address is in the form [base+offset],
16636 extract the two parts and store them in BASE and OFFSET. Return the
16637 scheduling fusion type of INSN. */
16639 static enum sched_fusion_type
16640 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16642 rtx x, dest, src;
16643 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16645 gcc_assert (INSN_P (insn));
16646 x = PATTERN (insn);
16647 if (GET_CODE (x) != SET)
16648 return SCHED_FUSION_NONE;
16650 src = SET_SRC (x);
16651 dest = SET_DEST (x);
16653 machine_mode dest_mode = GET_MODE (dest);
16655 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16656 return SCHED_FUSION_NONE;
16658 if (GET_CODE (src) == SIGN_EXTEND)
16660 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16661 src = XEXP (src, 0);
16662 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16663 return SCHED_FUSION_NONE;
16665 else if (GET_CODE (src) == ZERO_EXTEND)
16667 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16668 src = XEXP (src, 0);
16669 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16670 return SCHED_FUSION_NONE;
16673 if (GET_CODE (src) == MEM && REG_P (dest))
16674 extract_base_offset_in_addr (src, base, offset);
16675 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16677 fusion = SCHED_FUSION_ST;
16678 extract_base_offset_in_addr (dest, base, offset);
16680 else
16681 return SCHED_FUSION_NONE;
16683 if (*base == NULL_RTX || *offset == NULL_RTX)
16684 fusion = SCHED_FUSION_NONE;
16686 return fusion;
16689 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16691 Currently we only support fusing ldr or str instructions, so FUSION_PRI
16692 and PRI are only calculated for these instructions. For other instructions,
16693 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
16694 other instruction types can be added by returning different priorities.
16696 It's important that irrelevant instructions get the largest FUSION_PRI. */
16698 static void
16699 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16700 int *fusion_pri, int *pri)
16702 int tmp, off_val;
16703 rtx base, offset;
16704 enum sched_fusion_type fusion;
16706 gcc_assert (INSN_P (insn));
16708 tmp = max_pri - 1;
16709 fusion = fusion_load_store (insn, &base, &offset);
16710 if (fusion == SCHED_FUSION_NONE)
16712 *pri = tmp;
16713 *fusion_pri = tmp;
16714 return;
16717 /* Set FUSION_PRI according to fusion type and base register. */
16718 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16720 /* Calculate PRI. */
16721 tmp /= 2;
16723 /* INSN with smaller offset goes first. */
16724 off_val = (int)(INTVAL (offset));
16725 if (off_val >= 0)
16726 tmp -= (off_val & 0xfffff);
16727 else
16728 tmp += ((- off_val) & 0xfffff);
16730 *pri = tmp;
16731 return;
16734 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16735 Adjust priority of sha1h instructions so they are scheduled before
16736 other SHA1 instructions. */
16738 static int
16739 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16741 rtx x = PATTERN (insn);
16743 if (GET_CODE (x) == SET)
16745 x = SET_SRC (x);
16747 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16748 return priority + 10;
16751 return priority;
16754 /* Given OPERANDS of consecutive load/store, check if we can merge
16755 them into ldp/stp. LOAD is true if they are load instructions.
16756 MODE is the mode of memory operands. */
16758 bool
16759 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16760 machine_mode mode)
16762 HOST_WIDE_INT offval_1, offval_2, msize;
16763 enum reg_class rclass_1, rclass_2;
16764 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16766 if (load)
16768 mem_1 = operands[1];
16769 mem_2 = operands[3];
16770 reg_1 = operands[0];
16771 reg_2 = operands[2];
16772 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16773 if (REGNO (reg_1) == REGNO (reg_2))
16774 return false;
16776 else
16778 mem_1 = operands[0];
16779 mem_2 = operands[2];
16780 reg_1 = operands[1];
16781 reg_2 = operands[3];
16784 /* The mems cannot be volatile. */
16785 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16786 return false;
16788 /* If we have SImode and slow unaligned ldp,
16789 check that the alignment is at least 8 bytes. */
16790 if (mode == SImode
16791 && (aarch64_tune_params.extra_tuning_flags
16792 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16793 && !optimize_size
16794 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16795 return false;
16797 /* Check if the addresses are in the form of [base+offset]. */
16798 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16799 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16800 return false;
16801 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16802 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16803 return false;
16805 /* Check if the bases are the same. */
16806 if (!rtx_equal_p (base_1, base_2))
16807 return false;
16809 /* The operands must be of the same size. */
16810 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16811 GET_MODE_SIZE (GET_MODE (mem_2))));
16813 offval_1 = INTVAL (offset_1);
16814 offval_2 = INTVAL (offset_2);
16815 /* We should only be trying this for fixed-sized modes. There is no
16816 SVE LDP/STP instruction. */
16817 msize = GET_MODE_SIZE (mode).to_constant ();
16818 /* Check if the offsets are consecutive. */
16819 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16820 return false;
16822 /* Check if the addresses are clobbered by the load. */
16823 if (load)
16825 if (reg_mentioned_p (reg_1, mem_1))
16826 return false;
16828 /* In increasing order, the last load can clobber the address. */
16829 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16830 return false;
16833 /* One of the memory accesses must be a mempair operand.
16834 If it is not the first one, they need to be swapped by the
16835 peephole. */
16836 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16837 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16838 return false;
16840 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16841 rclass_1 = FP_REGS;
16842 else
16843 rclass_1 = GENERAL_REGS;
16845 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16846 rclass_2 = FP_REGS;
16847 else
16848 rclass_2 = GENERAL_REGS;
16850 /* Check if the registers are of the same class. */
16851 if (rclass_1 != rclass_2)
16852 return false;
16854 return true;
16857 /* Given OPERANDS of consecutive load/store that can be merged,
16858 swap them if they are not in ascending order. */
16859 void
16860 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16862 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16863 HOST_WIDE_INT offval_1, offval_2;
16865 if (load)
16867 mem_1 = operands[1];
16868 mem_2 = operands[3];
16870 else
16872 mem_1 = operands[0];
16873 mem_2 = operands[2];
16876 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16877 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16879 offval_1 = INTVAL (offset_1);
16880 offval_2 = INTVAL (offset_2);
16882 if (offval_1 > offval_2)
16884 /* Irrespective of whether this is a load or a store,
16885 we do the same swap. */
16886 std::swap (operands[0], operands[2]);
16887 std::swap (operands[1], operands[3]);
16891 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16892 comparison between the two. */
16893 int
16894 aarch64_host_wide_int_compare (const void *x, const void *y)
16896 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16897 * ((const HOST_WIDE_INT *) y));
16900 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16901 other pointing to a REG rtx containing an offset, compare the offsets
16902 of the two pairs.
16904 Return:
16906 1 iff offset (X) > offset (Y)
16907 0 iff offset (X) == offset (Y)
16908 -1 iff offset (X) < offset (Y) */
16909 int
16910 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16912 const rtx * operands_1 = (const rtx *) x;
16913 const rtx * operands_2 = (const rtx *) y;
16914 rtx mem_1, mem_2, base, offset_1, offset_2;
16916 if (MEM_P (operands_1[0]))
16917 mem_1 = operands_1[0];
16918 else
16919 mem_1 = operands_1[1];
16921 if (MEM_P (operands_2[0]))
16922 mem_2 = operands_2[0];
16923 else
16924 mem_2 = operands_2[1];
16926 /* Extract the offsets. */
16927 extract_base_offset_in_addr (mem_1, &base, &offset_1);
16928 extract_base_offset_in_addr (mem_2, &base, &offset_2);
16930 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
16932 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
16935 /* Given OPERANDS of consecutive load/store, check if we can merge
16936 them into ldp/stp by adjusting the offset. LOAD is true if they
16937 are load instructions. MODE is the mode of memory operands.
16939 Given below consecutive stores:
16941 str w1, [xb, 0x100]
16942 str w1, [xb, 0x104]
16943 str w1, [xb, 0x108]
16944 str w1, [xb, 0x10c]
16946 Though the offsets are out of the range supported by stp, we can
16947 still pair them after adjusting the offset, like:
16949 add scratch, xb, 0x100
16950 stp w1, w1, [scratch]
16951 stp w1, w1, [scratch, 0x8]
16953 The peephole patterns detecting this opportunity should guarantee
16954 the scratch register is available. */
16956 bool
16957 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16958 scalar_mode mode)
16960 const int num_insns = 4;
16961 enum reg_class rclass;
16962 HOST_WIDE_INT offvals[num_insns], msize;
16963 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
16965 if (load)
16967 for (int i = 0; i < num_insns; i++)
16969 reg[i] = operands[2 * i];
16970 mem[i] = operands[2 * i + 1];
16972 gcc_assert (REG_P (reg[i]));
16975 /* Do not attempt to merge the loads if the loads clobber each other. */
16976 for (int i = 0; i < 8; i += 2)
16977 for (int j = i + 2; j < 8; j += 2)
16978 if (reg_overlap_mentioned_p (operands[i], operands[j]))
16979 return false;
16981 else
16982 for (int i = 0; i < num_insns; i++)
16984 mem[i] = operands[2 * i];
16985 reg[i] = operands[2 * i + 1];
16988 /* Skip if memory operand is by itself valid for ldp/stp. */
16989 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
16990 return false;
16992 for (int i = 0; i < num_insns; i++)
16994 /* The mems cannot be volatile. */
16995 if (MEM_VOLATILE_P (mem[i]))
16996 return false;
16998 /* Check if the addresses are in the form of [base+offset]. */
16999 extract_base_offset_in_addr (mem[i], base + i, offset + i);
17000 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17001 return false;
17004 /* Check if the registers are of the same class. */
17005 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17006 ? FP_REGS : GENERAL_REGS;
17008 for (int i = 1; i < num_insns; i++)
17009 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17011 if (rclass != FP_REGS)
17012 return false;
17014 else
17016 if (rclass != GENERAL_REGS)
17017 return false;
17020 /* Only the last register in the order in which they occur
17021 may be clobbered by the load. */
17022 if (rclass == GENERAL_REGS && load)
17023 for (int i = 0; i < num_insns - 1; i++)
17024 if (reg_mentioned_p (reg[i], mem[i]))
17025 return false;
17027 /* Check if the bases are the same. */
17028 for (int i = 0; i < num_insns - 1; i++)
17029 if (!rtx_equal_p (base[i], base[i + 1]))
17030 return false;
17032 for (int i = 0; i < num_insns; i++)
17033 offvals[i] = INTVAL (offset[i]);
17035 msize = GET_MODE_SIZE (mode);
17037 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17038 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
17039 aarch64_host_wide_int_compare);
17041 if (!(offvals[1] == offvals[0] + msize
17042 && offvals[3] == offvals[2] + msize))
17043 return false;
17045 /* Check that offsets are within range of each other. The ldp/stp
17046 instructions have 7 bit immediate offsets, so use 0x80. */
17047 if (offvals[2] - offvals[0] >= msize * 0x80)
17048 return false;
17050 /* The offsets must be aligned with respect to each other. */
17051 if (offvals[0] % msize != offvals[2] % msize)
17052 return false;
17054 /* If we have SImode and slow unaligned ldp,
17055 check that the alignment is at least 8 bytes. */
17056 if (mode == SImode
17057 && (aarch64_tune_params.extra_tuning_flags
17058 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17059 && !optimize_size
17060 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
17061 return false;
17063 return true;
17066 /* Given OPERANDS of consecutive load/store, this function pairs them
17067 into LDP/STP after adjusting the offset. It depends on the fact
17068 that the operands can be sorted so the offsets are correct for STP.
17069 MODE is the mode of memory operands. CODE is the rtl operator
17070 which should be applied to all memory operands; it is SIGN_EXTEND,
17071 ZERO_EXTEND or UNKNOWN. */
17073 bool
17074 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17075 scalar_mode mode, RTX_CODE code)
17077 rtx base, offset_1, offset_3, t1, t2;
17078 rtx mem_1, mem_2, mem_3, mem_4;
17079 rtx temp_operands[8];
17080 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17081 stp_off_upper_limit, stp_off_lower_limit, msize;
17083 /* We make changes on a copy as we may still bail out. */
17084 for (int i = 0; i < 8; i ++)
17085 temp_operands[i] = operands[i];
17087 /* Sort the operands. */
17088 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17090 if (load)
17092 mem_1 = temp_operands[1];
17093 mem_2 = temp_operands[3];
17094 mem_3 = temp_operands[5];
17095 mem_4 = temp_operands[7];
17097 else
17099 mem_1 = temp_operands[0];
17100 mem_2 = temp_operands[2];
17101 mem_3 = temp_operands[4];
17102 mem_4 = temp_operands[6];
17103 gcc_assert (code == UNKNOWN);
17106 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17107 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17108 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17109 && offset_3 != NULL_RTX);
17111 /* Adjust the offset so that it can fit in an LDP/STP instruction. */
17112 msize = GET_MODE_SIZE (mode);
17113 stp_off_upper_limit = msize * (0x40 - 1);
17114 stp_off_lower_limit = - msize * 0x40;
17116 off_val_1 = INTVAL (offset_1);
17117 off_val_3 = INTVAL (offset_3);
17119 /* The base offset is optimally half way between the two STP/LDP offsets. */
17120 if (msize <= 4)
17121 base_off = (off_val_1 + off_val_3) / 2;
17122 else
17123 /* However, due to issues with negative LDP/STP offset generation for
17124 larger modes (DF, DI and vector modes), we must not use negative
17125 addresses smaller than 9 signed unadjusted bits can store. This
17126 provides the most range in this case. */
17127 base_off = off_val_1;
17129 /* Adjust the base so that it is aligned with the addresses but still
17130 optimal. */
17131 if (base_off % msize != off_val_1 % msize)
17132 /* Fix the offset, bearing in mind we want to make it bigger not
17133 smaller. */
17134 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17135 else if (msize <= 4)
17136 /* The negative range of LDP/STP is one larger than the positive range. */
17137 base_off += msize;
17139 /* Check if base offset is too big or too small. We can attempt to resolve
17140 this issue by setting it to the maximum value and seeing if the offsets
17141 still fit. */
17142 if (base_off >= 0x1000)
17144 base_off = 0x1000 - 1;
17145 /* We must still make sure that the base offset is aligned with respect
17146 to the address. But it may not be made any bigger. */
17147 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17150 /* Likewise for the case where the base is too small. */
17151 if (base_off <= -0x1000)
17153 base_off = -0x1000 + 1;
17154 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17157 /* Offset of the first STP/LDP. */
17158 new_off_1 = off_val_1 - base_off;
17160 /* Offset of the second STP/LDP. */
17161 new_off_3 = off_val_3 - base_off;
17163 /* The offsets must be within the range of the LDP/STP instructions. */
17164 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17165 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17166 return false;
17168 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17169 new_off_1), true);
17170 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17171 new_off_1 + msize), true);
17172 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17173 new_off_3), true);
17174 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17175 new_off_3 + msize), true);
17177 if (!aarch64_mem_pair_operand (mem_1, mode)
17178 || !aarch64_mem_pair_operand (mem_3, mode))
17179 return false;
17181 if (code == ZERO_EXTEND)
17183 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17184 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17185 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17186 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17188 else if (code == SIGN_EXTEND)
17190 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17191 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17192 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17193 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17196 if (load)
17198 operands[0] = temp_operands[0];
17199 operands[1] = mem_1;
17200 operands[2] = temp_operands[2];
17201 operands[3] = mem_2;
17202 operands[4] = temp_operands[4];
17203 operands[5] = mem_3;
17204 operands[6] = temp_operands[6];
17205 operands[7] = mem_4;
17207 else
17209 operands[0] = mem_1;
17210 operands[1] = temp_operands[1];
17211 operands[2] = mem_2;
17212 operands[3] = temp_operands[3];
17213 operands[4] = mem_3;
17214 operands[5] = temp_operands[5];
17215 operands[6] = mem_4;
17216 operands[7] = temp_operands[7];
17219 /* Emit adjusting instruction. */
17220 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17221 /* Emit ldp/stp instructions. */
17222 t1 = gen_rtx_SET (operands[0], operands[1]);
17223 t2 = gen_rtx_SET (operands[2], operands[3]);
17224 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17225 t1 = gen_rtx_SET (operands[4], operands[5]);
17226 t2 = gen_rtx_SET (operands[6], operands[7]);
17227 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17228 return true;
17231 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17232 it isn't worth branching around empty masked ops (including masked
17233 stores). */
17235 static bool
17236 aarch64_empty_mask_is_expensive (unsigned)
17238 return false;
17241 /* Return true if a pseudo register should be created and used to hold
17242    the GOT address for PIC code.  */
17244 bool
17245 aarch64_use_pseudo_pic_reg (void)
17247 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17250 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17252 static int
17253 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17255 switch (XINT (x, 1))
17257 case UNSPEC_GOTSMALLPIC:
17258 case UNSPEC_GOTSMALLPIC28K:
17259 case UNSPEC_GOTTINYPIC:
17260 return 0;
17261 default:
17262 break;
17265 return default_unspec_may_trap_p (x, flags);
17269 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17270 return the log2 of that value. Otherwise return -1. */
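/* For example, 4.0 yields 2 and 1.0 yields 0, while 3.0, 0.5 and -4.0 all
   yield -1.  */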
17273 aarch64_fpconst_pow_of_2 (rtx x)
17275 const REAL_VALUE_TYPE *r;
17277 if (!CONST_DOUBLE_P (x))
17278 return -1;
17280 r = CONST_DOUBLE_REAL_VALUE (x);
17282 if (REAL_VALUE_NEGATIVE (*r)
17283 || REAL_VALUE_ISNAN (*r)
17284 || REAL_VALUE_ISINF (*r)
17285 || !real_isinteger (r, DFmode))
17286 return -1;
17288 return exact_log2 (real_to_integer (r));
17291 /* If X is a vector of equal CONST_DOUBLE values and that value is
17292 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
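/* For example, a V2DF constant of { 8.0, 8.0 } yields 3, while
   { 8.0, 4.0 } and { 3.0, 3.0 } both yield -1.  */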
17295 aarch64_vec_fpconst_pow_of_2 (rtx x)
17297 int nelts;
17298 if (GET_CODE (x) != CONST_VECTOR
17299 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17300 return -1;
17302 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17303 return -1;
17305 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17306 if (firstval <= 0)
17307 return -1;
17309 for (int i = 1; i < nelts; i++)
17310 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17311 return -1;
17313 return firstval;
17316 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17317 to float.
17319 __fp16 always promotes through this hook.
17320 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17321 through the generic excess precision logic rather than here. */
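/* For example, given __fp16 a and b, the sum a + b is computed in float and
   is only converted back to __fp16 when the result is stored in an __fp16
   object.  */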
17323 static tree
17324 aarch64_promoted_type (const_tree t)
17326 if (SCALAR_FLOAT_TYPE_P (t)
17327 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17328 return float_type_node;
17330 return NULL_TREE;
17333 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17335 static bool
17336 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17337 optimization_type opt_type)
17339 switch (op)
17341 case rsqrt_optab:
17342 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17344 default:
17345 return true;
17349 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17351 static unsigned int
17352 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17353 int *offset)
17355 /* Polynomial invariant 1 == (VG / 2) - 1. */
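  /* For instance, on a hypothetical 256-bit SVE implementation VG (the
     vector length in 64-bit granules) is 4, so the indeterminate is
     4 / 2 - 1 == 1 and a poly_int size such as 16 + 16x evaluates to
     32 bytes.  */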
17356 gcc_assert (i == 1);
17357 *factor = 2;
17358 *offset = 1;
17359 return AARCH64_DWARF_VG;
17362 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17363 if MODE is HFmode, and punt to the generic implementation otherwise. */
17365 static bool
17366 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17368 return (mode == HFmode
17369 ? true
17370 : default_libgcc_floating_mode_supported_p (mode));
17373 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17374 if MODE is HFmode, and punt to the generic implementation otherwise. */
17376 static bool
17377 aarch64_scalar_mode_supported_p (scalar_mode mode)
17379 return (mode == HFmode
17380 ? true
17381 : default_scalar_mode_supported_p (mode));
17384 /* Set the value of FLT_EVAL_METHOD.
17385 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17387 0: evaluate all operations and constants, whose semantic type has at
17388 most the range and precision of type float, to the range and
17389 precision of float; evaluate all other operations and constants to
17390 the range and precision of the semantic type;
17392 N, where _FloatN is a supported interchange floating type
17393 evaluate all operations and constants, whose semantic type has at
17394 most the range and precision of _FloatN type, to the range and
17395 precision of the _FloatN type; evaluate all other operations and
17396 constants to the range and precision of the semantic type;
17398 If we have the ARMv8.2-A extensions then we support _Float16 in native
17399 precision, so we should set this to 16. Otherwise, we support the type,
17400 but want to evaluate expressions in float precision, so set this to
17401 0. */
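/* For example, given _Float16 x and y, x * y is evaluated directly in
   _Float16 when TARGET_FP_F16INST is available (FLT_EVAL_METHOD == 16);
   otherwise the operands are converted to float, the multiplication is done
   in float, and the result is only narrowed back to _Float16 on assignment
   or conversion.  */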
17403 static enum flt_eval_method
17404 aarch64_excess_precision (enum excess_precision_type type)
17406 switch (type)
17408 case EXCESS_PRECISION_TYPE_FAST:
17409 case EXCESS_PRECISION_TYPE_STANDARD:
17410 /* We can calculate either in 16-bit range and precision or
17411 32-bit range and precision. Make that decision based on whether
17412 we have native support for the ARMv8.2-A 16-bit floating-point
17413 instructions or not. */
17414 return (TARGET_FP_F16INST
17415 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17416 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17417 case EXCESS_PRECISION_TYPE_IMPLICIT:
17418 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17419 default:
17420 gcc_unreachable ();
17422 return FLT_EVAL_METHOD_UNPREDICTABLE;
17425 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17426 scheduled for speculative execution. Reject the long-running division
17427 and square-root instructions. */
17429 static bool
17430 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17432 switch (get_attr_type (insn))
17434 case TYPE_SDIV:
17435 case TYPE_UDIV:
17436 case TYPE_FDIVS:
17437 case TYPE_FDIVD:
17438 case TYPE_FSQRTS:
17439 case TYPE_FSQRTD:
17440 case TYPE_NEON_FP_SQRT_S:
17441 case TYPE_NEON_FP_SQRT_D:
17442 case TYPE_NEON_FP_SQRT_S_Q:
17443 case TYPE_NEON_FP_SQRT_D_Q:
17444 case TYPE_NEON_FP_DIV_S:
17445 case TYPE_NEON_FP_DIV_D:
17446 case TYPE_NEON_FP_DIV_S_Q:
17447 case TYPE_NEON_FP_DIV_D_Q:
17448 return false;
17449 default:
17450 return true;
17454 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17456 static int
17457 aarch64_compute_pressure_classes (reg_class *classes)
17459 int i = 0;
17460 classes[i++] = GENERAL_REGS;
17461 classes[i++] = FP_REGS;
17462 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17463 registers need to go in PR_LO_REGS at some point during their
17464 lifetime. Splitting it into two halves has the effect of making
17465 all predicates count against PR_LO_REGS, so that we try whenever
17466 possible to restrict the number of live predicates to 8. This
17467 greatly reduces the amount of spilling in certain loops. */
17468 classes[i++] = PR_LO_REGS;
17469 classes[i++] = PR_HI_REGS;
17470 return i;
17473 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17475 static bool
17476 aarch64_can_change_mode_class (machine_mode from,
17477 machine_mode to, reg_class_t)
17479 if (BYTES_BIG_ENDIAN)
17481 bool from_sve_p = aarch64_sve_data_mode_p (from);
17482 bool to_sve_p = aarch64_sve_data_mode_p (to);
17484 /* Don't allow changes between SVE data modes and non-SVE modes.
17485 See the comment at the head of aarch64-sve.md for details. */
17486 if (from_sve_p != to_sve_p)
17487 return false;
17489 /* Don't allow changes in element size: lane 0 of the new vector
17490 would not then be lane 0 of the old vector. See the comment
17491 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17492 description.
17494 In the worst case, this forces a register to be spilled in
17495 one mode and reloaded in the other, which handles the
17496 endianness correctly. */
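      /* For example, a subreg between VNx8HI and VNx4SI changes the element
         size from 2 to 4 bytes and is therefore rejected here for
         big-endian targets.  */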
17497 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17498 return false;
17500 return true;
17503 /* Implement TARGET_SELECT_EARLY_REMAT_MODES.  */
17505 static void
17506 aarch64_select_early_remat_modes (sbitmap modes)
17508 /* SVE values are not normally live across a call, so it should be
17509 worth doing early rematerialization even in VL-specific mode. */
17510 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17512 machine_mode mode = (machine_mode) i;
17513 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17514 if (vec_flags & VEC_ANY_SVE)
17515 bitmap_set_bit (modes, i);
17519 /* Override the default target speculation_safe_value. */
17520 static rtx
17521 aarch64_speculation_safe_value (machine_mode mode,
17522 rtx result, rtx val, rtx failval)
17524 /* Maybe we should warn if falling back to hard barriers. They are
17525    likely to be noticeably more expensive than the alternative below.  */
17526 if (!aarch64_track_speculation)
17527 return default_speculation_safe_value (mode, result, val, failval);
17529 if (!REG_P (val))
17530 val = copy_to_mode_reg (mode, val);
17532 if (!aarch64_reg_or_zero (failval, mode))
17533 failval = copy_to_mode_reg (mode, failval);
17535 switch (mode)
17537 case E_QImode:
17538 emit_insn (gen_despeculate_copyqi (result, val, failval));
17539 break;
17540 case E_HImode:
17541 emit_insn (gen_despeculate_copyhi (result, val, failval));
17542 break;
17543 case E_SImode:
17544 emit_insn (gen_despeculate_copysi (result, val, failval));
17545 break;
17546 case E_DImode:
17547 emit_insn (gen_despeculate_copydi (result, val, failval));
17548 break;
17549 case E_TImode:
17550 emit_insn (gen_despeculate_copyti (result, val, failval));
17551 break;
17552 default:
17553 gcc_unreachable ();
17555 return result;
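/* For example, a source-level call such as
   __builtin_speculation_safe_value (idx, 0) reaches this hook; with
   -mtrack-speculation it expands to one of the despeculate_copy patterns
   used above, and otherwise it falls back to the generic hard-barrier
   implementation.  */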
17558 /* Target-specific selftests. */
17560 #if CHECKING_P
17562 namespace selftest {
17564 /* Selftest for the RTL loader.
17565 Verify that the RTL loader copes with a dump from
17566 print_rtx_function. This is essentially just a test that class
17567 function_reader can handle a real dump, but it also verifies
17568 that lookup_reg_by_dump_name correctly handles hard regs.
17569 The presence of hard reg names in the dump means that the test is
17570 target-specific, hence it is in this file. */
17572 static void
17573 aarch64_test_loading_full_dump ()
17575 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17577 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17579 rtx_insn *insn_1 = get_insn_by_uid (1);
17580 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17582 rtx_insn *insn_15 = get_insn_by_uid (15);
17583 ASSERT_EQ (INSN, GET_CODE (insn_15));
17584 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17586 /* Verify crtl->return_rtx. */
17587 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17588 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17589 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17592 /* Run all target-specific selftests. */
17594 static void
17595 aarch64_run_selftests (void)
17597 aarch64_test_loading_full_dump ();
17600 } // namespace selftest
17602 #endif /* #if CHECKING_P */
17604 #undef TARGET_ADDRESS_COST
17605 #define TARGET_ADDRESS_COST aarch64_address_cost
17607 /* This hook determines whether unnamed bitfields affect the alignment
17608 of the containing structure. The hook returns true if the structure
17609 should inherit the alignment requirements of an unnamed bitfield's
17610 type. */
17611 #undef TARGET_ALIGN_ANON_BITFIELD
17612 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17614 #undef TARGET_ASM_ALIGNED_DI_OP
17615 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17617 #undef TARGET_ASM_ALIGNED_HI_OP
17618 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17620 #undef TARGET_ASM_ALIGNED_SI_OP
17621 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17623 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17624 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17625 hook_bool_const_tree_hwi_hwi_const_tree_true
17627 #undef TARGET_ASM_FILE_START
17628 #define TARGET_ASM_FILE_START aarch64_start_file
17630 #undef TARGET_ASM_OUTPUT_MI_THUNK
17631 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17633 #undef TARGET_ASM_SELECT_RTX_SECTION
17634 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17636 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17637 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17639 #undef TARGET_BUILD_BUILTIN_VA_LIST
17640 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17642 #undef TARGET_CALLEE_COPIES
17643 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17645 #undef TARGET_CAN_ELIMINATE
17646 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17648 #undef TARGET_CAN_INLINE_P
17649 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17651 #undef TARGET_CANNOT_FORCE_CONST_MEM
17652 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17654 #undef TARGET_CASE_VALUES_THRESHOLD
17655 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17657 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17658 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17660 /* Only the least significant bit is used for initialization guard
17661 variables. */
17662 #undef TARGET_CXX_GUARD_MASK_BIT
17663 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17665 #undef TARGET_C_MODE_FOR_SUFFIX
17666 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17668 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17669 #undef TARGET_DEFAULT_TARGET_FLAGS
17670 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17671 #endif
17673 #undef TARGET_CLASS_MAX_NREGS
17674 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17676 #undef TARGET_BUILTIN_DECL
17677 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17679 #undef TARGET_BUILTIN_RECIPROCAL
17680 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17682 #undef TARGET_C_EXCESS_PRECISION
17683 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17685 #undef TARGET_EXPAND_BUILTIN
17686 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17688 #undef TARGET_EXPAND_BUILTIN_VA_START
17689 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17691 #undef TARGET_FOLD_BUILTIN
17692 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17694 #undef TARGET_FUNCTION_ARG
17695 #define TARGET_FUNCTION_ARG aarch64_function_arg
17697 #undef TARGET_FUNCTION_ARG_ADVANCE
17698 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17700 #undef TARGET_FUNCTION_ARG_BOUNDARY
17701 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17703 #undef TARGET_FUNCTION_ARG_PADDING
17704 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17706 #undef TARGET_GET_RAW_RESULT_MODE
17707 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17708 #undef TARGET_GET_RAW_ARG_MODE
17709 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17711 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17712 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17714 #undef TARGET_FUNCTION_VALUE
17715 #define TARGET_FUNCTION_VALUE aarch64_function_value
17717 #undef TARGET_FUNCTION_VALUE_REGNO_P
17718 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17720 #undef TARGET_GIMPLE_FOLD_BUILTIN
17721 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17723 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17724 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17726 #undef TARGET_INIT_BUILTINS
17727 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17729 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17730 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17731 aarch64_ira_change_pseudo_allocno_class
17733 #undef TARGET_LEGITIMATE_ADDRESS_P
17734 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17736 #undef TARGET_LEGITIMATE_CONSTANT_P
17737 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17739 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17740 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17741 aarch64_legitimize_address_displacement
17743 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17744 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17746 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17747 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17748 aarch64_libgcc_floating_mode_supported_p
17750 #undef TARGET_MANGLE_TYPE
17751 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17753 #undef TARGET_MEMORY_MOVE_COST
17754 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17756 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17757 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17759 #undef TARGET_MUST_PASS_IN_STACK
17760 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17762 /* This target hook should return true if accesses to volatile bitfields
17763 should use the narrowest mode possible. It should return false if these
17764 accesses should use the bitfield container type. */
17765 #undef TARGET_NARROW_VOLATILE_BITFIELD
17766 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17768 #undef TARGET_OPTION_OVERRIDE
17769 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17771 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17772 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17773 aarch64_override_options_after_change
17775 #undef TARGET_OPTION_SAVE
17776 #define TARGET_OPTION_SAVE aarch64_option_save
17778 #undef TARGET_OPTION_RESTORE
17779 #define TARGET_OPTION_RESTORE aarch64_option_restore
17781 #undef TARGET_OPTION_PRINT
17782 #define TARGET_OPTION_PRINT aarch64_option_print
17784 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17785 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17787 #undef TARGET_SET_CURRENT_FUNCTION
17788 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17790 #undef TARGET_PASS_BY_REFERENCE
17791 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17793 #undef TARGET_PREFERRED_RELOAD_CLASS
17794 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17796 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17797 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17799 #undef TARGET_PROMOTED_TYPE
17800 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17802 #undef TARGET_SECONDARY_RELOAD
17803 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17805 #undef TARGET_SHIFT_TRUNCATION_MASK
17806 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17808 #undef TARGET_SETUP_INCOMING_VARARGS
17809 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17811 #undef TARGET_STRUCT_VALUE_RTX
17812 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17814 #undef TARGET_REGISTER_MOVE_COST
17815 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17817 #undef TARGET_RETURN_IN_MEMORY
17818 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17820 #undef TARGET_RETURN_IN_MSB
17821 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17823 #undef TARGET_RTX_COSTS
17824 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17826 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17827 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17829 #undef TARGET_SCHED_ISSUE_RATE
17830 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17832 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17833 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17834 aarch64_sched_first_cycle_multipass_dfa_lookahead
17836 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17837 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17838 aarch64_first_cycle_multipass_dfa_lookahead_guard
17840 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17841 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17842 aarch64_get_separate_components
17844 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17845 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17846 aarch64_components_for_bb
17848 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17849 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17850 aarch64_disqualify_components
17852 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17853 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17854 aarch64_emit_prologue_components
17856 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17857 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17858 aarch64_emit_epilogue_components
17860 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17861 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17862 aarch64_set_handled_components
17864 #undef TARGET_TRAMPOLINE_INIT
17865 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17867 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17868 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17870 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17871 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17873 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17874 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17875 aarch64_builtin_support_vector_misalignment
17877 #undef TARGET_ARRAY_MODE
17878 #define TARGET_ARRAY_MODE aarch64_array_mode
17880 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17881 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17883 #undef TARGET_VECTORIZE_ADD_STMT_COST
17884 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17886 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17887 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17888 aarch64_builtin_vectorization_cost
17890 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17891 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17893 #undef TARGET_VECTORIZE_BUILTINS
17894 #define TARGET_VECTORIZE_BUILTINS
17896 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17897 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17898 aarch64_builtin_vectorized_function
17900 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17901 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17902 aarch64_autovectorize_vector_sizes
17904 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17905 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17906 aarch64_atomic_assign_expand_fenv
17908 /* Section anchor support. */
17910 #undef TARGET_MIN_ANCHOR_OFFSET
17911 #define TARGET_MIN_ANCHOR_OFFSET -256
17913 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17914 byte offset; we can do much more for larger data types, but have no way
17915 to determine the size of the access. We assume accesses are aligned. */
17916 #undef TARGET_MAX_ANCHOR_OFFSET
17917 #define TARGET_MAX_ANCHOR_OFFSET 4095
17919 #undef TARGET_VECTOR_ALIGNMENT
17920 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17922 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17923 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17924 aarch64_vectorize_preferred_vector_alignment
17925 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17926 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17927 aarch64_simd_vector_alignment_reachable
17929 /* vec_perm support. */
17931 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17932 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17933 aarch64_vectorize_vec_perm_const
17935 #undef TARGET_VECTORIZE_GET_MASK_MODE
17936 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17937 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17938 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17939 aarch64_empty_mask_is_expensive
17940 #undef TARGET_PREFERRED_ELSE_VALUE
17941 #define TARGET_PREFERRED_ELSE_VALUE \
17942 aarch64_preferred_else_value
17944 #undef TARGET_INIT_LIBFUNCS
17945 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17947 #undef TARGET_FIXED_CONDITION_CODE_REGS
17948 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17950 #undef TARGET_FLAGS_REGNUM
17951 #define TARGET_FLAGS_REGNUM CC_REGNUM
17953 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17954 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17956 #undef TARGET_ASAN_SHADOW_OFFSET
17957 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17959 #undef TARGET_LEGITIMIZE_ADDRESS
17960 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17962 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17963 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17965 #undef TARGET_CAN_USE_DOLOOP_P
17966 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17968 #undef TARGET_SCHED_ADJUST_PRIORITY
17969 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17971 #undef TARGET_SCHED_MACRO_FUSION_P
17972 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17974 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17975 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17977 #undef TARGET_SCHED_FUSION_PRIORITY
17978 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17980 #undef TARGET_UNSPEC_MAY_TRAP_P
17981 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17983 #undef TARGET_USE_PSEUDO_PIC_REG
17984 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17986 #undef TARGET_PRINT_OPERAND
17987 #define TARGET_PRINT_OPERAND aarch64_print_operand
17989 #undef TARGET_PRINT_OPERAND_ADDRESS
17990 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17992 #undef TARGET_OPTAB_SUPPORTED_P
17993 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17995 #undef TARGET_OMIT_STRUCT_RETURN_REG
17996 #define TARGET_OMIT_STRUCT_RETURN_REG true
17998 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17999 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18000 aarch64_dwarf_poly_indeterminate_value
18002 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors.  */
18003 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18004 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18006 #undef TARGET_HARD_REGNO_NREGS
18007 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18008 #undef TARGET_HARD_REGNO_MODE_OK
18009 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18011 #undef TARGET_MODES_TIEABLE_P
18012 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18014 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18015 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18016 aarch64_hard_regno_call_part_clobbered
18018 #undef TARGET_CONSTANT_ALIGNMENT
18019 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18021 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18022 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18024 #undef TARGET_CAN_CHANGE_MODE_CLASS
18025 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18027 #undef TARGET_SELECT_EARLY_REMAT_MODES
18028 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18030 #undef TARGET_SPECULATION_SAFE_VALUE
18031 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
18033 #if CHECKING_P
18034 #undef TARGET_RUN_TARGET_SELFTESTS
18035 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18036 #endif /* #if CHECKING_P */
18038 struct gcc_target targetm = TARGET_INITIALIZER;
18040 #include "gt-aarch64.h"