Make more use of end_hard_regno
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 3b1c9ef6000d2d6f86d24de04a001f57e017e381
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 ADDRESS_SYMBOLIC:
98 A constant symbolic address, in pc-relative literal pool. */
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (actual, 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
376 /* Costs for vector insn classes for Cortex-A57. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
416 /* Costs for vector insn classes for X-Gene 1. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
435 /* Costs for vector insn classes for Vulcan (ThunderX2 T99). */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 static const struct tune_params thunderx2t99_tunings =
827 &thunderx2t99_extra_costs,
828 &thunderx2t99_addrcost_table,
829 &thunderx2t99_regmove_cost,
830 &thunderx2t99_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 4, /* memmov_cost. */
834 4, /* issue_rate. */
835 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
837 16, /* function_align. */
838 8, /* jump_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
854 const char* name;
855 void (*parse_override)(const char*, struct tune_params*);
858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions[] =
864 { "fuse", aarch64_parse_fuse_string },
865 { "tune", aarch64_parse_tune_string },
866 { NULL, NULL }
869 /* A processor implementing AArch64. */
870 struct processor
872 const char *const name;
873 enum aarch64_processor ident;
874 enum aarch64_processor sched_core;
875 enum aarch64_arch arch;
876 unsigned architecture_version;
877 const unsigned long flags;
878 const struct tune_params *const tune;
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures[] =
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores[] =
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
899 AARCH64_FL_FOR_ARCH8, &generic_tunings},
900 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor *selected_arch;
907 static const struct processor *selected_cpu;
908 static const struct processor *selected_tune;
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params = generic_tunings;
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
915 /* An ISA extension in the co-processor and main instruction set space. */
916 struct aarch64_option_extension
918 const char *const name;
919 const unsigned long flags_on;
920 const unsigned long flags_off;
923 typedef enum aarch64_cond_code
925 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
926 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
927 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
929 aarch64_cc;
931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
933 /* The condition codes of the processor, and the inverse function. */
934 static const char * const aarch64_condition_codes[] =
936 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
937 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 const char *
942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
943 const char * branch_format)
945 rtx_code_label * tmp_label = gen_label_rtx ();
946 char label_buf[256];
947 char buffer[128];
948 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
949 CODE_LABEL_NUMBER (tmp_label));
950 const char *label_ptr = targetm.strip_name_encoding (label_buf);
951 rtx dest_label = operands[pos_label];
952 operands[pos_label] = tmp_label;
954 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
955 output_asm_insn (buffer, operands);
957 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
958 operands[pos_label] = dest_label;
959 output_asm_insn (buffer, operands);
960 return "";
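/* For illustration: a conditional branch whose target lies outside the
   roughly +/-1 MiB range of B.cond is typically handled by passing the
   inverted condition in BRANCH_FORMAT, so the routine above produces a
   short branch around an unconditional branch to the real destination,
   along the lines of

	b.ne	.Lbcond4		// BRANCH_FORMAT + local label
	b	original_target
   .Lbcond4:

   (the label name and condition here are purely illustrative).  */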
963 void
964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
966 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
969 else
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespective of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
988 static reg_class_t
989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
990 reg_class_t best_class)
992 machine_mode mode;
994 if (allocno_class != ALL_REGS)
995 return allocno_class;
997 if (best_class != ALL_REGS)
998 return best_class;
1000 mode = PSEUDO_REGNO_MODE (regno);
1001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
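/* For example: a pseudo in DFmode or V4SImode whose allocno class and
   best class are both ALL_REGS ends up with FP_REGS as its allocno
   class, while an SImode pseudo in the same situation gets
   GENERAL_REGS.  */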
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1007 if (GET_MODE_UNIT_SIZE (mode) == 4)
1008 return aarch64_tune_params.min_div_recip_mul_sf;
1009 return aarch64_tune_params.min_div_recip_mul_df;
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014 machine_mode mode)
1016 if (VECTOR_MODE_P (mode))
1017 return aarch64_tune_params.vec_reassoc_width;
1018 if (INTEGRAL_MODE_P (mode))
1019 return aarch64_tune_params.int_reassoc_width;
1020 if (FLOAT_MODE_P (mode))
1021 return aarch64_tune_params.fp_reassoc_width;
1022 return 1;
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1029 if (GP_REGNUM_P (regno))
1030 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031 else if (regno == SP_REGNUM)
1032 return AARCH64_DWARF_SP;
1033 else if (FP_REGNUM_P (regno))
1034 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS;
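/* Illustration, assuming the usual AArch64 DWARF numbering
   (AARCH64_DWARF_R0 == 0, AARCH64_DWARF_SP == 31, AARCH64_DWARF_V0 == 64):
   x0-x30 map to 0-30, sp maps to 31 and v0-v31 map to 64-95; any other
   register (e.g. CC_REGNUM) falls through to DWARF_FRAME_REGISTERS.  */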
1041 /* Return TRUE if MODE is any of the large INT modes. */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1045 return mode == OImode || mode == CImode || mode == XImode;
1048 /* Return TRUE if MODE is any of the vector modes. */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1052 return aarch64_vector_mode_supported_p (mode)
1053 || aarch64_vect_struct_mode_p (mode);
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059 unsigned HOST_WIDE_INT nelems)
1061 if (TARGET_SIMD
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064 && (nelems >= 2 && nelems <= 4))
1065 return true;
1067 return false;
1070 /* Implement HARD_REGNO_NREGS. */
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1075 switch (aarch64_regno_regclass (regno))
1077 case FP_REGS:
1078 case FP_LO_REGS:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080 default:
1081 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1083 gcc_unreachable ();
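/* For example, with 16-byte vector registers (UNITS_PER_VREG == 16) and
   8-byte general registers (UNITS_PER_WORD == 8), a TFmode value needs
   one FP register but two general registers, while a 32-byte OImode
   value needs two FP registers.  */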
1086 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1091 if (GET_MODE_CLASS (mode) == MODE_CC)
1092 return regno == CC_REGNUM;
1094 if (regno == SP_REGNUM)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register via the syntax of asm ("wsp") in ILP32. */
1098 return mode == Pmode || mode == ptr_mode;
1100 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101 return mode == Pmode;
1103 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104 return true;
1106 if (FP_REGNUM_P (regno))
1108 if (aarch64_vect_struct_mode_p (mode))
1109 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1110 else
1111 return true;
1114 return false;
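/* end_hard_regno (MODE, REGNO) returns one past the last hard register
   occupied by a MODE value starting at REGNO.  The check above therefore
   accepts a vector-structure mode in an FP register only when the whole
   multi-register value fits within V0-V31: e.g. a two-register OImode
   value is fine starting at V30 (it ends at V31) but not at V31.  */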
1117 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1118 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1119 clobbers the top 64 bits when restoring the bottom 64 bits. */
1121 static bool
1122 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1124 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1127 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1128 machine_mode
1129 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1130 machine_mode mode)
1132 /* Handle modes that fit within single registers. */
1133 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1135 if (GET_MODE_SIZE (mode) >= 4)
1136 return mode;
1137 else
1138 return SImode;
1140 /* Fall back to generic for multi-reg and very large modes. */
1141 else
1142 return choose_hard_reg_mode (regno, nregs, false);
1145 /* Return true if calls to DECL should be treated as
1146 long-calls (i.e. called via a register). */
1147 static bool
1148 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1150 return false;
1153 /* Return true if calls to symbol-ref SYM should be treated as
1154 long-calls (i.e. called via a register). */
1155 bool
1156 aarch64_is_long_call_p (rtx sym)
1158 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1161 /* Return true if calls to symbol-ref SYM should not go through
1162 plt stubs. */
1164 bool
1165 aarch64_is_noplt_call_p (rtx sym)
1167 const_tree decl = SYMBOL_REF_DECL (sym);
1169 if (flag_pic
1170 && decl
1171 && (!flag_plt
1172 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1173 && !targetm.binds_local_p (decl))
1174 return true;
1176 return false;
1179 /* Return true if the offsets to a zero/sign-extract operation
1180 represent an expression that matches an extend operation. The
1181 operands represent the parameters from
1183 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1184 bool
1185 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1186 rtx extract_imm)
1188 HOST_WIDE_INT mult_val, extract_val;
1190 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1191 return false;
1193 mult_val = INTVAL (mult_imm);
1194 extract_val = INTVAL (extract_imm);
1196 if (extract_val > 8
1197 && extract_val < GET_MODE_BITSIZE (mode)
1198 && exact_log2 (extract_val & ~7) > 0
1199 && (extract_val & 7) <= 4
1200 && mult_val == (1 << (extract_val & 7)))
1201 return true;
1203 return false;
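/* A concrete case that passes the checks above: MULT_IMM == 8 and
   EXTRACT_IMM == 11, since 11 & ~7 == 8 (a power of two), 11 & 7 == 3 <= 4
   and 8 == 1 << 3.  The pattern
   (zero_extract:DI (mult (reg) (const_int 8)) (const_int 11) (const_int 0))
   takes the low 11 bits of reg * 8, which is the same as zero-extending
   the low 8 bits of reg and shifting the result left by 3.  */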
1206 /* Emit an insn that's a simple single-set. Both the operands must be
1207 known to be valid. */
1208 inline static rtx_insn *
1209 emit_set_insn (rtx x, rtx y)
1211 return emit_insn (gen_rtx_SET (x, y));
1214 /* X and Y are two things to compare using CODE. Emit the compare insn and
1215 return the rtx for register 0 in the proper mode. */
1217 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1219 machine_mode mode = SELECT_CC_MODE (code, x, y);
1220 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1222 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1223 return cc_reg;
1226 /* Build the SYMBOL_REF for __tls_get_addr. */
1228 static GTY(()) rtx tls_get_addr_libfunc;
1231 aarch64_tls_get_addr (void)
1233 if (!tls_get_addr_libfunc)
1234 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1235 return tls_get_addr_libfunc;
1238 /* Return the TLS model to use for ADDR. */
1240 static enum tls_model
1241 tls_symbolic_operand_type (rtx addr)
1243 enum tls_model tls_kind = TLS_MODEL_NONE;
1244 rtx sym, addend;
1246 if (GET_CODE (addr) == CONST)
1248 split_const (addr, &sym, &addend);
1249 if (GET_CODE (sym) == SYMBOL_REF)
1250 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1252 else if (GET_CODE (addr) == SYMBOL_REF)
1253 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1255 return tls_kind;
1258 /* We allow lo_sum expressions in our legitimate addresses, so that
1259 combine can take care of combining addresses where necessary; for
1260 generation purposes, however, we generate the address as follows:
1262 RTL Absolute
1263 tmp = hi (symbol_ref); adrp x1, foo
1264 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1267 PIC TLS
1268 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1269 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1270 bl __tls_get_addr
1273 Load TLS symbol, depending on TLS mechanism and TLS access model.
1275 Global Dynamic - Traditional TLS:
1276 adrp tmp, :tlsgd:imm
1277 add dest, tmp, #:tlsgd_lo12:imm
1278 bl __tls_get_addr
1280 Global Dynamic - TLS Descriptors:
1281 adrp dest, :tlsdesc:imm
1282 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1283 add dest, dest, #:tlsdesc_lo12:imm
1284 blr tmp
1285 mrs tp, tpidr_el0
1286 add dest, dest, tp
1288 Initial Exec:
1289 mrs tp, tpidr_el0
1290 adrp tmp, :gottprel:imm
1291 ldr dest, [tmp, #:gottprel_lo12:imm]
1292 add dest, dest, tp
1294 Local Exec:
1295 mrs tp, tpidr_el0
1296 add t0, tp, #:tprel_hi12:imm, lsl #12
1297 add t0, t0, #:tprel_lo12_nc:imm
1300 static void
1301 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1302 enum aarch64_symbol_type type)
1304 switch (type)
1306 case SYMBOL_SMALL_ABSOLUTE:
1308 /* In ILP32, the mode of dest can be either SImode or DImode. */
1309 rtx tmp_reg = dest;
1310 machine_mode mode = GET_MODE (dest);
1312 gcc_assert (mode == Pmode || mode == ptr_mode);
1314 if (can_create_pseudo_p ())
1315 tmp_reg = gen_reg_rtx (mode);
1317 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1318 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1319 return;
1322 case SYMBOL_TINY_ABSOLUTE:
1323 emit_insn (gen_rtx_SET (dest, imm));
1324 return;
1326 case SYMBOL_SMALL_GOT_28K:
1328 machine_mode mode = GET_MODE (dest);
1329 rtx gp_rtx = pic_offset_table_rtx;
1330 rtx insn;
1331 rtx mem;
1333 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1334 here before rtl expansion. Tree IVOPTs will generate rtl patterns to
1335 decide rtx costs, in which case pic_offset_table_rtx is not
1336 initialized. In that case there is no need to generate the first adrp
1337 instruction, as the final cost for global variable access is
1338 one instruction.
1339 if (gp_rtx != NULL)
1341 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1342 use the page base as the GOT base, the first page may be wasted;
1343 in the worst case there is only 28K of space for the GOT).
1345 The generated instruction sequence for accessing a global variable is
1348 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1350 Only one instruction is needed. But we must initialize
1351 pic_offset_table_rtx properly. We generate an initialization insn for
1352 every global access, and allow CSE to remove all the redundant ones.
1354 The final instruction sequence will look like the following
1355 for multiple global variable accesses.
1357 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1359 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1360 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1361 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1362 ... */
1364 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1365 crtl->uses_pic_offset_table = 1;
1366 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1368 if (mode != GET_MODE (gp_rtx))
1369 gp_rtx = gen_lowpart (mode, gp_rtx);
1373 if (mode == ptr_mode)
1375 if (mode == DImode)
1376 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1377 else
1378 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1380 mem = XVECEXP (SET_SRC (insn), 0, 0);
1382 else
1384 gcc_assert (mode == Pmode);
1386 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1387 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1390 /* The operand is expected to be a MEM. Whenever the related insn
1391 pattern is changed, the code above which calculates MEM should be
1392 updated. */
1393 gcc_assert (GET_CODE (mem) == MEM);
1394 MEM_READONLY_P (mem) = 1;
1395 MEM_NOTRAP_P (mem) = 1;
1396 emit_insn (insn);
1397 return;
1400 case SYMBOL_SMALL_GOT_4G:
1402 /* In ILP32, the mode of dest can be either SImode or DImode,
1403 while the got entry is always of SImode size. The mode of
1404 dest depends on how dest is used: if dest is assigned to a
1405 pointer (e.g. in the memory), it has SImode; it may have
1406 DImode if dest is dereferenced to access the memory.
1407 This is why we have to handle three different ldr_got_small
1408 patterns here (two patterns for ILP32). */
1410 rtx insn;
1411 rtx mem;
1412 rtx tmp_reg = dest;
1413 machine_mode mode = GET_MODE (dest);
1415 if (can_create_pseudo_p ())
1416 tmp_reg = gen_reg_rtx (mode);
1418 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1419 if (mode == ptr_mode)
1421 if (mode == DImode)
1422 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1423 else
1424 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1426 mem = XVECEXP (SET_SRC (insn), 0, 0);
1428 else
1430 gcc_assert (mode == Pmode);
1432 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1433 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1436 gcc_assert (GET_CODE (mem) == MEM);
1437 MEM_READONLY_P (mem) = 1;
1438 MEM_NOTRAP_P (mem) = 1;
1439 emit_insn (insn);
1440 return;
1443 case SYMBOL_SMALL_TLSGD:
1445 rtx_insn *insns;
1446 machine_mode mode = GET_MODE (dest);
1447 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1449 start_sequence ();
1450 if (TARGET_ILP32)
1451 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1452 else
1453 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1454 insns = get_insns ();
1455 end_sequence ();
1457 RTL_CONST_CALL_P (insns) = 1;
1458 emit_libcall_block (insns, dest, result, imm);
1459 return;
1462 case SYMBOL_SMALL_TLSDESC:
1464 machine_mode mode = GET_MODE (dest);
1465 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1466 rtx tp;
1468 gcc_assert (mode == Pmode || mode == ptr_mode);
1470 /* In ILP32, the got entry is always of SImode size. Unlike
1471 small GOT, the dest is fixed at reg 0. */
1472 if (TARGET_ILP32)
1473 emit_insn (gen_tlsdesc_small_si (imm));
1474 else
1475 emit_insn (gen_tlsdesc_small_di (imm));
1476 tp = aarch64_load_tp (NULL);
1478 if (mode != Pmode)
1479 tp = gen_lowpart (mode, tp);
1481 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1482 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1483 return;
1486 case SYMBOL_SMALL_TLSIE:
1488 /* In ILP32, the mode of dest can be either SImode or DImode,
1489 while the got entry is always of SImode size. The mode of
1490 dest depends on how dest is used: if dest is assigned to a
1491 pointer (e.g. in the memory), it has SImode; it may have
1492 DImode if dest is dereferenced to access the memory.
1493 This is why we have to handle three different tlsie_small
1494 patterns here (two patterns for ILP32). */
1495 machine_mode mode = GET_MODE (dest);
1496 rtx tmp_reg = gen_reg_rtx (mode);
1497 rtx tp = aarch64_load_tp (NULL);
1499 if (mode == ptr_mode)
1501 if (mode == DImode)
1502 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1503 else
1505 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1506 tp = gen_lowpart (mode, tp);
1509 else
1511 gcc_assert (mode == Pmode);
1512 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1515 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1516 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1517 return;
1520 case SYMBOL_TLSLE12:
1521 case SYMBOL_TLSLE24:
1522 case SYMBOL_TLSLE32:
1523 case SYMBOL_TLSLE48:
1525 machine_mode mode = GET_MODE (dest);
1526 rtx tp = aarch64_load_tp (NULL);
1528 if (mode != Pmode)
1529 tp = gen_lowpart (mode, tp);
1531 switch (type)
1533 case SYMBOL_TLSLE12:
1534 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1535 (dest, tp, imm));
1536 break;
1537 case SYMBOL_TLSLE24:
1538 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1539 (dest, tp, imm));
1540 break;
1541 case SYMBOL_TLSLE32:
1542 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1543 (dest, imm));
1544 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1545 (dest, dest, tp));
1546 break;
1547 case SYMBOL_TLSLE48:
1548 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1549 (dest, imm));
1550 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1551 (dest, dest, tp));
1552 break;
1553 default:
1554 gcc_unreachable ();
1557 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1558 return;
1561 case SYMBOL_TINY_GOT:
1562 emit_insn (gen_ldr_got_tiny (dest, imm));
1563 return;
1565 case SYMBOL_TINY_TLSIE:
1567 machine_mode mode = GET_MODE (dest);
1568 rtx tp = aarch64_load_tp (NULL);
1570 if (mode == ptr_mode)
1572 if (mode == DImode)
1573 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1574 else
1576 tp = gen_lowpart (mode, tp);
1577 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1580 else
1582 gcc_assert (mode == Pmode);
1583 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1586 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1587 return;
1590 default:
1591 gcc_unreachable ();
1595 /* Emit a move from SRC to DEST. Assume that the move expanders can
1596 handle all moves if !can_create_pseudo_p (). The distinction is
1597 important because, unlike emit_move_insn, the move expanders know
1598 how to force Pmode objects into the constant pool even when the
1599 constant pool address is not itself legitimate. */
1600 static rtx
1601 aarch64_emit_move (rtx dest, rtx src)
1603 return (can_create_pseudo_p ()
1604 ? emit_move_insn (dest, src)
1605 : emit_move_insn_1 (dest, src));
1608 /* Split a 128-bit move operation into two 64-bit move operations,
1609 taking care to handle partial overlap of register to register
1610 copies. Special cases are needed when moving between GP regs and
1611 FP regs. SRC can be a register, constant or memory; DST a register
1612 or memory. If either operand is memory it must not have any side
1613 effects. */
1614 void
1615 aarch64_split_128bit_move (rtx dst, rtx src)
1617 rtx dst_lo, dst_hi;
1618 rtx src_lo, src_hi;
1620 machine_mode mode = GET_MODE (dst);
1622 gcc_assert (mode == TImode || mode == TFmode);
1623 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1624 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1626 if (REG_P (dst) && REG_P (src))
1628 int src_regno = REGNO (src);
1629 int dst_regno = REGNO (dst);
1631 /* Handle FP <-> GP regs. */
1632 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1634 src_lo = gen_lowpart (word_mode, src);
1635 src_hi = gen_highpart (word_mode, src);
1637 if (mode == TImode)
1639 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1640 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1642 else
1644 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1645 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1647 return;
1649 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1651 dst_lo = gen_lowpart (word_mode, dst);
1652 dst_hi = gen_highpart (word_mode, dst);
1654 if (mode == TImode)
1656 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1657 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1659 else
1661 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1662 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1664 return;
1668 dst_lo = gen_lowpart (word_mode, dst);
1669 dst_hi = gen_highpart (word_mode, dst);
1670 src_lo = gen_lowpart (word_mode, src);
1671 src_hi = gen_highpart_mode (word_mode, mode, src);
1673 /* At most one pairing may overlap. */
1674 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1676 aarch64_emit_move (dst_hi, src_hi);
1677 aarch64_emit_move (dst_lo, src_lo);
1679 else
1681 aarch64_emit_move (dst_lo, src_lo);
1682 aarch64_emit_move (dst_hi, src_hi);
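/* Overlap illustration (little-endian, register choices arbitrary):
   copying a TImode value from {x0,x1} to {x1,x2} has dst_lo == src_hi
   == x1, so the code above moves the high halves first (x2 = x1) and
   only then the low halves (x1 = x0).  */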
1686 bool
1687 aarch64_split_128bit_move_p (rtx dst, rtx src)
1689 return (! REG_P (src)
1690 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1693 /* Split a complex SIMD combine. */
1695 void
1696 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1698 machine_mode src_mode = GET_MODE (src1);
1699 machine_mode dst_mode = GET_MODE (dst);
1701 gcc_assert (VECTOR_MODE_P (dst_mode));
1702 gcc_assert (register_operand (dst, dst_mode)
1703 && register_operand (src1, src_mode)
1704 && register_operand (src2, src_mode));
1706 rtx (*gen) (rtx, rtx, rtx);
1708 switch (src_mode)
1710 case E_V8QImode:
1711 gen = gen_aarch64_simd_combinev8qi;
1712 break;
1713 case E_V4HImode:
1714 gen = gen_aarch64_simd_combinev4hi;
1715 break;
1716 case E_V2SImode:
1717 gen = gen_aarch64_simd_combinev2si;
1718 break;
1719 case E_V4HFmode:
1720 gen = gen_aarch64_simd_combinev4hf;
1721 break;
1722 case E_V2SFmode:
1723 gen = gen_aarch64_simd_combinev2sf;
1724 break;
1725 case E_DImode:
1726 gen = gen_aarch64_simd_combinedi;
1727 break;
1728 case E_DFmode:
1729 gen = gen_aarch64_simd_combinedf;
1730 break;
1731 default:
1732 gcc_unreachable ();
1735 emit_insn (gen (dst, src1, src2));
1736 return;
1739 /* Split a complex SIMD move. */
1741 void
1742 aarch64_split_simd_move (rtx dst, rtx src)
1744 machine_mode src_mode = GET_MODE (src);
1745 machine_mode dst_mode = GET_MODE (dst);
1747 gcc_assert (VECTOR_MODE_P (dst_mode));
1749 if (REG_P (dst) && REG_P (src))
1751 rtx (*gen) (rtx, rtx);
1753 gcc_assert (VECTOR_MODE_P (src_mode));
1755 switch (src_mode)
1757 case E_V16QImode:
1758 gen = gen_aarch64_split_simd_movv16qi;
1759 break;
1760 case E_V8HImode:
1761 gen = gen_aarch64_split_simd_movv8hi;
1762 break;
1763 case E_V4SImode:
1764 gen = gen_aarch64_split_simd_movv4si;
1765 break;
1766 case E_V2DImode:
1767 gen = gen_aarch64_split_simd_movv2di;
1768 break;
1769 case E_V8HFmode:
1770 gen = gen_aarch64_split_simd_movv8hf;
1771 break;
1772 case E_V4SFmode:
1773 gen = gen_aarch64_split_simd_movv4sf;
1774 break;
1775 case E_V2DFmode:
1776 gen = gen_aarch64_split_simd_movv2df;
1777 break;
1778 default:
1779 gcc_unreachable ();
1782 emit_insn (gen (dst, src));
1783 return;
1787 bool
1788 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1789 machine_mode ymode, rtx y)
1791 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1792 gcc_assert (r != NULL);
1793 return rtx_equal_p (x, r);
1797 static rtx
1798 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1800 if (can_create_pseudo_p ())
1801 return force_reg (mode, value);
1802 else
1804 x = aarch64_emit_move (x, value);
1805 return x;
1810 static rtx
1811 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1812 HOST_WIDE_INT offset)
1814 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1816 rtx high;
1817 /* Load the full offset into a register. This
1818 might be improvable in the future. */
1819 high = GEN_INT (offset);
1820 offset = 0;
1821 high = aarch64_force_temporary (mode, temp, high);
1822 reg = aarch64_force_temporary (mode, temp,
1823 gen_rtx_PLUS (mode, high, reg));
1825 return plus_constant (mode, reg, offset);
1828 static int
1829 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1830 scalar_int_mode mode)
1832 int i;
1833 unsigned HOST_WIDE_INT val, val2, mask;
1834 int one_match, zero_match;
1835 int num_insns;
1837 val = INTVAL (imm);
1839 if (aarch64_move_imm (val, mode))
1841 if (generate)
1842 emit_insn (gen_rtx_SET (dest, imm));
1843 return 1;
1846 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1847 (with XXXX non-zero). In that case check to see if the move can be done in
1848 a smaller mode. */
1849 val2 = val & 0xffffffff;
1850 if (mode == DImode
1851 && aarch64_move_imm (val2, SImode)
1852 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1854 if (generate)
1855 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1857 /* Check if we have to emit a second instruction by checking to see
1858 if any of the upper 32 bits of the original DI mode value is set. */
1859 if (val == val2)
1860 return 1;
1862 i = (val >> 48) ? 48 : 32;
1864 if (generate)
1865 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1866 GEN_INT ((val >> i) & 0xffff)));
1868 return 2;
1871 if ((val >> 32) == 0 || mode == SImode)
1873 if (generate)
1875 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1876 if (mode == SImode)
1877 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1878 GEN_INT ((val >> 16) & 0xffff)));
1879 else
1880 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1881 GEN_INT ((val >> 16) & 0xffff)));
1883 return 2;
1886 /* Remaining cases are all for DImode. */
1888 mask = 0xffff;
1889 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1890 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1891 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1892 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1894 if (zero_match != 2 && one_match != 2)
1896 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1897 For a 64-bit bitmask try whether changing 16 bits to all ones or
1898 zeroes creates a valid bitmask. To check any repeated bitmask,
1899 try using 16 bits from the other 32-bit half of val. */
1901 for (i = 0; i < 64; i += 16, mask <<= 16)
1903 val2 = val & ~mask;
1904 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1905 break;
1906 val2 = val | mask;
1907 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1908 break;
1909 val2 = val2 & ~mask;
1910 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1911 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1912 break;
1914 if (i != 64)
1916 if (generate)
1918 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1919 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1920 GEN_INT ((val >> i) & 0xffff)));
1922 return 2;
1926 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1927 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1928 otherwise skip zero bits. */
1930 num_insns = 1;
1931 mask = 0xffff;
1932 val2 = one_match > zero_match ? ~val : val;
1933 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1935 if (generate)
1936 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1937 ? (val | ~(mask << i))
1938 : (val & (mask << i)))));
1939 for (i += 16; i < 64; i += 16)
1941 if ((val2 & (mask << i)) == 0)
1942 continue;
1943 if (generate)
1944 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1945 GEN_INT ((val >> i) & 0xffff)));
1946 num_insns ++;
1949 return num_insns;
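/* Worked example: for the DImode constant 0x123400005678 only two of the
   four 16-bit chunks are non-zero, so the value is built with a single
   mov plus one movk, along the lines of

	mov	x0, #0x5678
	movk	x0, #0x1234, lsl #32

   and the function returns 2 (x0 stands in for DEST here).  */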
1953 void
1954 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1956 machine_mode mode = GET_MODE (dest);
1958 gcc_assert (mode == SImode || mode == DImode);
1960 /* Check on what type of symbol it is. */
1961 scalar_int_mode int_mode;
1962 if ((GET_CODE (imm) == SYMBOL_REF
1963 || GET_CODE (imm) == LABEL_REF
1964 || GET_CODE (imm) == CONST)
1965 && is_a <scalar_int_mode> (mode, &int_mode))
1967 rtx mem, base, offset;
1968 enum aarch64_symbol_type sty;
1970 /* If we have (const (plus symbol offset)), separate out the offset
1971 before we start classifying the symbol. */
1972 split_const (imm, &base, &offset);
1974 sty = aarch64_classify_symbol (base, offset);
1975 switch (sty)
1977 case SYMBOL_FORCE_TO_MEM:
1978 if (offset != const0_rtx
1979 && targetm.cannot_force_const_mem (int_mode, imm))
1981 gcc_assert (can_create_pseudo_p ());
1982 base = aarch64_force_temporary (int_mode, dest, base);
1983 base = aarch64_add_offset (int_mode, NULL, base,
1984 INTVAL (offset));
1985 aarch64_emit_move (dest, base);
1986 return;
1989 mem = force_const_mem (ptr_mode, imm);
1990 gcc_assert (mem);
1992 /* If we aren't generating PC relative literals, then
1993 we need to expand the literal pool access carefully.
1994 This is something that needs to be done in a number
1995 of places, so could well live as a separate function. */
1996 if (!aarch64_pcrelative_literal_loads)
1998 gcc_assert (can_create_pseudo_p ());
1999 base = gen_reg_rtx (ptr_mode);
2000 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2001 if (ptr_mode != Pmode)
2002 base = convert_memory_address (Pmode, base);
2003 mem = gen_rtx_MEM (ptr_mode, base);
2006 if (int_mode != ptr_mode)
2007 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2009 emit_insn (gen_rtx_SET (dest, mem));
2011 return;
2013 case SYMBOL_SMALL_TLSGD:
2014 case SYMBOL_SMALL_TLSDESC:
2015 case SYMBOL_SMALL_TLSIE:
2016 case SYMBOL_SMALL_GOT_28K:
2017 case SYMBOL_SMALL_GOT_4G:
2018 case SYMBOL_TINY_GOT:
2019 case SYMBOL_TINY_TLSIE:
2020 if (offset != const0_rtx)
2022 gcc_assert(can_create_pseudo_p ());
2023 base = aarch64_force_temporary (int_mode, dest, base);
2024 base = aarch64_add_offset (int_mode, NULL, base,
2025 INTVAL (offset));
2026 aarch64_emit_move (dest, base);
2027 return;
2029 /* FALLTHRU */
2031 case SYMBOL_SMALL_ABSOLUTE:
2032 case SYMBOL_TINY_ABSOLUTE:
2033 case SYMBOL_TLSLE12:
2034 case SYMBOL_TLSLE24:
2035 case SYMBOL_TLSLE32:
2036 case SYMBOL_TLSLE48:
2037 aarch64_load_symref_appropriately (dest, imm, sty);
2038 return;
2040 default:
2041 gcc_unreachable ();
2045 if (!CONST_INT_P (imm))
2047 if (GET_CODE (imm) == HIGH)
2048 emit_insn (gen_rtx_SET (dest, imm));
2049 else
2051 rtx mem = force_const_mem (mode, imm);
2052 gcc_assert (mem);
2053 emit_insn (gen_rtx_SET (dest, mem));
2056 return;
2059 aarch64_internal_mov_immediate (dest, imm, true,
2060 as_a <scalar_int_mode> (mode));
2063 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2064 temporary value if necessary. FRAME_RELATED_P should be true if
2065 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2066 to the generated instructions. If SCRATCHREG is known to hold
2067 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2068 immediate again.
2070 Since this function may be used to adjust the stack pointer, we must
2071 ensure that it cannot cause transient stack deallocation (for example
2072 by first incrementing SP and then decrementing when adjusting by a
2073 large immediate). */
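/* Illustrative expansions of the three cases handled below (a sketch, not
   from the original sources), assuming the register being adjusted is x1
   and the scratch register is x16:

       delta = 16          ->  add  x1, x1, #16
       delta = 0x123456    ->  add  x1, x1, #0x456
                               add  x1, x1, #0x123000
       delta = 0x12345678  ->  mov  x16, #0x5678
                               movk x16, #0x1234, lsl #16
                               add  x1, x1, x16  */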
2075 static void
2076 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2077 int scratchreg, HOST_WIDE_INT delta,
2078 bool frame_related_p, bool emit_move_imm)
2080 HOST_WIDE_INT mdelta = abs_hwi (delta);
2081 rtx this_rtx = gen_rtx_REG (mode, regnum);
2082 rtx_insn *insn;
2084 if (!mdelta)
2085 return;
2087 /* Single instruction adjustment. */
2088 if (aarch64_uimm12_shift (mdelta))
2090 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2091 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2092 return;
2095 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2096 Only do this if MDELTA is not representable as a move immediate;
2097 when it is, a move followed by an add/sub is better. */
2098 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2100 HOST_WIDE_INT low_off = mdelta & 0xfff;
2102 low_off = delta < 0 ? -low_off : low_off;
2103 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2104 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2105 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2106 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2107 return;
2110 /* Emit a move immediate if required and an addition/subtraction. */
2111 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2112 if (emit_move_imm)
2113 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2114 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2115 : gen_add2_insn (this_rtx, scratch_rtx));
2116 if (frame_related_p)
2118 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2119 rtx adj = plus_constant (mode, this_rtx, delta);
2120 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2124 static inline void
2125 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2126 HOST_WIDE_INT delta)
2128 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2131 static inline void
2132 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2134 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2135 true, emit_move_imm);
2138 static inline void
2139 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2141 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2142 frame_related_p, true);
2145 static bool
2146 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2147 tree exp ATTRIBUTE_UNUSED)
2149 /* Currently, always true. */
2150 return true;
2153 /* Implement TARGET_PASS_BY_REFERENCE. */
2155 static bool
2156 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2157 machine_mode mode,
2158 const_tree type,
2159 bool named ATTRIBUTE_UNUSED)
2161 HOST_WIDE_INT size;
2162 machine_mode dummymode;
2163 int nregs;
2165 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2166 size = (mode == BLKmode && type)
2167 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2169 /* Aggregates are passed by reference based on their size. */
2170 if (type && AGGREGATE_TYPE_P (type))
2172 size = int_size_in_bytes (type);
2175 /* Variable sized arguments are always passed by reference. */
2176 if (size < 0)
2177 return true;
2179 /* Can this be a candidate to be passed in fp/simd register(s)? */
2180 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2181 &dummymode, &nregs,
2182 NULL))
2183 return false;
2185 /* Arguments which are variable sized or larger than 2 registers are
2186 passed by reference unless they are a homogeneous floating-point
2187 aggregate. */
2188 return size > 2 * UNITS_PER_WORD;
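/* For illustration (not part of the original sources): a
   struct { double a, b, c, d; } is a four-member HFA and therefore a
   SIMD/FP candidate, so it is not passed by reference, whereas a
   struct { long a, b, c; } occupies 24 bytes -- more than two X
   registers -- and is passed by reference.  */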
2191 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2192 static bool
2193 aarch64_return_in_msb (const_tree valtype)
2195 machine_mode dummy_mode;
2196 int dummy_int;
2198 /* Never happens in little-endian mode. */
2199 if (!BYTES_BIG_ENDIAN)
2200 return false;
2202 /* Only composite types smaller than or equal to 16 bytes can
2203 be potentially returned in registers. */
2204 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2205 || int_size_in_bytes (valtype) <= 0
2206 || int_size_in_bytes (valtype) > 16)
2207 return false;
2209 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2210 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2211 is always passed/returned in the least significant bits of fp/simd
2212 register(s). */
2213 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2214 &dummy_mode, &dummy_int, NULL))
2215 return false;
2217 return true;
2220 /* Implement TARGET_FUNCTION_VALUE.
2221 Define how to find the value returned by a function. */
2223 static rtx
2224 aarch64_function_value (const_tree type, const_tree func,
2225 bool outgoing ATTRIBUTE_UNUSED)
2227 machine_mode mode;
2228 int unsignedp;
2229 int count;
2230 machine_mode ag_mode;
2232 mode = TYPE_MODE (type);
2233 if (INTEGRAL_TYPE_P (type))
2234 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2236 if (aarch64_return_in_msb (type))
2238 HOST_WIDE_INT size = int_size_in_bytes (type);
2240 if (size % UNITS_PER_WORD != 0)
2242 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2243 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2247 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2248 &ag_mode, &count, NULL))
2250 if (!aarch64_composite_type_p (type, mode))
2252 gcc_assert (count == 1 && mode == ag_mode);
2253 return gen_rtx_REG (mode, V0_REGNUM);
2255 else
2257 int i;
2258 rtx par;
2260 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2261 for (i = 0; i < count; i++)
2263 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2264 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2265 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2266 XVECEXP (par, 0, i) = tmp;
2268 return par;
2271 else
2272 return gen_rtx_REG (mode, R0_REGNUM);
2275 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2276 Return true if REGNO is the number of a hard register in which the values
2277 of a called function may come back. */
2279 static bool
2280 aarch64_function_value_regno_p (const unsigned int regno)
2282 /* Maximum of 16 bytes can be returned in the general registers. Examples
2283 of 16-byte return values are: 128-bit integers and 16-byte small
2284 structures (excluding homogeneous floating-point aggregates). */
2285 if (regno == R0_REGNUM || regno == R1_REGNUM)
2286 return true;
2288 /* Up to four fp/simd registers can return a function value, e.g. a
2289 homogeneous floating-point aggregate having four members. */
2290 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2291 return TARGET_FLOAT;
2293 return false;
2296 /* Implement TARGET_RETURN_IN_MEMORY.
2298 If the type T of the result of a function is such that
2299 void func (T arg)
2300 would require that arg be passed as a value in a register (or set of
2301 registers) according to the parameter passing rules, then the result
2302 is returned in the same registers as would be used for such an
2303 argument. */
2305 static bool
2306 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2308 HOST_WIDE_INT size;
2309 machine_mode ag_mode;
2310 int count;
2312 if (!AGGREGATE_TYPE_P (type)
2313 && TREE_CODE (type) != COMPLEX_TYPE
2314 && TREE_CODE (type) != VECTOR_TYPE)
2315 /* Simple scalar types always returned in registers. */
2316 return false;
2318 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2319 type,
2320 &ag_mode,
2321 &count,
2322 NULL))
2323 return false;
2325 /* Types larger than 2 registers returned in memory. */
2326 size = int_size_in_bytes (type);
2327 return (size < 0 || size > 2 * UNITS_PER_WORD);
2330 static bool
2331 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2332 const_tree type, int *nregs)
2334 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2335 return aarch64_vfp_is_call_or_return_candidate (mode,
2336 type,
2337 &pcum->aapcs_vfp_rmode,
2338 nregs,
2339 NULL);
2342 /* Given MODE and TYPE of a function argument, return the alignment in
2343 bits. The idea is to suppress any stronger alignment requested by
2344 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2345 This is a helper function for local use only. */
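/* A few illustrative results (a sketch, not from the original sources):
   a plain "int" argument yields 32 bits; a struct whose only field is an
   __int128 yields 128 bits, which later triggers the C.8 even-register
   rule and the 16-byte stack alignment below; a struct containing only a
   "char" field yields 8 bits even if the user over-aligns the struct
   type itself, because only the field alignments are inspected.  */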
2347 static unsigned int
2348 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2350 if (!type)
2351 return GET_MODE_ALIGNMENT (mode);
2353 if (integer_zerop (TYPE_SIZE (type)))
2354 return 0;
2356 gcc_assert (TYPE_MODE (type) == mode);
2358 if (!AGGREGATE_TYPE_P (type))
2359 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2361 if (TREE_CODE (type) == ARRAY_TYPE)
2362 return TYPE_ALIGN (TREE_TYPE (type));
2364 unsigned int alignment = 0;
2365 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2366 if (TREE_CODE (field) == FIELD_DECL)
2367 alignment = std::max (alignment, DECL_ALIGN (field));
2369 return alignment;
2372 /* Layout a function argument according to the AAPCS64 rules. The rule
2373 numbers refer to the rule numbers in the AAPCS64. */
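/* A worked illustration of these rules (a sketch, not part of the
   original sources).  For "void f (int a, struct { float s, t; } b,
   __int128 c)" the code below allocates:
     a -> w0, the first general register;
     b -> a two-member HFA, described by a PARALLEL of s0 and s1;
     c -> 16-byte aligned, so C.8 rounds the NGRN from 1 up to 2 and the
          value is passed in the register pair x2:x3.  */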
2375 static void
2376 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2377 const_tree type,
2378 bool named ATTRIBUTE_UNUSED)
2380 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2381 int ncrn, nvrn, nregs;
2382 bool allocate_ncrn, allocate_nvrn;
2383 HOST_WIDE_INT size;
2385 /* We need to do this once per argument. */
2386 if (pcum->aapcs_arg_processed)
2387 return;
2389 pcum->aapcs_arg_processed = true;
2391 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2392 size
2393 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2394 UNITS_PER_WORD);
2396 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2397 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2398 mode,
2399 type,
2400 &nregs);
2402 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2403 The following code thus handles passing by SIMD/FP registers first. */
2405 nvrn = pcum->aapcs_nvrn;
2407 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2408 and homogeneous short-vector aggregates (HVA). */
2409 if (allocate_nvrn)
2411 if (!TARGET_FLOAT)
2412 aarch64_err_no_fpadvsimd (mode, "argument");
2414 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2416 pcum->aapcs_nextnvrn = nvrn + nregs;
2417 if (!aarch64_composite_type_p (type, mode))
2419 gcc_assert (nregs == 1);
2420 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2422 else
2424 rtx par;
2425 int i;
2426 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2427 for (i = 0; i < nregs; i++)
2429 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2430 V0_REGNUM + nvrn + i);
2431 tmp = gen_rtx_EXPR_LIST
2432 (VOIDmode, tmp,
2433 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2434 XVECEXP (par, 0, i) = tmp;
2436 pcum->aapcs_reg = par;
2438 return;
2440 else
2442 /* C.3 NSRN is set to 8. */
2443 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2444 goto on_stack;
2448 ncrn = pcum->aapcs_ncrn;
2449 nregs = size / UNITS_PER_WORD;
2451 /* C6 - C9, though the sign and zero extension semantics are
2452 handled elsewhere. This is the case where the argument fits
2453 entirely in general registers. */
2454 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2457 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2459 /* C.8 if the argument has an alignment of 16 then the NGRN is
2460 rounded up to the next even number. */
2461 if (nregs == 2
2462 && ncrn % 2
2463 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2464 comparison is there because for > 16 * BITS_PER_UNIT
2465 alignment nregs should be > 2 and therefore it should be
2466 passed by reference rather than value. */
2467 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2469 ++ncrn;
2470 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2473 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2474 A reg is still generated for it, but the caller should be smart
2475 enough not to use it. */
2476 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2477 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2478 else
2480 rtx par;
2481 int i;
2483 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2484 for (i = 0; i < nregs; i++)
2486 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2487 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2488 GEN_INT (i * UNITS_PER_WORD));
2489 XVECEXP (par, 0, i) = tmp;
2491 pcum->aapcs_reg = par;
2494 pcum->aapcs_nextncrn = ncrn + nregs;
2495 return;
2498 /* C.11 */
2499 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2501 /* The argument is passed on stack; record the needed number of words for
2502 this argument and align the total size if necessary. */
2503 on_stack:
2504 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2506 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2507 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2508 16 / UNITS_PER_WORD);
2509 return;
2512 /* Implement TARGET_FUNCTION_ARG. */
2514 static rtx
2515 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2516 const_tree type, bool named)
2518 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2519 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2521 if (mode == VOIDmode)
2522 return NULL_RTX;
2524 aarch64_layout_arg (pcum_v, mode, type, named);
2525 return pcum->aapcs_reg;
2528 void
2529 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2530 const_tree fntype ATTRIBUTE_UNUSED,
2531 rtx libname ATTRIBUTE_UNUSED,
2532 const_tree fndecl ATTRIBUTE_UNUSED,
2533 unsigned n_named ATTRIBUTE_UNUSED)
2535 pcum->aapcs_ncrn = 0;
2536 pcum->aapcs_nvrn = 0;
2537 pcum->aapcs_nextncrn = 0;
2538 pcum->aapcs_nextnvrn = 0;
2539 pcum->pcs_variant = ARM_PCS_AAPCS64;
2540 pcum->aapcs_reg = NULL_RTX;
2541 pcum->aapcs_arg_processed = false;
2542 pcum->aapcs_stack_words = 0;
2543 pcum->aapcs_stack_size = 0;
2545 if (!TARGET_FLOAT
2546 && fndecl && TREE_PUBLIC (fndecl)
2547 && fntype && fntype != error_mark_node)
2549 const_tree type = TREE_TYPE (fntype);
2550 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2551 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2552 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2553 &mode, &nregs, NULL))
2554 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2556 return;
2559 static void
2560 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2561 machine_mode mode,
2562 const_tree type,
2563 bool named)
2565 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2566 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2568 aarch64_layout_arg (pcum_v, mode, type, named);
2569 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2570 != (pcum->aapcs_stack_words != 0));
2571 pcum->aapcs_arg_processed = false;
2572 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2573 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2574 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2575 pcum->aapcs_stack_words = 0;
2576 pcum->aapcs_reg = NULL_RTX;
2580 bool
2581 aarch64_function_arg_regno_p (unsigned regno)
2583 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2584 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2587 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2588 PARM_BOUNDARY bits of alignment, but will be given anything up
2589 to STACK_BOUNDARY bits if the type requires it. This makes sure
2590 that both before and after the layout of each argument, the Next
2591 Stacked Argument Address (NSAA) will have a minimum alignment of
2592 8 bytes. */
2594 static unsigned int
2595 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2597 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2598 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
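/* For example (illustrative only): a "char" argument has a natural
   alignment of 8 bits and is bumped up to PARM_BOUNDARY (64 bits), while
   an aggregate containing a 32-byte-aligned field yields 256 bits and is
   clamped down to STACK_BOUNDARY (128 bits), so no argument ever forces
   more than 16-byte alignment of the NSAA.  */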
2601 /* Implement TARGET_FUNCTION_ARG_PADDING.
2603 Small aggregate types are placed in the lowest memory address.
2605 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2607 static pad_direction
2608 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2610 /* On little-endian targets, the least significant byte of every stack
2611 argument is passed at the lowest byte address of the stack slot. */
2612 if (!BYTES_BIG_ENDIAN)
2613 return PAD_UPWARD;
2615 /* Otherwise, integral, floating-point and pointer types are padded downward:
2616 the least significant byte of a stack argument is passed at the highest
2617 byte address of the stack slot. */
2618 if (type
2619 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2620 || POINTER_TYPE_P (type))
2621 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2622 return PAD_DOWNWARD;
2624 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2625 return PAD_UPWARD;
2628 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2630 It specifies padding for the last (which may also be the only)
2631 element of a block move between registers and memory. Assuming
2632 the block is in memory, padding upward means that the last element
2633 is padded after its most significant byte, while with downward
2634 padding the last element is padded on its least significant byte
2635 side.
2637 Small aggregates and small complex types are always padded
2638 upwards.
2640 We don't need to worry about homogeneous floating-point or
2641 short-vector aggregates; their move is not affected by the
2642 padding direction determined here. Regardless of endianness,
2643 each element of such an aggregate is put in the least
2644 significant bits of a fp/simd register.
2646 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2647 register has useful data, and return the opposite if the most
2648 significant byte does. */
2650 bool
2651 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2652 bool first ATTRIBUTE_UNUSED)
2655 /* Small composite types are always padded upward. */
2656 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2658 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2659 : GET_MODE_SIZE (mode));
2660 if (size < 2 * UNITS_PER_WORD)
2661 return true;
2664 /* Otherwise, use the default padding. */
2665 return !BYTES_BIG_ENDIAN;
2668 static scalar_int_mode
2669 aarch64_libgcc_cmp_return_mode (void)
2671 return SImode;
2674 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2676 /* We use the 12-bit shifted immediate arithmetic instructions so values
2677 must be multiple of (1 << 12), i.e. 4096. */
2678 #define ARITH_FACTOR 4096
2680 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2681 #error Cannot use simple address calculation for stack probing
2682 #endif
2684 /* The pair of scratch registers used for stack probing. */
2685 #define PROBE_STACK_FIRST_REG 9
2686 #define PROBE_STACK_SECOND_REG 10
2688 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2689 inclusive. These are offsets from the current stack pointer. */
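/* An illustrative expansion of the simplest case below (a sketch, not
   from the original sources), assuming PROBE_INTERVAL is 4096, FIRST is
   8192 and SIZE is 256, with x9 as PROBE_STACK_FIRST_REG:

       sub  x9, sp, #12288     // sp - (first + ROUND_UP (size, 4096))
       str  xzr, [x9, #3840]   // probe at sp - (first + size)

   Larger sizes use the unrolled sequence or, beyond four intervals, the
   probe_stack_range loop.  */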
2691 static void
2692 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2694 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2696 /* See the same assertion on PROBE_INTERVAL above. */
2697 gcc_assert ((first % ARITH_FACTOR) == 0);
2699 /* See if we have a constant small number of probes to generate. If so,
2700 that's the easy case. */
2701 if (size <= PROBE_INTERVAL)
2703 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2705 emit_set_insn (reg1,
2706 plus_constant (Pmode,
2707 stack_pointer_rtx, -(first + base)));
2708 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2711 /* The run-time loop is made up of 8 insns in the generic case while the
2712 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2713 else if (size <= 4 * PROBE_INTERVAL)
2715 HOST_WIDE_INT i, rem;
2717 emit_set_insn (reg1,
2718 plus_constant (Pmode,
2719 stack_pointer_rtx,
2720 -(first + PROBE_INTERVAL)));
2721 emit_stack_probe (reg1);
2723 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2724 it exceeds SIZE. If only two probes are needed, this will not
2725 generate any code. Then probe at FIRST + SIZE. */
2726 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2728 emit_set_insn (reg1,
2729 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2730 emit_stack_probe (reg1);
2733 rem = size - (i - PROBE_INTERVAL);
2734 if (rem > 256)
2736 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2738 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2739 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2741 else
2742 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2745 /* Otherwise, do the same as above, but in a loop. Note that we must be
2746 extra careful with variables wrapping around because we might be at
2747 the very top (or the very bottom) of the address space and we have
2748 to be able to handle this case properly; in particular, we use an
2749 equality test for the loop condition. */
2750 else
2752 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2754 /* Step 1: round SIZE to the previous multiple of the interval. */
2756 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2759 /* Step 2: compute initial and final value of the loop counter. */
2761 /* TEST_ADDR = SP + FIRST. */
2762 emit_set_insn (reg1,
2763 plus_constant (Pmode, stack_pointer_rtx, -first));
2765 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2766 HOST_WIDE_INT adjustment = - (first + rounded_size);
2767 if (! aarch64_uimm12_shift (adjustment))
2769 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2770 true, Pmode);
2771 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2773 else
2775 emit_set_insn (reg2,
2776 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2779 /* Step 3: the loop
2783 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2784 probe at TEST_ADDR
2786 while (TEST_ADDR != LAST_ADDR)
2788 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2789 until it is equal to ROUNDED_SIZE. */
2791 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2794 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2795 that SIZE is equal to ROUNDED_SIZE. */
2797 if (size != rounded_size)
2799 HOST_WIDE_INT rem = size - rounded_size;
2801 if (rem > 256)
2803 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2805 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2806 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2808 else
2809 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2813 /* Make sure nothing is scheduled before we are done. */
2814 emit_insn (gen_blockage ());
2817 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2818 absolute addresses. */
2820 const char *
2821 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2823 static int labelno = 0;
2824 char loop_lab[32];
2825 rtx xops[2];
2827 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2829 /* Loop. */
2830 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2832 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2833 xops[0] = reg1;
2834 xops[1] = GEN_INT (PROBE_INTERVAL);
2835 output_asm_insn ("sub\t%0, %0, %1", xops);
2837 /* Probe at TEST_ADDR. */
2838 output_asm_insn ("str\txzr, [%0]", xops);
2840 /* Test if TEST_ADDR == LAST_ADDR. */
2841 xops[1] = reg2;
2842 output_asm_insn ("cmp\t%0, %1", xops);
2844 /* Branch. */
2845 fputs ("\tb.ne\t", asm_out_file);
2846 assemble_name_raw (asm_out_file, loop_lab);
2847 fputc ('\n', asm_out_file);
2849 return "";
2852 static bool
2853 aarch64_frame_pointer_required (void)
2855 /* In aarch64_override_options_after_change
2856 flag_omit_leaf_frame_pointer turns off the frame pointer by
2857 default. Turn it back on now if the function is not
2858 a leaf function. */
2859 if (flag_omit_leaf_frame_pointer
2860 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2861 return true;
2863 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2864 if (crtl->calls_eh_return)
2865 return true;
2867 return false;
2870 /* Mark the registers that need to be saved by the callee and calculate
2871 the size of the callee-saved registers area and frame record (both FP
2872 and LR may be omitted). */
2873 static void
2874 aarch64_layout_frame (void)
2876 HOST_WIDE_INT offset = 0;
2877 int regno, last_fp_reg = INVALID_REGNUM;
2879 if (reload_completed && cfun->machine->frame.laid_out)
2880 return;
2882 #define SLOT_NOT_REQUIRED (-2)
2883 #define SLOT_REQUIRED (-1)
2885 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2886 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2888 /* First mark all the registers that really need to be saved... */
2889 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2890 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2892 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2893 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2895 /* ... that includes the eh data registers (if needed)... */
2896 if (crtl->calls_eh_return)
2897 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2898 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2899 = SLOT_REQUIRED;
2901 /* ... and any callee saved register that dataflow says is live. */
2902 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2903 if (df_regs_ever_live_p (regno)
2904 && (regno == R30_REGNUM
2905 || !call_used_regs[regno]))
2906 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2908 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2909 if (df_regs_ever_live_p (regno)
2910 && !call_used_regs[regno])
2912 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2913 last_fp_reg = regno;
2916 if (frame_pointer_needed)
2918 /* FP and LR are placed in the linkage record. */
2919 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2920 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2921 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2922 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2923 offset += 2 * UNITS_PER_WORD;
2926 /* Now assign stack slots for them. */
2927 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2928 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2930 cfun->machine->frame.reg_offset[regno] = offset;
2931 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2932 cfun->machine->frame.wb_candidate1 = regno;
2933 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2934 cfun->machine->frame.wb_candidate2 = regno;
2935 offset += UNITS_PER_WORD;
2938 HOST_WIDE_INT max_int_offset = offset;
2939 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2940 bool has_align_gap = offset != max_int_offset;
2942 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2943 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2945 /* If there is an alignment gap between integer and fp callee-saves,
2946 allocate the last fp register to it if possible. */
2947 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2949 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2950 break;
2953 cfun->machine->frame.reg_offset[regno] = offset;
2954 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2955 cfun->machine->frame.wb_candidate1 = regno;
2956 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2957 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2958 cfun->machine->frame.wb_candidate2 = regno;
2959 offset += UNITS_PER_WORD;
2962 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2964 cfun->machine->frame.saved_regs_size = offset;
2966 HOST_WIDE_INT varargs_and_saved_regs_size
2967 = offset + cfun->machine->frame.saved_varargs_size;
2969 cfun->machine->frame.hard_fp_offset
2970 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2971 STACK_BOUNDARY / BITS_PER_UNIT);
2973 cfun->machine->frame.frame_size
2974 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2975 + crtl->outgoing_args_size,
2976 STACK_BOUNDARY / BITS_PER_UNIT);
2978 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2980 cfun->machine->frame.initial_adjust = 0;
2981 cfun->machine->frame.final_adjust = 0;
2982 cfun->machine->frame.callee_adjust = 0;
2983 cfun->machine->frame.callee_offset = 0;
2985 HOST_WIDE_INT max_push_offset = 0;
2986 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2987 max_push_offset = 512;
2988 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2989 max_push_offset = 256;
2991 if (cfun->machine->frame.frame_size < max_push_offset
2992 && crtl->outgoing_args_size == 0)
2994 /* Simple, small frame with no outgoing arguments:
2995 stp reg1, reg2, [sp, -frame_size]!
2996 stp reg3, reg4, [sp, 16] */
2997 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2999 else if ((crtl->outgoing_args_size
3000 + cfun->machine->frame.saved_regs_size < 512)
3001 && !(cfun->calls_alloca
3002 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3004 /* Frame with small outgoing arguments:
3005 sub sp, sp, frame_size
3006 stp reg1, reg2, [sp, outgoing_args_size]
3007 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3008 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3009 cfun->machine->frame.callee_offset
3010 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3012 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3014 /* Frame with large outgoing arguments but a small local area:
3015 stp reg1, reg2, [sp, -hard_fp_offset]!
3016 stp reg3, reg4, [sp, 16]
3017 sub sp, sp, outgoing_args_size */
3018 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3019 cfun->machine->frame.final_adjust
3020 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3022 else if (!frame_pointer_needed
3023 && varargs_and_saved_regs_size < max_push_offset)
3025 /* Frame with large local area and outgoing arguments (this pushes the
3026 callee-saves first, followed by the locals and outgoing area):
3027 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3028 stp reg3, reg4, [sp, 16]
3029 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3030 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3031 cfun->machine->frame.final_adjust
3032 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3033 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3034 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3036 else
3038 /* Frame with large local area and outgoing arguments using frame pointer:
3039 sub sp, sp, hard_fp_offset
3040 stp x29, x30, [sp, 0]
3041 add x29, sp, 0
3042 stp reg3, reg4, [sp, 16]
3043 sub sp, sp, outgoing_args_size */
3044 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3045 cfun->machine->frame.final_adjust
3046 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3049 cfun->machine->frame.laid_out = true;
3052 /* Return true if the register REGNO is saved on entry to
3053 the current function. */
3055 static bool
3056 aarch64_register_saved_on_entry (int regno)
3058 return cfun->machine->frame.reg_offset[regno] >= 0;
3061 /* Return the next register, from REGNO up to LIMIT, that the callee
3062 needs to save. */
3064 static unsigned
3065 aarch64_next_callee_save (unsigned regno, unsigned limit)
3067 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3068 regno++;
3069 return regno;
3072 /* Push the register number REGNO of mode MODE to the stack with write-back
3073 adjusting the stack by ADJUSTMENT. */
3075 static void
3076 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3077 HOST_WIDE_INT adjustment)
3079 rtx base_rtx = stack_pointer_rtx;
3080 rtx insn, reg, mem;
3082 reg = gen_rtx_REG (mode, regno);
3083 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3084 plus_constant (Pmode, base_rtx, -adjustment));
3085 mem = gen_frame_mem (mode, mem);
3087 insn = emit_move_insn (mem, reg);
3088 RTX_FRAME_RELATED_P (insn) = 1;
3091 /* Generate and return an instruction to store the pair of registers
3092 REG and REG2 of mode MODE to location BASE with write-back adjusting
3093 the stack location BASE by ADJUSTMENT. */
3095 static rtx
3096 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3097 HOST_WIDE_INT adjustment)
3099 switch (mode)
3101 case E_DImode:
3102 return gen_storewb_pairdi_di (base, base, reg, reg2,
3103 GEN_INT (-adjustment),
3104 GEN_INT (UNITS_PER_WORD - adjustment));
3105 case E_DFmode:
3106 return gen_storewb_pairdf_di (base, base, reg, reg2,
3107 GEN_INT (-adjustment),
3108 GEN_INT (UNITS_PER_WORD - adjustment));
3109 default:
3110 gcc_unreachable ();
3114 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3115 stack pointer by ADJUSTMENT. */
3117 static void
3118 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3120 rtx_insn *insn;
3121 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3123 if (regno2 == INVALID_REGNUM)
3124 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3126 rtx reg1 = gen_rtx_REG (mode, regno1);
3127 rtx reg2 = gen_rtx_REG (mode, regno2);
3129 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3130 reg2, adjustment));
3131 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3132 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3133 RTX_FRAME_RELATED_P (insn) = 1;
3136 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3137 adjusting it by ADJUSTMENT afterwards. */
3139 static rtx
3140 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3141 HOST_WIDE_INT adjustment)
3143 switch (mode)
3145 case E_DImode:
3146 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3147 GEN_INT (UNITS_PER_WORD));
3148 case E_DFmode:
3149 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3150 GEN_INT (UNITS_PER_WORD));
3151 default:
3152 gcc_unreachable ();
3156 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3157 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3158 into CFI_OPS. */
3160 static void
3161 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3162 rtx *cfi_ops)
3164 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3165 rtx reg1 = gen_rtx_REG (mode, regno1);
3167 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3169 if (regno2 == INVALID_REGNUM)
3171 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3172 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3173 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3175 else
3177 rtx reg2 = gen_rtx_REG (mode, regno2);
3178 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3179 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3180 reg2, adjustment));
3184 /* Generate and return a store pair instruction of mode MODE to store
3185 register REG1 to MEM1 and register REG2 to MEM2. */
3187 static rtx
3188 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3189 rtx reg2)
3191 switch (mode)
3193 case E_DImode:
3194 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3196 case E_DFmode:
3197 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3199 default:
3200 gcc_unreachable ();
3204 /* Generate and return a load pair instruction of mode MODE to load register
3205 REG1 from MEM1 and register REG2 from MEM2. */
3207 static rtx
3208 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3209 rtx mem2)
3211 switch (mode)
3213 case E_DImode:
3214 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3216 case E_DFmode:
3217 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3219 default:
3220 gcc_unreachable ();
3224 /* Return TRUE if return address signing should be enabled for the current
3225 function, otherwise return FALSE. */
3227 bool
3228 aarch64_return_address_signing_enabled (void)
3230 /* This function should only be called after the frame is laid out. */
3231 gcc_assert (cfun->machine->frame.laid_out);
3233 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3234 if its LR is pushed onto the stack. */
3235 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3236 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3237 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
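/* For instance (illustrative, not from the original sources), with
   -msign-return-address=non-leaf a function that saves LR on the stack
   gets a PACIASP in its prologue and either an AUTIASP or a combined
   RETAA in its epilogue, while a leaf function that keeps LR in a
   register is left unsigned.  */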
3240 /* Emit code to save the callee-saved registers from register number START
3241 to LIMIT to the stack at the location starting at offset START_OFFSET,
3242 skipping any write-back candidates if SKIP_WB is true. */
3244 static void
3245 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3246 unsigned start, unsigned limit, bool skip_wb)
3248 rtx_insn *insn;
3249 unsigned regno;
3250 unsigned regno2;
3252 for (regno = aarch64_next_callee_save (start, limit);
3253 regno <= limit;
3254 regno = aarch64_next_callee_save (regno + 1, limit))
3256 rtx reg, mem;
3257 HOST_WIDE_INT offset;
3259 if (skip_wb
3260 && (regno == cfun->machine->frame.wb_candidate1
3261 || regno == cfun->machine->frame.wb_candidate2))
3262 continue;
3264 if (cfun->machine->reg_is_wrapped_separately[regno])
3265 continue;
3267 reg = gen_rtx_REG (mode, regno);
3268 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3269 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3270 offset));
3272 regno2 = aarch64_next_callee_save (regno + 1, limit);
3274 if (regno2 <= limit
3275 && !cfun->machine->reg_is_wrapped_separately[regno2]
3276 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3277 == cfun->machine->frame.reg_offset[regno2]))
3280 rtx reg2 = gen_rtx_REG (mode, regno2);
3281 rtx mem2;
3283 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3284 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3285 offset));
3286 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3287 reg2));
3289 /* The first part of a frame-related parallel insn is
3290 always assumed to be relevant to the frame
3291 calculations; subsequent parts are only
3292 frame-related if explicitly marked. */
3293 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3294 regno = regno2;
3296 else
3297 insn = emit_move_insn (mem, reg);
3299 RTX_FRAME_RELATED_P (insn) = 1;
3303 /* Emit code to restore the callee registers of mode MODE from register
3304 number START up to and including LIMIT. Restore from the stack offset
3305 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3306 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3308 static void
3309 aarch64_restore_callee_saves (machine_mode mode,
3310 HOST_WIDE_INT start_offset, unsigned start,
3311 unsigned limit, bool skip_wb, rtx *cfi_ops)
3313 rtx base_rtx = stack_pointer_rtx;
3314 unsigned regno;
3315 unsigned regno2;
3316 HOST_WIDE_INT offset;
3318 for (regno = aarch64_next_callee_save (start, limit);
3319 regno <= limit;
3320 regno = aarch64_next_callee_save (regno + 1, limit))
3322 if (cfun->machine->reg_is_wrapped_separately[regno])
3323 continue;
3325 rtx reg, mem;
3327 if (skip_wb
3328 && (regno == cfun->machine->frame.wb_candidate1
3329 || regno == cfun->machine->frame.wb_candidate2))
3330 continue;
3332 reg = gen_rtx_REG (mode, regno);
3333 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3334 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3336 regno2 = aarch64_next_callee_save (regno + 1, limit);
3338 if (regno2 <= limit
3339 && !cfun->machine->reg_is_wrapped_separately[regno2]
3340 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3341 == cfun->machine->frame.reg_offset[regno2]))
3343 rtx reg2 = gen_rtx_REG (mode, regno2);
3344 rtx mem2;
3346 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3347 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3348 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3350 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3351 regno = regno2;
3353 else
3354 emit_move_insn (reg, mem);
3355 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3359 static inline bool
3360 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3361 HOST_WIDE_INT offset)
3363 return offset >= -256 && offset < 256;
3366 static inline bool
3367 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3369 return (offset >= 0
3370 && offset < 4096 * GET_MODE_SIZE (mode)
3371 && offset % GET_MODE_SIZE (mode) == 0);
3374 bool
3375 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3377 return (offset >= -64 * GET_MODE_SIZE (mode)
3378 && offset < 64 * GET_MODE_SIZE (mode)
3379 && offset % GET_MODE_SIZE (mode) == 0);
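/* As a worked example (not part of the original sources), for DImode
   (8-byte) accesses these three predicates accept, respectively:
   unscaled signed offsets in [-256, 255], unsigned scaled offsets in
   [0, 32760] that are multiples of 8, and signed scaled offsets in
   [-512, 504] that are multiples of 8 -- matching the immediate ranges
   of the LDUR/STUR, LDR/STR and LDP/STP forms.  */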
3382 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3384 static sbitmap
3385 aarch64_get_separate_components (void)
3387 aarch64_layout_frame ();
3389 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3390 bitmap_clear (components);
3392 /* The registers we need saved to the frame. */
3393 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3394 if (aarch64_register_saved_on_entry (regno))
3396 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3397 if (!frame_pointer_needed)
3398 offset += cfun->machine->frame.frame_size
3399 - cfun->machine->frame.hard_fp_offset;
3400 /* Check that we can access the stack slot of the register with one
3401 direct load with no adjustments needed. */
3402 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3403 bitmap_set_bit (components, regno);
3406 /* Don't mess with the hard frame pointer. */
3407 if (frame_pointer_needed)
3408 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3410 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3411 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3412 /* If aarch64_layout_frame has chosen registers to store/restore with
3413 writeback don't interfere with them to avoid having to output explicit
3414 stack adjustment instructions. */
3415 if (reg2 != INVALID_REGNUM)
3416 bitmap_clear_bit (components, reg2);
3417 if (reg1 != INVALID_REGNUM)
3418 bitmap_clear_bit (components, reg1);
3420 bitmap_clear_bit (components, LR_REGNUM);
3421 bitmap_clear_bit (components, SP_REGNUM);
3423 return components;
3426 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3428 static sbitmap
3429 aarch64_components_for_bb (basic_block bb)
3431 bitmap in = DF_LIVE_IN (bb);
3432 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3433 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3435 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3436 bitmap_clear (components);
3438 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3439 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3440 if ((!call_used_regs[regno])
3441 && (bitmap_bit_p (in, regno)
3442 || bitmap_bit_p (gen, regno)
3443 || bitmap_bit_p (kill, regno)))
3444 bitmap_set_bit (components, regno);
3446 return components;
3449 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3450 Nothing to do for aarch64. */
3452 static void
3453 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3457 /* Return the next set bit in BMP from START onwards. Return the total number
3458 of bits in BMP if no set bit is found at or after START. */
3460 static unsigned int
3461 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3463 unsigned int nbits = SBITMAP_SIZE (bmp);
3464 if (start == nbits)
3465 return start;
3467 gcc_assert (start < nbits);
3468 for (unsigned int i = start; i < nbits; i++)
3469 if (bitmap_bit_p (bmp, i))
3470 return i;
3472 return nbits;
3475 /* Do the work for aarch64_emit_prologue_components and
3476 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3477 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3478 for these components or the epilogue sequence. That is, it determines
3479 whether we should emit stores or loads and what kind of CFA notes to attach
3480 to the insns. Otherwise the logic for the two sequences is very
3481 similar. */
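/* For a separately shrink-wrapped x19 at frame offset 24 this emits, in
   a prologue block, "str x19, [sp, 24]" with a REG_CFA_OFFSET note and,
   in an epilogue block, "ldr x19, [sp, 24]" with a REG_CFA_RESTORE note;
   two eligible registers at adjacent offsets are merged into a single
   stp/ldp below.  (Illustrative sketch, not from the original sources.)  */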
3483 static void
3484 aarch64_process_components (sbitmap components, bool prologue_p)
3486 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3487 ? HARD_FRAME_POINTER_REGNUM
3488 : STACK_POINTER_REGNUM);
3490 unsigned last_regno = SBITMAP_SIZE (components);
3491 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3492 rtx_insn *insn = NULL;
3494 while (regno != last_regno)
3496 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
3497 so DFmode for the vector registers is enough. */
3498 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3499 rtx reg = gen_rtx_REG (mode, regno);
3500 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3501 if (!frame_pointer_needed)
3502 offset += cfun->machine->frame.frame_size
3503 - cfun->machine->frame.hard_fp_offset;
3504 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3505 rtx mem = gen_frame_mem (mode, addr);
3507 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3508 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3509 /* No more registers to handle after REGNO.
3510 Emit a single save/restore and exit. */
3511 if (regno2 == last_regno)
3513 insn = emit_insn (set);
3514 RTX_FRAME_RELATED_P (insn) = 1;
3515 if (prologue_p)
3516 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3517 else
3518 add_reg_note (insn, REG_CFA_RESTORE, reg);
3519 break;
3522 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3523 /* The next register is not of the same class or its offset is not
3524 mergeable with the current one into a pair. */
3525 if (!satisfies_constraint_Ump (mem)
3526 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3527 || (offset2 - cfun->machine->frame.reg_offset[regno])
3528 != GET_MODE_SIZE (mode))
3530 insn = emit_insn (set);
3531 RTX_FRAME_RELATED_P (insn) = 1;
3532 if (prologue_p)
3533 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3534 else
3535 add_reg_note (insn, REG_CFA_RESTORE, reg);
3537 regno = regno2;
3538 continue;
3541 /* REGNO2 can be saved/restored in a pair with REGNO. */
3542 rtx reg2 = gen_rtx_REG (mode, regno2);
3543 if (!frame_pointer_needed)
3544 offset2 += cfun->machine->frame.frame_size
3545 - cfun->machine->frame.hard_fp_offset;
3546 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3547 rtx mem2 = gen_frame_mem (mode, addr2);
3548 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3549 : gen_rtx_SET (reg2, mem2);
3551 if (prologue_p)
3552 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3553 else
3554 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3556 RTX_FRAME_RELATED_P (insn) = 1;
3557 if (prologue_p)
3559 add_reg_note (insn, REG_CFA_OFFSET, set);
3560 add_reg_note (insn, REG_CFA_OFFSET, set2);
3562 else
3564 add_reg_note (insn, REG_CFA_RESTORE, reg);
3565 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3568 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3572 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3574 static void
3575 aarch64_emit_prologue_components (sbitmap components)
3577 aarch64_process_components (components, true);
3580 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3582 static void
3583 aarch64_emit_epilogue_components (sbitmap components)
3585 aarch64_process_components (components, false);
3588 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3590 static void
3591 aarch64_set_handled_components (sbitmap components)
3593 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3594 if (bitmap_bit_p (components, regno))
3595 cfun->machine->reg_is_wrapped_separately[regno] = true;
3598 /* AArch64 stack frames generated by this compiler look like:
3600 +-------------------------------+
3602 | incoming stack arguments |
3604 +-------------------------------+
3605 | | <-- incoming stack pointer (aligned)
3606 | callee-allocated save area |
3607 | for register varargs |
3609 +-------------------------------+
3610 | local variables | <-- frame_pointer_rtx
3612 +-------------------------------+
3613 | padding0 | \
3614 +-------------------------------+ |
3615 | callee-saved registers | | frame.saved_regs_size
3616 +-------------------------------+ |
3617 | LR' | |
3618 +-------------------------------+ |
3619 | FP' | / <- hard_frame_pointer_rtx (aligned)
3620 +-------------------------------+
3621 | dynamic allocation |
3622 +-------------------------------+
3623 | padding |
3624 +-------------------------------+
3625 | outgoing stack arguments | <-- arg_pointer
3627 +-------------------------------+
3628 | | <-- stack_pointer_rtx (aligned)
3630 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3631 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3632 unchanged. */
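/* A worked example of the layout above (illustrative, not from the
   original sources): a leaf function that must save x19 and x20 and has
   16 bytes of locals and no outgoing arguments gets saved_regs_size = 16
   and hard_fp_offset = frame_size = 32.  Since 32 < 512, the "simple,
   small frame" case applies: callee_adjust = 32 with initial_adjust and
   final_adjust both zero, so the whole prologue is

       stp x19, x20, [sp, -32]!

   and the locals occupy roughly [sp, 16] .. [sp, 31].  */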
3634 /* Generate the prologue instructions for entry into a function.
3635 Establish the stack frame by decreasing the stack pointer with a
3636 properly calculated size and, if necessary, create a frame record
3637 filled with the values of LR and previous frame pointer. The
3638 current FP is also set up if it is in use. */
3640 void
3641 aarch64_expand_prologue (void)
3643 aarch64_layout_frame ();
3645 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3646 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3647 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3648 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3649 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3650 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3651 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3652 rtx_insn *insn;
3654 /* Sign return address for functions. */
3655 if (aarch64_return_address_signing_enabled ())
3657 insn = emit_insn (gen_pacisp ());
3658 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3659 RTX_FRAME_RELATED_P (insn) = 1;
3662 if (flag_stack_usage_info)
3663 current_function_static_stack_size = frame_size;
3665 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3667 if (crtl->is_leaf && !cfun->calls_alloca)
3669 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3670 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3671 frame_size - STACK_CHECK_PROTECT);
3673 else if (frame_size > 0)
3674 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3677 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3679 if (callee_adjust != 0)
3680 aarch64_push_regs (reg1, reg2, callee_adjust);
3682 if (frame_pointer_needed)
3684 if (callee_adjust == 0)
3685 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3686 R30_REGNUM, false);
3687 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3688 stack_pointer_rtx,
3689 GEN_INT (callee_offset)));
3690 RTX_FRAME_RELATED_P (insn) = 1;
3691 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3694 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3695 callee_adjust != 0 || frame_pointer_needed);
3696 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3697 callee_adjust != 0 || frame_pointer_needed);
3698 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3701 /* Return TRUE if we can use a simple_return insn.
3703 This function checks whether the callee saved stack is empty, which
3704 means no restore actions are needed. The pro_and_epilogue pass uses
3705 this to check whether the shrink-wrapping optimization is feasible. */
3707 bool
3708 aarch64_use_return_insn_p (void)
3710 if (!reload_completed)
3711 return false;
3713 if (crtl->profile)
3714 return false;
3716 aarch64_layout_frame ();
3718 return cfun->machine->frame.frame_size == 0;
3721 /* Generate the epilogue instructions for returning from a function.
3722 This is almost exactly the reverse of the prolog sequence, except
3723 that we need to insert barriers to avoid scheduling loads that read
3724 from a deallocated stack, and we optimize the unwind records by
3725 emitting them all together if possible. */
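/* For the small-frame prologue sketched earlier (callee_adjust of 32 and
   no final adjustment) the matching epilogue is simply, as an
   illustration:

       ldp x19, x20, [sp], 32
       ret

   with the CFA note attached to the writeback load.  */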
3726 void
3727 aarch64_expand_epilogue (bool for_sibcall)
3729 aarch64_layout_frame ();
3731 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3732 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3733 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3734 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3735 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3736 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3737 rtx cfi_ops = NULL;
3738 rtx_insn *insn;
3740 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3741 bool need_barrier_p = (get_frame_size ()
3742 + cfun->machine->frame.saved_varargs_size) != 0;
3744 /* Emit a barrier to prevent loads from a deallocated stack. */
3745 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3746 || crtl->calls_eh_return)
3748 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3749 need_barrier_p = false;
3752 /* Restore the stack pointer from the frame pointer if it may not
3753 be the same as the stack pointer. */
3754 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3756 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3757 hard_frame_pointer_rtx,
3758 GEN_INT (-callee_offset)));
3759 /* If writeback is used when restoring callee-saves, the CFA
3760 is restored on the instruction doing the writeback. */
3761 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3763 else
3764 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3766 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3767 callee_adjust != 0, &cfi_ops);
3768 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3769 callee_adjust != 0, &cfi_ops);
3771 if (need_barrier_p)
3772 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3774 if (callee_adjust != 0)
3775 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3777 if (callee_adjust != 0 || initial_adjust > 65536)
3779 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3780 insn = get_last_insn ();
3781 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3782 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3783 RTX_FRAME_RELATED_P (insn) = 1;
3784 cfi_ops = NULL;
3787 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3789 if (cfi_ops)
3791 /* Emit delayed restores and reset the CFA to be SP. */
3792 insn = get_last_insn ();
3793 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3794 REG_NOTES (insn) = cfi_ops;
3795 RTX_FRAME_RELATED_P (insn) = 1;
3798 /* We prefer to emit the combined return/authenticate instruction RETAA,
3799 however there are three cases in which we must instead emit an explicit
3800 authentication instruction.
3802 1) Sibcalls don't return in a normal way, so if we're about to call one
3803 we must authenticate.
3805 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3806 generating code for !TARGET_ARMV8_3 we can't use it and must
3807 explicitly authenticate.
3809 3) On an eh_return path we make extra stack adjustments to update the
3810 canonical frame address to be the exception handler's CFA. We want
3811 to authenticate using the CFA of the function which calls eh_return.
3812 */
3813 if (aarch64_return_address_signing_enabled ()
3814 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3816 insn = emit_insn (gen_autisp ());
3817 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3818 RTX_FRAME_RELATED_P (insn) = 1;
3821 /* Stack adjustment for exception handler. */
3822 if (crtl->calls_eh_return)
3824 /* We need to unwind the stack by the offset computed by
3825 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3826 to be SP; letting the CFA move during this adjustment
3827 is just as correct as retaining the CFA from the body
3828 of the function. Therefore, do nothing special. */
3829 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3832 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3833 if (!for_sibcall)
3834 emit_jump_insn (ret_rtx);
3837 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3838 normally or return to a previous frame after unwinding.
3840 An EH return uses a single shared return sequence. The epilogue is
3841 exactly like a normal epilogue except that it has an extra input
3842 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3843 that must be applied after the frame has been destroyed. An extra label
3844 is inserted before the epilogue which initializes this register to zero,
3845 and this is the entry point for a normal return.
3847 An actual EH return updates the return address, initializes the stack
3848 adjustment and jumps directly into the epilogue (bypassing the zeroing
3849 of the adjustment). Since the return address is typically saved on the
3850 stack when a function makes a call, the saved LR must be updated outside
3851 the epilogue.
3853 This poses problems as the store is generated well before the epilogue,
3854 so the offset of LR is not known yet. Also optimizations will remove the
3855 store as it appears dead, even after the epilogue is generated (as the
3856 base or offset for loading LR is different in many cases).
3858 To avoid these problems this implementation forces the frame pointer
3859 in eh_return functions so that the location of LR is fixed and known early.
3860 It also marks the store volatile, so no optimization is permitted to
3861 remove the store. */
3863 aarch64_eh_return_handler_rtx (void)
3865 rtx tmp = gen_frame_mem (Pmode,
3866 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3868 /* Mark the store volatile, so no optimization is permitted to remove it. */
3869 MEM_VOLATILE_P (tmp) = true;
3870 return tmp;
3873 /* Output code to add DELTA to the first argument, and then jump
3874 to FUNCTION. Used for C++ multiple inheritance. */
3875 static void
3876 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3877 HOST_WIDE_INT delta,
3878 HOST_WIDE_INT vcall_offset,
3879 tree function)
3881 /* The this pointer is always in x0. Note that this differs from
3882 Arm, where the this pointer may be bumped to r1 if r0 is required
3883 to return a pointer to an aggregate. On AArch64 a result value
3884 pointer will be in x8. */
3885 int this_regno = R0_REGNUM;
3886 rtx this_rtx, temp0, temp1, addr, funexp;
3887 rtx_insn *insn;
3889 reload_completed = 1;
3890 emit_note (NOTE_INSN_PROLOGUE_END);
3892 if (vcall_offset == 0)
3893 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3894 else
3896 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3898 this_rtx = gen_rtx_REG (Pmode, this_regno);
3899 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3900 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3902 addr = this_rtx;
3903 if (delta != 0)
3905 if (delta >= -256 && delta < 256)
3906 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3907 plus_constant (Pmode, this_rtx, delta));
3908 else
3909 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3912 if (Pmode == ptr_mode)
3913 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3914 else
3915 aarch64_emit_move (temp0,
3916 gen_rtx_ZERO_EXTEND (Pmode,
3917 gen_rtx_MEM (ptr_mode, addr)));
3919 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3920 addr = plus_constant (Pmode, temp0, vcall_offset);
3921 else
3923 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3924 Pmode);
3925 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3928 if (Pmode == ptr_mode)
3929 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3930 else
3931 aarch64_emit_move (temp1,
3932 gen_rtx_SIGN_EXTEND (Pmode,
3933 gen_rtx_MEM (ptr_mode, addr)));
3935 emit_insn (gen_add2_insn (this_rtx, temp1));
3938 /* Generate a tail call to the target function. */
3939 if (!TREE_USED (function))
3941 assemble_external (function);
3942 TREE_USED (function) = 1;
3944 funexp = XEXP (DECL_RTL (function), 0);
3945 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3946 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3947 SIBLING_CALL_P (insn) = 1;
3949 insn = get_insns ();
3950 shorten_branches (insn);
3951 final_start_function (insn, file, 1);
3952 final (insn, file, 1);
3953 final_end_function ();
3955 /* Stop pretending to be a post-reload pass. */
3956 reload_completed = 0;
3959 static bool
3960 aarch64_tls_referenced_p (rtx x)
3962 if (!TARGET_HAVE_TLS)
3963 return false;
3964 subrtx_iterator::array_type array;
3965 FOR_EACH_SUBRTX (iter, array, x, ALL)
3967 const_rtx x = *iter;
3968 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3969 return true;
3970 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3971 TLS offsets, not real symbol references. */
3972 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3973 iter.skip_subrtxes ();
3975 return false;
3979 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3980 a left shift of 0 or 12 bits. */
3981 bool
3982 aarch64_uimm12_shift (HOST_WIDE_INT val)
3984 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3985 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
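/* For example, 0xabc and 0xabc000 are both accepted (shift 0 and shift 12
   respectively), whereas 0xabc00 is rejected because its set bits straddle
   the two 12-bit fields and would need two ADD/SUB instructions.  */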
3990 /* Return true if val is an immediate that can be loaded into a
3991 register by a MOVZ instruction. */
3992 static bool
3993 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
3995 if (GET_MODE_SIZE (mode) > 4)
3997 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3998 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3999 return 1;
4001 else
4003 /* Ignore sign extension. */
4004 val &= (HOST_WIDE_INT) 0xffffffff;
4006 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4007 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
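/* For example, 0xabcd0000 (a 16-bit chunk at bit 16) and, in DImode,
   0xabcd00000000 (a chunk at bit 32) are single-MOVZ immediates, whereas
   0x12345678 spans two 16-bit chunks, needs a MOVZ/MOVK pair and is
   therefore rejected here (MOVN candidates are handled by the caller via
   the inverted value).  */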
4010 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4012 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4014 0x0000000100000001ull,
4015 0x0001000100010001ull,
4016 0x0101010101010101ull,
4017 0x1111111111111111ull,
4018 0x5555555555555555ull,
4022 /* Return true if val is a valid bitmask immediate. */
4024 bool
4025 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4027 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4028 int bits;
4030 /* Check for a single sequence of one bits and return quickly if so.
4031 The special cases of all ones and all zeroes return false. */
4032 val = (unsigned HOST_WIDE_INT) val_in;
4033 tmp = val + (val & -val);
4035 if (tmp == (tmp & -tmp))
4036 return (val + 1) > 1;
4038 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4039 if (mode == SImode)
4040 val = (val << 32) | (val & 0xffffffff);
4042 /* Invert if the immediate doesn't start with a zero bit; this means we
4043 only need to search for sequences of one bits. */
4044 if (val & 1)
4045 val = ~val;
4047 /* Find the first set bit and set tmp to val with the first sequence of one
4048 bits removed. Return success if there is a single sequence of ones. */
4049 first_one = val & -val;
4050 tmp = val & (val + first_one);
4052 if (tmp == 0)
4053 return true;
4055 /* Find the next set bit and compute the difference in bit position. */
4056 next_one = tmp & -tmp;
4057 bits = clz_hwi (first_one) - clz_hwi (next_one);
4058 mask = val ^ tmp;
4060 /* Check the bit position difference is a power of 2, and that the first
4061 sequence of one bits fits within 'bits' bits. */
4062 if ((mask >> bits) != 0 || bits != (bits & -bits))
4063 return false;
4065 /* Check the sequence of one bits is repeated 64/bits times. */
4066 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
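/* As an illustration of the algorithm above, consider val_in == 0x00ff00ff00ff00ff
   in DImode: the quick single-run test fails, the value is inverted to
   0xff00ff00ff00ff00, first_one == 1 << 8 and next_one == 1 << 24, so
   bits == 16 and mask == 0xff00; 0xff00 * bitmask_imm_mul[1] reproduces the
   inverted value, so the immediate is accepted.

   The brute-force cross-check below is a hypothetical, standalone sketch in
   plain C; it is not part of GCC and its names are made up.  It enumerates
   every AArch64 bitmask immediate directly (a run of 1 to ESIZE - 1
   consecutive ones, rotated within an element of 2, 4, 8, 16, 32 or 64 bits
   and replicated to fill 64 bits) instead of using the bitmask_imm_mul
   multipliers, and should agree with aarch64_bitmask_imm on DImode values.  */

#include <stdint.h>

static uint64_t
rot_right (uint64_t x, unsigned int r, unsigned int esize)
{
  uint64_t mask = (esize == 64) ? ~UINT64_C (0) : (UINT64_C (1) << esize) - 1;
  x &= mask;
  return r == 0 ? x : ((x >> r) | (x << (esize - r))) & mask;
}

static int
naive_bitmask_imm_p (uint64_t val)
{
  for (unsigned int esize = 2; esize <= 64; esize *= 2)
    for (unsigned int ones = 1; ones < esize; ones++)
      for (unsigned int rot = 0; rot < esize; rot++)
        {
          uint64_t elt = rot_right ((UINT64_C (1) << ones) - 1, rot, esize);
          /* Replicate the element across all 64 bits and compare.  */
          uint64_t pattern = 0;
          for (unsigned int i = 0; i < 64; i += esize)
            pattern |= elt << i;
          if (pattern == val)
            return 1;
        }
  return 0;
}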
4069 /* Create a mask of ones covering the lowest to the highest bit set in VAL_IN.
4070 Assumed precondition: VAL_IN is not zero. */
4072 unsigned HOST_WIDE_INT
4073 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4075 int lowest_bit_set = ctz_hwi (val_in);
4076 int highest_bit_set = floor_log2 (val_in);
4077 gcc_assert (val_in != 0);
4079 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4080 (HOST_WIDE_INT_1U << lowest_bit_set));
4083 /* Create a constant in which all bits outside the range from the lowest set
4084 bit to the highest set bit of VAL_IN are set to 1. */
4086 unsigned HOST_WIDE_INT
4087 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4089 return val_in | ~aarch64_and_split_imm1 (val_in);
4092 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4094 bool
4095 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4097 scalar_int_mode int_mode;
4098 if (!is_a <scalar_int_mode> (mode, &int_mode))
4099 return false;
4101 if (aarch64_bitmask_imm (val_in, int_mode))
4102 return false;
4104 if (aarch64_move_imm (val_in, int_mode))
4105 return false;
4107 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4109 return aarch64_bitmask_imm (imm2, int_mode);
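/* As a worked example of the split above: 0x00ff00000000ff00 is neither a
   bitmask immediate (its two 8-bit runs are 40 bits apart) nor a MOV
   immediate, so an AND with it can be rewritten as two ANDs with the bitmask
   immediates imm1 == 0x00ffffffffffff00 (a run of 48 ones) and
   imm2 == 0xffff00000000ffff (a rotated run of 32 ones), whose intersection
   is the original mask.  The standalone program below is a hypothetical
   sketch, not part of GCC; it just reproduces that arithmetic with plain
   64-bit integers and GCC's count-leading/trailing-zero builtins.  */

#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  uint64_t val = UINT64_C (0x00ff00000000ff00);
  int low = __builtin_ctzll (val);		/* Lowest set bit: 8.  */
  int high = 63 - __builtin_clzll (val);	/* Highest set bit: 55.  */

  /* Ones covering the lowest to the highest set bit, as in
     aarch64_and_split_imm1.  */
  uint64_t imm1 = (UINT64_C (2) << high) - (UINT64_C (1) << low);
  /* Ones everywhere outside that range, plus VAL's own bits, as in
     aarch64_and_split_imm2.  */
  uint64_t imm2 = val | ~imm1;

  printf ("imm1 = %016" PRIx64 "\n", imm1);
  printf ("imm2 = %016" PRIx64 "\n", imm2);
  printf ("imm1 & imm2 == val: %d\n", (imm1 & imm2) == val);
  return 0;
}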
4112 /* Return true if val is an immediate that can be loaded into a
4113 register in a single instruction. */
4114 bool
4115 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4117 scalar_int_mode int_mode;
4118 if (!is_a <scalar_int_mode> (mode, &int_mode))
4119 return false;
4121 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4122 return 1;
4123 return aarch64_bitmask_imm (val, int_mode);
4126 static bool
4127 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4129 rtx base, offset;
4131 if (GET_CODE (x) == HIGH)
4132 return true;
4134 split_const (x, &base, &offset);
4135 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4137 if (aarch64_classify_symbol (base, offset)
4138 != SYMBOL_FORCE_TO_MEM)
4139 return true;
4140 else
4141 /* Avoid generating a 64-bit relocation in ILP32; leave
4142 to aarch64_expand_mov_immediate to handle it properly. */
4143 return mode != ptr_mode;
4146 return aarch64_tls_referenced_p (x);
4149 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4150 The expansion for a table switch is quite expensive due to the number
4151 of instructions, the table lookup and the hard-to-predict indirect jump.
4152 When optimizing for speed with -O3 enabled, use the per-core tuning if
4153 set, otherwise use tables for > 16 cases as a tradeoff between size and
4154 performance. When optimizing for size, use the default setting. */
4156 static unsigned int
4157 aarch64_case_values_threshold (void)
4159 /* Use the specified limit for the number of cases before using jump
4160 tables at higher optimization levels. */
4161 if (optimize > 2
4162 && selected_cpu->tune->max_case_values != 0)
4163 return selected_cpu->tune->max_case_values;
4164 else
4165 return optimize_size ? default_case_values_threshold () : 17;
4168 /* Return true if register REGNO is a valid index register.
4169 STRICT_P is true if REG_OK_STRICT is in effect. */
4171 bool
4172 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4174 if (!HARD_REGISTER_NUM_P (regno))
4176 if (!strict_p)
4177 return true;
4179 if (!reg_renumber)
4180 return false;
4182 regno = reg_renumber[regno];
4184 return GP_REGNUM_P (regno);
4187 /* Return true if register REGNO is a valid base register.
4188 STRICT_P is true if REG_OK_STRICT is in effect. */
4190 bool
4191 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4193 if (!HARD_REGISTER_NUM_P (regno))
4195 if (!strict_p)
4196 return true;
4198 if (!reg_renumber)
4199 return false;
4201 regno = reg_renumber[regno];
4204 /* The fake registers will be eliminated to either the stack or
4205 hard frame pointer, both of which are usually valid base registers.
4206 Reload deals with the cases where the eliminated form isn't valid. */
4207 return (GP_REGNUM_P (regno)
4208 || regno == SP_REGNUM
4209 || regno == FRAME_POINTER_REGNUM
4210 || regno == ARG_POINTER_REGNUM);
4213 /* Return true if X is a valid base register rtx.
4214 STRICT_P is true if REG_OK_STRICT is in effect. */
4216 static bool
4217 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4219 if (!strict_p
4220 && GET_CODE (x) == SUBREG
4221 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4222 x = SUBREG_REG (x);
4224 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4227 /* Return true if address offset is a valid index. If it is, fill in INFO
4228 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4230 static bool
4231 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4232 machine_mode mode, bool strict_p)
4234 enum aarch64_address_type type;
4235 rtx index;
4236 int shift;
4238 /* (reg:P) */
4239 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4240 && GET_MODE (x) == Pmode)
4242 type = ADDRESS_REG_REG;
4243 index = x;
4244 shift = 0;
4246 /* (sign_extend:DI (reg:SI)) */
4247 else if ((GET_CODE (x) == SIGN_EXTEND
4248 || GET_CODE (x) == ZERO_EXTEND)
4249 && GET_MODE (x) == DImode
4250 && GET_MODE (XEXP (x, 0)) == SImode)
4252 type = (GET_CODE (x) == SIGN_EXTEND)
4253 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4254 index = XEXP (x, 0);
4255 shift = 0;
4257 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4258 else if (GET_CODE (x) == MULT
4259 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4260 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4261 && GET_MODE (XEXP (x, 0)) == DImode
4262 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4263 && CONST_INT_P (XEXP (x, 1)))
4265 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4266 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4267 index = XEXP (XEXP (x, 0), 0);
4268 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4270 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4271 else if (GET_CODE (x) == ASHIFT
4272 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4273 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4274 && GET_MODE (XEXP (x, 0)) == DImode
4275 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4276 && CONST_INT_P (XEXP (x, 1)))
4278 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4279 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280 index = XEXP (XEXP (x, 0), 0);
4281 shift = INTVAL (XEXP (x, 1));
4283 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4284 else if ((GET_CODE (x) == SIGN_EXTRACT
4285 || GET_CODE (x) == ZERO_EXTRACT)
4286 && GET_MODE (x) == DImode
4287 && GET_CODE (XEXP (x, 0)) == MULT
4288 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4289 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4291 type = (GET_CODE (x) == SIGN_EXTRACT)
4292 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4293 index = XEXP (XEXP (x, 0), 0);
4294 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4295 if (INTVAL (XEXP (x, 1)) != 32 + shift
4296 || INTVAL (XEXP (x, 2)) != 0)
4297 shift = -1;
4299 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4300 (const_int 0xffffffff<<shift)) */
4301 else if (GET_CODE (x) == AND
4302 && GET_MODE (x) == DImode
4303 && GET_CODE (XEXP (x, 0)) == MULT
4304 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4305 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4306 && CONST_INT_P (XEXP (x, 1)))
4308 type = ADDRESS_REG_UXTW;
4309 index = XEXP (XEXP (x, 0), 0);
4310 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4311 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4312 shift = -1;
4314 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4315 else if ((GET_CODE (x) == SIGN_EXTRACT
4316 || GET_CODE (x) == ZERO_EXTRACT)
4317 && GET_MODE (x) == DImode
4318 && GET_CODE (XEXP (x, 0)) == ASHIFT
4319 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4320 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4322 type = (GET_CODE (x) == SIGN_EXTRACT)
4323 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4324 index = XEXP (XEXP (x, 0), 0);
4325 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4326 if (INTVAL (XEXP (x, 1)) != 32 + shift
4327 || INTVAL (XEXP (x, 2)) != 0)
4328 shift = -1;
4330 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4331 (const_int 0xffffffff<<shift)) */
4332 else if (GET_CODE (x) == AND
4333 && GET_MODE (x) == DImode
4334 && GET_CODE (XEXP (x, 0)) == ASHIFT
4335 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4336 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4337 && CONST_INT_P (XEXP (x, 1)))
4339 type = ADDRESS_REG_UXTW;
4340 index = XEXP (XEXP (x, 0), 0);
4341 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4342 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4343 shift = -1;
4345 /* (mult:P (reg:P) (const_int scale)) */
4346 else if (GET_CODE (x) == MULT
4347 && GET_MODE (x) == Pmode
4348 && GET_MODE (XEXP (x, 0)) == Pmode
4349 && CONST_INT_P (XEXP (x, 1)))
4351 type = ADDRESS_REG_REG;
4352 index = XEXP (x, 0);
4353 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4355 /* (ashift:P (reg:P) (const_int shift)) */
4356 else if (GET_CODE (x) == ASHIFT
4357 && GET_MODE (x) == Pmode
4358 && GET_MODE (XEXP (x, 0)) == Pmode
4359 && CONST_INT_P (XEXP (x, 1)))
4361 type = ADDRESS_REG_REG;
4362 index = XEXP (x, 0);
4363 shift = INTVAL (XEXP (x, 1));
4365 else
4366 return false;
4368 if (!strict_p
4369 && GET_CODE (index) == SUBREG
4370 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4371 index = SUBREG_REG (index);
4373 if ((shift == 0 ||
4374 (shift > 0 && shift <= 3
4375 && (1 << shift) == GET_MODE_SIZE (mode)))
4376 && REG_P (index)
4377 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4379 info->type = type;
4380 info->offset = index;
4381 info->shift = shift;
4382 return true;
4385 return false;
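/* For example, the SImode address [x0, w1, sxtw #2] is represented as
   (plus:DI (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 4)) (reg:DI x0));
   aarch64_classify_address passes the MULT part to this function, which
   returns ADDRESS_REG_SXTW with a shift of 2 because 1 << 2 matches the
   4-byte access size.  */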
4388 /* Return true if MODE is one of the modes for which we
4389 support LDP/STP operations. */
4391 static bool
4392 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4394 return mode == SImode || mode == DImode
4395 || mode == SFmode || mode == DFmode
4396 || (aarch64_vector_mode_supported_p (mode)
4397 && GET_MODE_SIZE (mode) == 8);
4400 /* Return true if REGNO is a virtual pointer register, or an eliminable
4401 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4402 include stack_pointer or hard_frame_pointer. */
4403 static bool
4404 virt_or_elim_regno_p (unsigned regno)
4406 return ((regno >= FIRST_VIRTUAL_REGISTER
4407 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4408 || regno == FRAME_POINTER_REGNUM
4409 || regno == ARG_POINTER_REGNUM);
4412 /* Return true if X is a valid address for machine mode MODE. If it is,
4413 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4414 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4416 static bool
4417 aarch64_classify_address (struct aarch64_address_info *info,
4418 rtx x, machine_mode mode,
4419 RTX_CODE outer_code, bool strict_p)
4421 enum rtx_code code = GET_CODE (x);
4422 rtx op0, op1;
4424 /* On BE, we use load/store pair for all large int mode load/stores.
4425 TI/TFmode may also use a load/store pair. */
4426 bool load_store_pair_p = (outer_code == PARALLEL
4427 || mode == TImode
4428 || mode == TFmode
4429 || (BYTES_BIG_ENDIAN
4430 && aarch64_vect_struct_mode_p (mode)));
4432 bool allow_reg_index_p =
4433 !load_store_pair_p
4434 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4435 && !aarch64_vect_struct_mode_p (mode);
4437 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4438 REG addressing. */
4439 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4440 && (code != POST_INC && code != REG))
4441 return false;
4443 switch (code)
4445 case REG:
4446 case SUBREG:
4447 info->type = ADDRESS_REG_IMM;
4448 info->base = x;
4449 info->offset = const0_rtx;
4450 return aarch64_base_register_rtx_p (x, strict_p);
4452 case PLUS:
4453 op0 = XEXP (x, 0);
4454 op1 = XEXP (x, 1);
4456 if (! strict_p
4457 && REG_P (op0)
4458 && virt_or_elim_regno_p (REGNO (op0))
4459 && CONST_INT_P (op1))
4461 info->type = ADDRESS_REG_IMM;
4462 info->base = op0;
4463 info->offset = op1;
4465 return true;
4468 if (GET_MODE_SIZE (mode) != 0
4469 && CONST_INT_P (op1)
4470 && aarch64_base_register_rtx_p (op0, strict_p))
4472 HOST_WIDE_INT offset = INTVAL (op1);
4474 info->type = ADDRESS_REG_IMM;
4475 info->base = op0;
4476 info->offset = op1;
4478 /* TImode and TFmode values are allowed in both pairs of X
4479 registers and individual Q registers. The available
4480 address modes are:
4481 X,X: 7-bit signed scaled offset
4482 Q: 9-bit signed offset
4483 We conservatively require an offset representable in either mode.
4484 When performing the check for pairs of X registers i.e. LDP/STP
4485 pass down DImode since that is the natural size of the LDP/STP
4486 instruction memory accesses. */
4487 if (mode == TImode || mode == TFmode)
4488 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4489 && (offset_9bit_signed_unscaled_p (mode, offset)
4490 || offset_12bit_unsigned_scaled_p (mode, offset)));
4492 /* A 7-bit offset check because OImode will emit an ldp/stp
4493 instruction (only big-endian will get here).
4494 For ldp/stp instructions, the offset is scaled for the size of a
4495 single element of the pair. */
4496 if (mode == OImode)
4497 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4499 /* Three 9/12-bit offset checks because CImode will emit three
4500 ldr/str instructions (only big-endian will get here). */
4501 if (mode == CImode)
4502 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4503 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4504 || offset_12bit_unsigned_scaled_p (V16QImode,
4505 offset + 32)));
4507 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4508 instructions (only big-endian will get here). */
4509 if (mode == XImode)
4510 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4511 && aarch64_offset_7bit_signed_scaled_p (TImode,
4512 offset + 32));
4514 if (load_store_pair_p)
4515 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4516 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4517 else
4518 return (offset_9bit_signed_unscaled_p (mode, offset)
4519 || offset_12bit_unsigned_scaled_p (mode, offset));
4522 if (allow_reg_index_p)
4524 /* Look for base + (scaled/extended) index register. */
4525 if (aarch64_base_register_rtx_p (op0, strict_p)
4526 && aarch64_classify_index (info, op1, mode, strict_p))
4528 info->base = op0;
4529 return true;
4531 if (aarch64_base_register_rtx_p (op1, strict_p)
4532 && aarch64_classify_index (info, op0, mode, strict_p))
4534 info->base = op1;
4535 return true;
4539 return false;
4541 case POST_INC:
4542 case POST_DEC:
4543 case PRE_INC:
4544 case PRE_DEC:
4545 info->type = ADDRESS_REG_WB;
4546 info->base = XEXP (x, 0);
4547 info->offset = NULL_RTX;
4548 return aarch64_base_register_rtx_p (info->base, strict_p);
4550 case POST_MODIFY:
4551 case PRE_MODIFY:
4552 info->type = ADDRESS_REG_WB;
4553 info->base = XEXP (x, 0);
4554 if (GET_CODE (XEXP (x, 1)) == PLUS
4555 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4556 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4557 && aarch64_base_register_rtx_p (info->base, strict_p))
4559 HOST_WIDE_INT offset;
4560 info->offset = XEXP (XEXP (x, 1), 1);
4561 offset = INTVAL (info->offset);
4563 /* TImode and TFmode values are allowed in both pairs of X
4564 registers and individual Q registers. The available
4565 address modes are:
4566 X,X: 7-bit signed scaled offset
4567 Q: 9-bit signed offset
4568 We conservatively require an offset representable in either mode. */
4570 if (mode == TImode || mode == TFmode)
4571 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4572 && offset_9bit_signed_unscaled_p (mode, offset));
4574 if (load_store_pair_p)
4575 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4576 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4577 else
4578 return offset_9bit_signed_unscaled_p (mode, offset);
4580 return false;
4582 case CONST:
4583 case SYMBOL_REF:
4584 case LABEL_REF:
4585 /* Load literal: a pc-relative constant pool entry. Only supported
4586 for SImode or larger. */
4587 info->type = ADDRESS_SYMBOLIC;
4589 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4591 rtx sym, addend;
4593 split_const (x, &sym, &addend);
4594 return ((GET_CODE (sym) == LABEL_REF
4595 || (GET_CODE (sym) == SYMBOL_REF
4596 && CONSTANT_POOL_ADDRESS_P (sym)
4597 && aarch64_pcrelative_literal_loads)));
4599 return false;
4601 case LO_SUM:
4602 info->type = ADDRESS_LO_SUM;
4603 info->base = XEXP (x, 0);
4604 info->offset = XEXP (x, 1);
4605 if (allow_reg_index_p
4606 && aarch64_base_register_rtx_p (info->base, strict_p))
4608 rtx sym, offs;
4609 split_const (info->offset, &sym, &offs);
4610 if (GET_CODE (sym) == SYMBOL_REF
4611 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4613 /* The symbol and offset must be aligned to the access size. */
4614 unsigned int align;
4615 unsigned int ref_size;
4617 if (CONSTANT_POOL_ADDRESS_P (sym))
4618 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4619 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4621 tree exp = SYMBOL_REF_DECL (sym);
4622 align = TYPE_ALIGN (TREE_TYPE (exp));
4623 align = CONSTANT_ALIGNMENT (exp, align);
4625 else if (SYMBOL_REF_DECL (sym))
4626 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4627 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4628 && SYMBOL_REF_BLOCK (sym) != NULL)
4629 align = SYMBOL_REF_BLOCK (sym)->alignment;
4630 else
4631 align = BITS_PER_UNIT;
4633 ref_size = GET_MODE_SIZE (mode);
4634 if (ref_size == 0)
4635 ref_size = GET_MODE_SIZE (DImode);
4637 return ((INTVAL (offs) & (ref_size - 1)) == 0
4638 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4641 return false;
4643 default:
4644 return false;
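/* As a concrete example of the offset checks above, a DImode access accepts
   an unsigned scaled offset of 0..32760 (4095 * 8, a multiple of 8), a
   signed unscaled offset of -256..255, and, for an LDP/STP pair, a signed
   scaled offset of -512..504 (a 7-bit signed quantity scaled by 8).  */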
4648 /* Return true if the address X is valid for a PRFM instruction.
4649 STRICT_P is true if we should do strict checking with
4650 aarch64_classify_address. */
4652 bool
4653 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4655 struct aarch64_address_info addr;
4657 /* PRFM accepts the same addresses as DImode... */
4658 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4659 if (!res)
4660 return false;
4662 /* ... except writeback forms. */
4663 return addr.type != ADDRESS_REG_WB;
4666 bool
4667 aarch64_symbolic_address_p (rtx x)
4669 rtx offset;
4671 split_const (x, &x, &offset);
4672 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4675 /* Classify the base of symbolic expression X. */
4677 enum aarch64_symbol_type
4678 aarch64_classify_symbolic_expression (rtx x)
4680 rtx offset;
4682 split_const (x, &x, &offset);
4683 return aarch64_classify_symbol (x, offset);
4687 /* Return TRUE if X is a legitimate address for accessing memory in
4688 mode MODE. */
4689 static bool
4690 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4692 struct aarch64_address_info addr;
4694 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4697 /* Return TRUE if X is a legitimate address for accessing memory in
4698 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4699 pair operation. */
4700 bool
4701 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4702 RTX_CODE outer_code, bool strict_p)
4704 struct aarch64_address_info addr;
4706 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4709 /* Split an out-of-range address displacement into a base and offset.
4710 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise to
4711 increase opportunities for sharing the base address across accesses of
4712 different sizes. For unaligned accesses and TI/TF mode use the signed 9-bit range.
4713 static bool
4714 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4716 HOST_WIDE_INT offset = INTVAL (*disp);
4717 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4719 if (mode == TImode || mode == TFmode
4720 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4721 base = (offset + 0x100) & ~0x1ff;
4723 *off = GEN_INT (base);
4724 *disp = GEN_INT (offset - base);
4725 return true;
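/* For example, a DImode access at offset 0x4008 is split into an anchor of
   0x4000 (0x4008 & ~0x3ffc) plus a residual offset of 8, which fits the
   scaled 12-bit form; nearby accesses can then share the 0x4000 anchor.  */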
4728 /* Return the binary representation of floating point constant VALUE in INTVAL.
4729 If the value cannot be converted, return false without setting INTVAL.
4730 The conversion is done in the given MODE. */
4731 bool
4732 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4735 /* We make a general exception for 0. */
4736 if (aarch64_float_const_zero_rtx_p (value))
4738 *intval = 0;
4739 return true;
4742 machine_mode mode = GET_MODE (value);
4743 if (GET_CODE (value) != CONST_DOUBLE
4744 || !SCALAR_FLOAT_MODE_P (mode)
4745 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4746 /* Only support up to DF mode. */
4747 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4748 return false;
4750 unsigned HOST_WIDE_INT ival = 0;
4752 long res[2];
4753 real_to_target (res,
4754 CONST_DOUBLE_REAL_VALUE (value),
4755 REAL_MODE_FORMAT (mode));
4757 if (mode == DFmode)
4759 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4760 ival = zext_hwi (res[order], 32);
4761 ival |= (zext_hwi (res[1 - order], 32) << 32);
4763 else
4764 ival = zext_hwi (res[0], 32);
4766 *intval = ival;
4767 return true;
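/* For example, the DFmode constant 1.0 has the IEEE bit pattern
   0x3ff0000000000000 and the SFmode constant 1.0 yields 0x3f800000; these
   are the values written to *INTVAL.  */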
4770 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4771 single MOV(+MOVK) followed by an FMOV. */
4772 bool
4773 aarch64_float_const_rtx_p (rtx x)
4775 machine_mode mode = GET_MODE (x);
4776 if (mode == VOIDmode)
4777 return false;
4779 /* Determine whether it's cheaper to write float constants as
4780 mov/movk pairs over ldr/adrp pairs. */
4781 unsigned HOST_WIDE_INT ival;
4783 if (GET_CODE (x) == CONST_DOUBLE
4784 && SCALAR_FLOAT_MODE_P (mode)
4785 && aarch64_reinterpret_float_as_int (x, &ival))
4787 scalar_int_mode imode = (mode == HFmode
4788 ? SImode
4789 : int_mode_for_mode (mode).require ());
4790 int num_instr = aarch64_internal_mov_immediate
4791 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4792 return num_instr < 3;
4795 return false;
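/* For example, the DFmode constant 1.0 (bit pattern 0x3ff0000000000000) can
   be built with a single MOVZ of 0x3ff0 shifted left by 48, so a MOV + FMOV
   sequence (two instructions) is preferred over an ADRP + LDR literal load
   and this function returns true.  */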
4798 /* Return TRUE if rtx X is the immediate constant 0.0. */
4799 bool
4800 aarch64_float_const_zero_rtx_p (rtx x)
4802 if (GET_MODE (x) == VOIDmode)
4803 return false;
4805 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4806 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4807 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4810 /* Return TRUE if rtx X is an immediate constant that fits in a single
4811 MOVI instruction. */
4812 bool
4813 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4815 if (!TARGET_SIMD)
4816 return false;
4818 machine_mode vmode;
4819 scalar_int_mode imode;
4820 unsigned HOST_WIDE_INT ival;
4822 if (GET_CODE (x) == CONST_DOUBLE
4823 && SCALAR_FLOAT_MODE_P (mode))
4825 if (!aarch64_reinterpret_float_as_int (x, &ival))
4826 return false;
4828 /* We make a general exception for 0. */
4829 if (aarch64_float_const_zero_rtx_p (x))
4830 return true;
4832 imode = int_mode_for_mode (mode).require ();
4834 else if (GET_CODE (x) == CONST_INT
4835 && is_a <scalar_int_mode> (mode, &imode))
4836 ival = INTVAL (x);
4837 else
4838 return false;
4840 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
4841 a 128-bit vector mode. */
4842 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4844 vmode = aarch64_simd_container_mode (imode, width);
4845 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4847 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4851 /* Return the fixed registers used for condition codes. */
4853 static bool
4854 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4856 *p1 = CC_REGNUM;
4857 *p2 = INVALID_REGNUM;
4858 return true;
4861 /* This function is used by the call expanders of the machine description.
4862 RESULT is the register in which the result is returned. It's NULL for
4863 "call" and "sibcall".
4864 MEM is the location of the function call.
4865 SIBCALL indicates whether this function call is a normal call or a sibling
4866 call; a different pattern is generated accordingly. */
4868 void
4869 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4871 rtx call, callee, tmp;
4872 rtvec vec;
4873 machine_mode mode;
4875 gcc_assert (MEM_P (mem));
4876 callee = XEXP (mem, 0);
4877 mode = GET_MODE (callee);
4878 gcc_assert (mode == Pmode);
4880 /* Decide if we should generate indirect calls by loading the
4881 address of the callee into a register before performing
4882 the branch-and-link. */
4883 if (SYMBOL_REF_P (callee)
4884 ? (aarch64_is_long_call_p (callee)
4885 || aarch64_is_noplt_call_p (callee))
4886 : !REG_P (callee))
4887 XEXP (mem, 0) = force_reg (mode, callee);
4889 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4891 if (result != NULL_RTX)
4892 call = gen_rtx_SET (result, call);
4894 if (sibcall)
4895 tmp = ret_rtx;
4896 else
4897 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4899 vec = gen_rtvec (2, call, tmp);
4900 call = gen_rtx_PARALLEL (VOIDmode, vec);
4902 aarch64_emit_call_insn (call);
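/* For example, a normal call through a register ends up as
     (parallel [(call (mem (reg x2)) (const_int 0))
                (clobber (reg x30))])
   whereas a sibling call replaces the clobber of the link register with
   (return), matching the two cases built above.  */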
4905 /* Emit call insn with PAT and do aarch64-specific handling. */
4907 void
4908 aarch64_emit_call_insn (rtx pat)
4910 rtx insn = emit_call_insn (pat);
4912 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4913 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4914 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4917 machine_mode
4918 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4920 /* All floating point compares return CCFP if it is an equality
4921 comparison, and CCFPE otherwise. */
4922 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4924 switch (code)
4926 case EQ:
4927 case NE:
4928 case UNORDERED:
4929 case ORDERED:
4930 case UNLT:
4931 case UNLE:
4932 case UNGT:
4933 case UNGE:
4934 case UNEQ:
4935 case LTGT:
4936 return CCFPmode;
4938 case LT:
4939 case LE:
4940 case GT:
4941 case GE:
4942 return CCFPEmode;
4944 default:
4945 gcc_unreachable ();
4949 /* Equality comparisons of short modes against zero can be performed
4950 using the TST instruction with the appropriate bitmask. */
4951 if (y == const0_rtx && REG_P (x)
4952 && (code == EQ || code == NE)
4953 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4954 return CC_NZmode;
4956 /* Similarly, comparisons of zero_extends from shorter modes can
4957 be performed using an ANDS with an immediate mask. */
4958 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4959 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4960 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4961 && (code == EQ || code == NE))
4962 return CC_NZmode;
4964 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4965 && y == const0_rtx
4966 && (code == EQ || code == NE || code == LT || code == GE)
4967 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4968 || GET_CODE (x) == NEG
4969 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4970 && CONST_INT_P (XEXP (x, 2)))))
4971 return CC_NZmode;
4973 /* A compare with a shifted operand. Because of canonicalization,
4974 the comparison will have to be swapped when we emit the assembly
4975 code. */
4976 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4977 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4978 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4979 || GET_CODE (x) == LSHIFTRT
4980 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4981 return CC_SWPmode;
4983 /* Similarly for a negated operand, but we can only do this for
4984 equalities. */
4985 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4986 && (REG_P (y) || GET_CODE (y) == SUBREG)
4987 && (code == EQ || code == NE)
4988 && GET_CODE (x) == NEG)
4989 return CC_Zmode;
4991 /* A test for unsigned overflow. */
4992 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4993 && code == NE
4994 && GET_CODE (x) == PLUS
4995 && GET_CODE (y) == ZERO_EXTEND)
4996 return CC_Cmode;
4998 /* For everything else, return CCmode. */
4999 return CCmode;
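/* For example, (compare (ashift:DI x 2) y) selects CC_SWPmode: the shifted
   register can only appear as the second operand of CMP, so the operands are
   swapped when the instruction is emitted and the condition code must be
   swapped to match (see aarch64_get_condition_code_1 below).  */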
5002 static int
5003 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5006 aarch64_get_condition_code (rtx x)
5008 machine_mode mode = GET_MODE (XEXP (x, 0));
5009 enum rtx_code comp_code = GET_CODE (x);
5011 if (GET_MODE_CLASS (mode) != MODE_CC)
5012 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5013 return aarch64_get_condition_code_1 (mode, comp_code);
5016 static int
5017 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5019 switch (mode)
5021 case E_CCFPmode:
5022 case E_CCFPEmode:
5023 switch (comp_code)
5025 case GE: return AARCH64_GE;
5026 case GT: return AARCH64_GT;
5027 case LE: return AARCH64_LS;
5028 case LT: return AARCH64_MI;
5029 case NE: return AARCH64_NE;
5030 case EQ: return AARCH64_EQ;
5031 case ORDERED: return AARCH64_VC;
5032 case UNORDERED: return AARCH64_VS;
5033 case UNLT: return AARCH64_LT;
5034 case UNLE: return AARCH64_LE;
5035 case UNGT: return AARCH64_HI;
5036 case UNGE: return AARCH64_PL;
5037 default: return -1;
5039 break;
5041 case E_CCmode:
5042 switch (comp_code)
5044 case NE: return AARCH64_NE;
5045 case EQ: return AARCH64_EQ;
5046 case GE: return AARCH64_GE;
5047 case GT: return AARCH64_GT;
5048 case LE: return AARCH64_LE;
5049 case LT: return AARCH64_LT;
5050 case GEU: return AARCH64_CS;
5051 case GTU: return AARCH64_HI;
5052 case LEU: return AARCH64_LS;
5053 case LTU: return AARCH64_CC;
5054 default: return -1;
5056 break;
5058 case E_CC_SWPmode:
5059 switch (comp_code)
5061 case NE: return AARCH64_NE;
5062 case EQ: return AARCH64_EQ;
5063 case GE: return AARCH64_LE;
5064 case GT: return AARCH64_LT;
5065 case LE: return AARCH64_GE;
5066 case LT: return AARCH64_GT;
5067 case GEU: return AARCH64_LS;
5068 case GTU: return AARCH64_CC;
5069 case LEU: return AARCH64_CS;
5070 case LTU: return AARCH64_HI;
5071 default: return -1;
5073 break;
5075 case E_CC_NZmode:
5076 switch (comp_code)
5078 case NE: return AARCH64_NE;
5079 case EQ: return AARCH64_EQ;
5080 case GE: return AARCH64_PL;
5081 case LT: return AARCH64_MI;
5082 default: return -1;
5084 break;
5086 case E_CC_Zmode:
5087 switch (comp_code)
5089 case NE: return AARCH64_NE;
5090 case EQ: return AARCH64_EQ;
5091 default: return -1;
5093 break;
5095 case E_CC_Cmode:
5096 switch (comp_code)
5098 case NE: return AARCH64_CS;
5099 case EQ: return AARCH64_CC;
5100 default: return -1;
5102 break;
5104 default:
5105 return -1;
5108 return -1;
5111 bool
5112 aarch64_const_vec_all_same_in_range_p (rtx x,
5113 HOST_WIDE_INT minval,
5114 HOST_WIDE_INT maxval)
5116 HOST_WIDE_INT firstval;
5117 int count, i;
5119 if (GET_CODE (x) != CONST_VECTOR
5120 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5121 return false;
5123 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5124 if (firstval < minval || firstval > maxval)
5125 return false;
5127 count = CONST_VECTOR_NUNITS (x);
5128 for (i = 1; i < count; i++)
5129 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5130 return false;
5132 return true;
5135 bool
5136 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5138 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5142 /* N Z C V. */
5143 #define AARCH64_CC_V 1
5144 #define AARCH64_CC_C (1 << 1)
5145 #define AARCH64_CC_Z (1 << 2)
5146 #define AARCH64_CC_N (1 << 3)
5148 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5149 static const int aarch64_nzcv_codes[] =
5151 0, /* EQ, Z == 1. */
5152 AARCH64_CC_Z, /* NE, Z == 0. */
5153 0, /* CS, C == 1. */
5154 AARCH64_CC_C, /* CC, C == 0. */
5155 0, /* MI, N == 1. */
5156 AARCH64_CC_N, /* PL, N == 0. */
5157 0, /* VS, V == 1. */
5158 AARCH64_CC_V, /* VC, V == 0. */
5159 0, /* HI, C == 1 && Z == 0. */
5160 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5161 AARCH64_CC_V, /* GE, N == V. */
5162 0, /* LT, N != V. */
5163 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5164 0, /* LE, !(Z == 0 && N == V). */
5165 0, /* AL, Any. */
5166 0 /* NV, Any. */
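/* Each entry above gives a flag state under which the row's condition is
   false; this is the value a conditional compare writes when its own
   condition does not hold, so that the overall condition then reads as
   false.  For example, the GE entry sets only the V bit (N != V, so GE
   fails), while the LT entry clears all flags (N == V, so LT fails).  */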
5169 /* Print operand X to file F in a target-specific manner according to CODE.
5170 The acceptable formatting commands given by CODE are:
5171 'c': An integer or symbol address without a preceding #
5172 sign.
5173 'e': Print the sign/zero-extend size as a character 8->b,
5174 16->h, 32->w.
5175 'p': Prints N such that 2^N == X (X must be power of 2 and
5176 const int).
5177 'P': Print the number of non-zero bits in X (a const_int).
5178 'H': Print the higher numbered register of a pair (TImode)
5179 of regs.
5180 'm': Print a condition (eq, ne, etc).
5181 'M': Same as 'm', but invert condition.
5182 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5183 'S/T/U/V': Print a FP/SIMD register name for a register list.
5184 The register printed is the FP/SIMD register name
5185 of X + 0/1/2/3 for S/T/U/V.
5186 'R': Print a scalar FP/SIMD register name + 1.
5187 'X': Print bottom 16 bits of integer constant in hex.
5188 'w/x': Print a general register name or the zero register
5189 (32-bit or 64-bit).
5190 '0': Print a normal operand; if it's a general register,
5191 then we assume DImode.
5192 'k': Print NZCV for conditional compare instructions.
5193 'A': Output address constant representing the first
5194 argument of X, specifying a relocation offset
5195 if appropriate.
5196 'L': Output constant address specified by X
5197 with a relocation offset if appropriate.
5198 'G': Prints address of X, specifying a PC relative
5199 relocation mode if appropriate. */
5201 static void
5202 aarch64_print_operand (FILE *f, rtx x, int code)
5204 switch (code)
5206 case 'c':
5207 switch (GET_CODE (x))
5209 case CONST_INT:
5210 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5211 break;
5213 case SYMBOL_REF:
5214 output_addr_const (f, x);
5215 break;
5217 case CONST:
5218 if (GET_CODE (XEXP (x, 0)) == PLUS
5219 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5221 output_addr_const (f, x);
5222 break;
5224 /* Fall through. */
5226 default:
5227 output_operand_lossage ("Unsupported operand for code '%c'", code);
5229 break;
5231 case 'e':
5233 int n;
5235 if (!CONST_INT_P (x)
5236 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5238 output_operand_lossage ("invalid operand for '%%%c'", code);
5239 return;
5242 switch (n)
5244 case 3:
5245 fputc ('b', f);
5246 break;
5247 case 4:
5248 fputc ('h', f);
5249 break;
5250 case 5:
5251 fputc ('w', f);
5252 break;
5253 default:
5254 output_operand_lossage ("invalid operand for '%%%c'", code);
5255 return;
5258 break;
5260 case 'p':
5262 int n;
5264 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5266 output_operand_lossage ("invalid operand for '%%%c'", code);
5267 return;
5270 asm_fprintf (f, "%d", n);
5272 break;
5274 case 'P':
5275 if (!CONST_INT_P (x))
5277 output_operand_lossage ("invalid operand for '%%%c'", code);
5278 return;
5281 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5282 break;
5284 case 'H':
5285 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5287 output_operand_lossage ("invalid operand for '%%%c'", code);
5288 return;
5291 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5292 break;
5294 case 'M':
5295 case 'm':
5297 int cond_code;
5298 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5299 if (x == const_true_rtx)
5301 if (code == 'M')
5302 fputs ("nv", f);
5303 return;
5306 if (!COMPARISON_P (x))
5308 output_operand_lossage ("invalid operand for '%%%c'", code);
5309 return;
5312 cond_code = aarch64_get_condition_code (x);
5313 gcc_assert (cond_code >= 0);
5314 if (code == 'M')
5315 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5316 fputs (aarch64_condition_codes[cond_code], f);
5318 break;
5320 case 'b':
5321 case 'h':
5322 case 's':
5323 case 'd':
5324 case 'q':
5325 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5327 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5328 return;
5330 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5331 break;
5333 case 'S':
5334 case 'T':
5335 case 'U':
5336 case 'V':
5337 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5339 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5340 return;
5342 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5343 break;
5345 case 'R':
5346 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5348 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5349 return;
5351 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5352 break;
5354 case 'X':
5355 if (!CONST_INT_P (x))
5357 output_operand_lossage ("invalid operand for '%%%c'", code);
5358 return;
5360 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5361 break;
5363 case 'w':
5364 case 'x':
5365 if (x == const0_rtx
5366 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5368 asm_fprintf (f, "%czr", code);
5369 break;
5372 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5374 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5375 break;
5378 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5380 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5381 break;
5384 /* Fall through */
5386 case 0:
5387 if (x == NULL)
5389 output_operand_lossage ("missing operand");
5390 return;
5393 switch (GET_CODE (x))
5395 case REG:
5396 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5397 break;
5399 case MEM:
5400 output_address (GET_MODE (x), XEXP (x, 0));
5401 /* Check all memory references are Pmode, even with ILP32. */
5402 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5403 break;
5405 case CONST:
5406 case LABEL_REF:
5407 case SYMBOL_REF:
5408 output_addr_const (asm_out_file, x);
5409 break;
5411 case CONST_INT:
5412 asm_fprintf (f, "%wd", INTVAL (x));
5413 break;
5415 case CONST_VECTOR:
5416 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5418 gcc_assert (
5419 aarch64_const_vec_all_same_in_range_p (x,
5420 HOST_WIDE_INT_MIN,
5421 HOST_WIDE_INT_MAX));
5422 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5424 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5426 fputc ('0', f);
5428 else
5429 gcc_unreachable ();
5430 break;
5432 case CONST_DOUBLE:
5433 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5434 be getting CONST_DOUBLEs holding integers. */
5435 gcc_assert (GET_MODE (x) != VOIDmode);
5436 if (aarch64_float_const_zero_rtx_p (x))
5438 fputc ('0', f);
5439 break;
5441 else if (aarch64_float_const_representable_p (x))
5443 #define buf_size 20
5444 char float_buf[buf_size] = {'\0'};
5445 real_to_decimal_for_mode (float_buf,
5446 CONST_DOUBLE_REAL_VALUE (x),
5447 buf_size, buf_size,
5448 1, GET_MODE (x));
5449 asm_fprintf (asm_out_file, "%s", float_buf);
5450 break;
5451 #undef buf_size
5453 output_operand_lossage ("invalid constant");
5454 return;
5455 default:
5456 output_operand_lossage ("invalid operand");
5457 return;
5459 break;
5461 case 'A':
5462 if (GET_CODE (x) == HIGH)
5463 x = XEXP (x, 0);
5465 switch (aarch64_classify_symbolic_expression (x))
5467 case SYMBOL_SMALL_GOT_4G:
5468 asm_fprintf (asm_out_file, ":got:");
5469 break;
5471 case SYMBOL_SMALL_TLSGD:
5472 asm_fprintf (asm_out_file, ":tlsgd:");
5473 break;
5475 case SYMBOL_SMALL_TLSDESC:
5476 asm_fprintf (asm_out_file, ":tlsdesc:");
5477 break;
5479 case SYMBOL_SMALL_TLSIE:
5480 asm_fprintf (asm_out_file, ":gottprel:");
5481 break;
5483 case SYMBOL_TLSLE24:
5484 asm_fprintf (asm_out_file, ":tprel:");
5485 break;
5487 case SYMBOL_TINY_GOT:
5488 gcc_unreachable ();
5489 break;
5491 default:
5492 break;
5494 output_addr_const (asm_out_file, x);
5495 break;
5497 case 'L':
5498 switch (aarch64_classify_symbolic_expression (x))
5500 case SYMBOL_SMALL_GOT_4G:
5501 asm_fprintf (asm_out_file, ":lo12:");
5502 break;
5504 case SYMBOL_SMALL_TLSGD:
5505 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5506 break;
5508 case SYMBOL_SMALL_TLSDESC:
5509 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5510 break;
5512 case SYMBOL_SMALL_TLSIE:
5513 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5514 break;
5516 case SYMBOL_TLSLE12:
5517 asm_fprintf (asm_out_file, ":tprel_lo12:");
5518 break;
5520 case SYMBOL_TLSLE24:
5521 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5522 break;
5524 case SYMBOL_TINY_GOT:
5525 asm_fprintf (asm_out_file, ":got:");
5526 break;
5528 case SYMBOL_TINY_TLSIE:
5529 asm_fprintf (asm_out_file, ":gottprel:");
5530 break;
5532 default:
5533 break;
5535 output_addr_const (asm_out_file, x);
5536 break;
5538 case 'G':
5539 switch (aarch64_classify_symbolic_expression (x))
5541 case SYMBOL_TLSLE24:
5542 asm_fprintf (asm_out_file, ":tprel_hi12:");
5543 break;
5544 default:
5545 break;
5547 output_addr_const (asm_out_file, x);
5548 break;
5550 case 'k':
5552 HOST_WIDE_INT cond_code;
5554 if (!CONST_INT_P (x))
5556 output_operand_lossage ("invalid operand for '%%%c'", code);
5557 return;
5560 cond_code = INTVAL (x);
5561 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5562 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5564 break;
5566 default:
5567 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5568 return;
5572 static void
5573 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5575 struct aarch64_address_info addr;
5577 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5578 switch (addr.type)
5580 case ADDRESS_REG_IMM:
5581 if (addr.offset == const0_rtx)
5582 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5583 else
5584 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5585 INTVAL (addr.offset));
5586 return;
5588 case ADDRESS_REG_REG:
5589 if (addr.shift == 0)
5590 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5591 reg_names [REGNO (addr.offset)]);
5592 else
5593 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5594 reg_names [REGNO (addr.offset)], addr.shift);
5595 return;
5597 case ADDRESS_REG_UXTW:
5598 if (addr.shift == 0)
5599 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5600 REGNO (addr.offset) - R0_REGNUM);
5601 else
5602 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5603 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5604 return;
5606 case ADDRESS_REG_SXTW:
5607 if (addr.shift == 0)
5608 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5609 REGNO (addr.offset) - R0_REGNUM);
5610 else
5611 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5612 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5613 return;
5615 case ADDRESS_REG_WB:
5616 switch (GET_CODE (x))
5618 case PRE_INC:
5619 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5620 GET_MODE_SIZE (mode));
5621 return;
5622 case POST_INC:
5623 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5624 GET_MODE_SIZE (mode));
5625 return;
5626 case PRE_DEC:
5627 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5628 GET_MODE_SIZE (mode));
5629 return;
5630 case POST_DEC:
5631 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5632 GET_MODE_SIZE (mode));
5633 return;
5634 case PRE_MODIFY:
5635 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5636 INTVAL (addr.offset));
5637 return;
5638 case POST_MODIFY:
5639 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5640 INTVAL (addr.offset));
5641 return;
5642 default:
5643 break;
5645 break;
5647 case ADDRESS_LO_SUM:
5648 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5649 output_addr_const (f, addr.offset);
5650 asm_fprintf (f, "]");
5651 return;
5653 case ADDRESS_SYMBOLIC:
5654 break;
5657 output_addr_const (f, x);
5660 bool
5661 aarch64_label_mentioned_p (rtx x)
5663 const char *fmt;
5664 int i;
5666 if (GET_CODE (x) == LABEL_REF)
5667 return true;
5669 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5670 referencing instruction, but they are constant offsets, not
5671 symbols. */
5672 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5673 return false;
5675 fmt = GET_RTX_FORMAT (GET_CODE (x));
5676 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5678 if (fmt[i] == 'E')
5680 int j;
5682 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5683 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5684 return 1;
5686 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5687 return 1;
5690 return 0;
5693 /* Implement REGNO_REG_CLASS. */
5695 enum reg_class
5696 aarch64_regno_regclass (unsigned regno)
5698 if (GP_REGNUM_P (regno))
5699 return GENERAL_REGS;
5701 if (regno == SP_REGNUM)
5702 return STACK_REG;
5704 if (regno == FRAME_POINTER_REGNUM
5705 || regno == ARG_POINTER_REGNUM)
5706 return POINTER_REGS;
5708 if (FP_REGNUM_P (regno))
5709 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5711 return NO_REGS;
5714 static rtx
5715 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5717 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5718 where mask is selected by alignment and size of the offset.
5719 We try to pick as large a range for the offset as possible to
5720 maximize the chance of a CSE. However, for aligned addresses
5721 we limit the range to 4k so that structures with different-sized
5722 elements are likely to use the same base. We need to be careful
5723 not to split a CONST for some forms of address expression, otherwise
5724 it will generate sub-optimal code. */
5726 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5728 rtx base = XEXP (x, 0);
5729 rtx offset_rtx = XEXP (x, 1);
5730 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5732 if (GET_CODE (base) == PLUS)
5734 rtx op0 = XEXP (base, 0);
5735 rtx op1 = XEXP (base, 1);
5737 /* Force any scaling into a temp for CSE. */
5738 op0 = force_reg (Pmode, op0);
5739 op1 = force_reg (Pmode, op1);
5741 /* Let the pointer register be in op0. */
5742 if (REG_POINTER (op1))
5743 std::swap (op0, op1);
5745 /* If the pointer is virtual or frame related, then we know that
5746 virtual register instantiation or register elimination is going
5747 to apply a second constant. We want the two constants folded
5748 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5749 if (virt_or_elim_regno_p (REGNO (op0)))
5751 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5752 NULL_RTX, true, OPTAB_DIRECT);
5753 return gen_rtx_PLUS (Pmode, base, op1);
5756 /* Otherwise, in order to encourage CSE (and thence loop strength
5757 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5758 base = expand_binop (Pmode, add_optab, op0, op1,
5759 NULL_RTX, true, OPTAB_DIRECT);
5760 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5763 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5764 HOST_WIDE_INT base_offset;
5765 if (GET_MODE_SIZE (mode) > 16)
5766 base_offset = (offset + 0x400) & ~0x7f0;
5767 /* For offsets that aren't a multiple of the access size, the limit is
5768 -256...255. */
5769 else if (offset & (GET_MODE_SIZE (mode) - 1))
5771 base_offset = (offset + 0x100) & ~0x1ff;
5773 /* BLKmode typically uses LDP of X-registers. */
5774 if (mode == BLKmode)
5775 base_offset = (offset + 512) & ~0x3ff;
5777 /* Small negative offsets are supported. */
5778 else if (IN_RANGE (offset, -256, 0))
5779 base_offset = 0;
5780 else if (mode == TImode || mode == TFmode)
5781 base_offset = (offset + 0x100) & ~0x1ff;
5782 /* Otherwise, use a 12-bit offset scaled by the access size. */
5783 else
5784 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5786 if (base_offset != 0)
5788 base = plus_constant (Pmode, base, base_offset);
5789 base = force_operand (base, NULL_RTX);
5790 return plus_constant (Pmode, base, offset - base_offset);
5794 return x;
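/* For example, for a DImode access at (plus (reg) (const_int 0x21008)) the
   anchor mask is ~0xfff * 8 == ~0x7fff, so base_offset becomes 0x20000 and
   the remaining offset 0x1008 fits the scaled 12-bit form; other DImode
   accesses within the same 32KB-aligned window can then CSE the 0x20000
   anchor.  */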
5797 /* Return the reload icode required for a constant pool in mode. */
5798 static enum insn_code
5799 aarch64_constant_pool_reload_icode (machine_mode mode)
5801 switch (mode)
5803 case E_SFmode:
5804 return CODE_FOR_aarch64_reload_movcpsfdi;
5806 case E_DFmode:
5807 return CODE_FOR_aarch64_reload_movcpdfdi;
5809 case E_TFmode:
5810 return CODE_FOR_aarch64_reload_movcptfdi;
5812 case E_V8QImode:
5813 return CODE_FOR_aarch64_reload_movcpv8qidi;
5815 case E_V16QImode:
5816 return CODE_FOR_aarch64_reload_movcpv16qidi;
5818 case E_V4HImode:
5819 return CODE_FOR_aarch64_reload_movcpv4hidi;
5821 case E_V8HImode:
5822 return CODE_FOR_aarch64_reload_movcpv8hidi;
5824 case E_V2SImode:
5825 return CODE_FOR_aarch64_reload_movcpv2sidi;
5827 case E_V4SImode:
5828 return CODE_FOR_aarch64_reload_movcpv4sidi;
5830 case E_V2DImode:
5831 return CODE_FOR_aarch64_reload_movcpv2didi;
5833 case E_V2DFmode:
5834 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5836 default:
5837 gcc_unreachable ();
5840 gcc_unreachable ();
5842 static reg_class_t
5843 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5844 reg_class_t rclass,
5845 machine_mode mode,
5846 secondary_reload_info *sri)
5849 /* If we have to disable direct literal pool loads and stores because the
5850 function is too big, then we need a scratch register. */
5851 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5852 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5853 || targetm.vector_mode_supported_p (GET_MODE (x)))
5854 && !aarch64_pcrelative_literal_loads)
5856 sri->icode = aarch64_constant_pool_reload_icode (mode);
5857 return NO_REGS;
5860 /* Without the TARGET_SIMD instructions we cannot move a Q register
5861 to a Q register directly. We need a scratch. */
5862 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5863 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5864 && reg_class_subset_p (rclass, FP_REGS))
5866 if (mode == TFmode)
5867 sri->icode = CODE_FOR_aarch64_reload_movtf;
5868 else if (mode == TImode)
5869 sri->icode = CODE_FOR_aarch64_reload_movti;
5870 return NO_REGS;
5873 /* A TFmode or TImode memory access should be handled via FP_REGS
5874 because AArch64 has richer addressing modes for LDR/STR instructions
5875 than LDP/STP instructions. */
5876 if (TARGET_FLOAT && rclass == GENERAL_REGS
5877 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5878 return FP_REGS;
5880 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5881 return GENERAL_REGS;
5883 return NO_REGS;
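/* Implement TARGET_CAN_ELIMINATE. Return true if register FROM can be
   eliminated in favour of register TO for the current function. */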
5886 static bool
5887 aarch64_can_eliminate (const int from, const int to)
5889 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5890 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5892 if (frame_pointer_needed)
5894 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5895 return true;
5896 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5897 return false;
5898 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5899 && !cfun->calls_alloca)
5900 return true;
5901 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5902 return true;
5904 return false;
5906 else
5908 /* If we decided that we didn't need a leaf frame pointer but then used
5909 LR in the function, then we'll want a frame pointer after all, so
5910 prevent this elimination to ensure a frame pointer is used. */
5911 if (to == STACK_POINTER_REGNUM
5912 && flag_omit_leaf_frame_pointer
5913 && df_regs_ever_live_p (LR_REGNUM))
5914 return false;
5917 return true;
5920 HOST_WIDE_INT
5921 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5923 aarch64_layout_frame ();
5925 if (to == HARD_FRAME_POINTER_REGNUM)
5927 if (from == ARG_POINTER_REGNUM)
5928 return cfun->machine->frame.hard_fp_offset;
5930 if (from == FRAME_POINTER_REGNUM)
5931 return cfun->machine->frame.hard_fp_offset
5932 - cfun->machine->frame.locals_offset;
5935 if (to == STACK_POINTER_REGNUM)
5937 if (from == FRAME_POINTER_REGNUM)
5938 return cfun->machine->frame.frame_size
5939 - cfun->machine->frame.locals_offset;
5942 return cfun->machine->frame.frame_size;
5945 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5946 previous frame. */
5948 rtx
5949 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5951 if (count != 0)
5952 return const0_rtx;
5953 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5957 static void
5958 aarch64_asm_trampoline_template (FILE *f)
5960 if (TARGET_ILP32)
5962 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5963 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5965 else
5967 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5968 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5970 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5971 assemble_aligned_integer (4, const0_rtx);
5972 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5973 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
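/* The trampoline emitted by the template above is 16 bytes of code
   followed by two pointer-sized data slots: the address of the nested
   function at offset 16 and the static chain value at offset
   16 + POINTER_BYTES. aarch64_trampoline_init below copies the code
   part and then fills in those two slots. */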
5976 static void
5977 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5979 rtx fnaddr, mem, a_tramp;
5980 const int tramp_code_sz = 16;
5982 /* Don't need to copy the trailing D-words, we fill those in below. */
5983 emit_block_move (m_tramp, assemble_trampoline_template (),
5984 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5985 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5986 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5987 if (GET_MODE (fnaddr) != ptr_mode)
5988 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5989 emit_move_insn (mem, fnaddr);
5991 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5992 emit_move_insn (mem, chain_value);
5994 /* XXX We should really define a "clear_cache" pattern and use
5995 gen_clear_cache(). */
5996 a_tramp = XEXP (m_tramp, 0);
5997 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5998 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
5999 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6000 ptr_mode);
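/* Implement TARGET_CLASS_MAX_NREGS. As an example of the calculation
   below, TImode (16 bytes) occupies two X registers of GENERAL_REGS,
   while a 16-byte vector mode such as V4SImode occupies a single
   Q register of FP_REGS. */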
6003 static unsigned char
6004 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6006 switch (regclass)
6008 case CALLER_SAVE_REGS:
6009 case POINTER_REGS:
6010 case GENERAL_REGS:
6011 case ALL_REGS:
6012 case FP_REGS:
6013 case FP_LO_REGS:
6014 return
6015 aarch64_vector_mode_p (mode)
6016 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6017 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6018 case STACK_REG:
6019 return 1;
6021 case NO_REGS:
6022 return 0;
6024 default:
6025 break;
6027 gcc_unreachable ();
6030 static reg_class_t
6031 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6033 if (regclass == POINTER_REGS)
6034 return GENERAL_REGS;
6036 if (regclass == STACK_REG)
6038 if (REG_P(x)
6039 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6040 return regclass;
6042 return NO_REGS;
6045 /* Register elimination can result in a request for
6046 SP+constant->FP_REGS. We cannot support such operations, which
6047 use SP as the source and an FP_REG as the destination, so reject
6048 them right away. */
6049 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6051 rtx lhs = XEXP (x, 0);
6053 /* Look through a possible SUBREG introduced by ILP32. */
6054 if (GET_CODE (lhs) == SUBREG)
6055 lhs = SUBREG_REG (lhs);
6057 gcc_assert (REG_P (lhs));
6058 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6059 POINTER_REGS));
6060 return NO_REGS;
6063 return regclass;
6066 void
6067 aarch64_asm_output_labelref (FILE* f, const char *name)
6069 asm_fprintf (f, "%U%s", name);
6072 static void
6073 aarch64_elf_asm_constructor (rtx symbol, int priority)
6075 if (priority == DEFAULT_INIT_PRIORITY)
6076 default_ctor_section_asm_out_constructor (symbol, priority);
6077 else
6079 section *s;
6080 /* While priority is known to be in range [0, 65535], so 18 bytes
6081 would be enough, the compiler might not know that. To avoid
6082 -Wformat-truncation false positive, use a larger size. */
6083 char buf[23];
6084 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6085 s = get_section (buf, SECTION_WRITE, NULL);
6086 switch_to_section (s);
6087 assemble_align (POINTER_SIZE);
6088 assemble_aligned_integer (POINTER_BYTES, symbol);
6092 static void
6093 aarch64_elf_asm_destructor (rtx symbol, int priority)
6095 if (priority == DEFAULT_INIT_PRIORITY)
6096 default_dtor_section_asm_out_destructor (symbol, priority);
6097 else
6099 section *s;
6100 /* While priority is known to be in range [0, 65535], so 18 bytes
6101 would be enough, the compiler might not know that. To avoid
6102 -Wformat-truncation false positive, use a larger size. */
6103 char buf[23];
6104 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6105 s = get_section (buf, SECTION_WRITE, NULL);
6106 switch_to_section (s);
6107 assemble_align (POINTER_SIZE);
6108 assemble_aligned_integer (POINTER_BYTES, symbol);
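/* Output the assembly for a casesi dispatch sequence. For a table of
   HImode entries this emits, with the operand registers shown purely
   for illustration:

     ldrh  w3, [x0, w1, uxtw #1]
     adr   x4, .Lrtx<N>
     add   x3, x4, w3, sxth #2
     br    x3

   i.e. load the table entry, form the address of the table base and
   branch to base + 4 * entry. */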
6112 const char*
6113 aarch64_output_casesi (rtx *operands)
6115 char buf[100];
6116 char label[100];
6117 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6118 int index;
6119 static const char *const patterns[4][2] =
6122 "ldrb\t%w3, [%0,%w1,uxtw]",
6123 "add\t%3, %4, %w3, sxtb #2"
6126 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6127 "add\t%3, %4, %w3, sxth #2"
6130 "ldr\t%w3, [%0,%w1,uxtw #2]",
6131 "add\t%3, %4, %w3, sxtw #2"
6133 /* We assume that DImode is only generated when not optimizing and
6134 that we don't really need 64-bit address offsets. That would
6135 imply an object file with 8GB of code in a single function! */
6137 "ldr\t%w3, [%0,%w1,uxtw #2]",
6138 "add\t%3, %4, %w3, sxtw #2"
6142 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6144 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6145 index = exact_log2 (GET_MODE_SIZE (mode));
6147 gcc_assert (index >= 0 && index <= 3);
6149 /* Need to implement table size reduction, by changing the code below. */
6150 output_asm_insn (patterns[index][0], operands);
6151 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6152 snprintf (buf, sizeof (buf),
6153 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6154 output_asm_insn (buf, operands);
6155 output_asm_insn (patterns[index][1], operands);
6156 output_asm_insn ("br\t%3", operands);
6157 assemble_label (asm_out_file, label);
6158 return "";
6162 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6163 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6164 operator. */
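/* For example, aarch64_uxt_size (2, 0x3fc) returns 8: the mask 0x3fc is
   0xff shifted left by 2, which matches a UXTB operand with an LSL #2. */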
6166 int
6167 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6169 if (shift >= 0 && shift <= 3)
6171 int size;
6172 for (size = 8; size <= 32; size *= 2)
6174 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6175 if (mask == bits << shift)
6176 return size;
6179 return 0;
6182 /* Constant pools are per-function only when PC-relative
6183 literal loads are enabled or we are in the large memory
6184 model. */
6186 static inline bool
6187 aarch64_can_use_per_function_literal_pools_p (void)
6189 return (aarch64_pcrelative_literal_loads
6190 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6193 static bool
6194 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6196 /* FIXME: In an ideal world this would work similarly
6197 to the logic in aarch64_select_rtx_section, but this
6198 breaks bootstrap in gccgo. For now we work around
6199 this by returning false here. */
6200 return false;
6203 /* Select appropriate section for constants depending
6204 on where we place literal pools. */
6206 static section *
6207 aarch64_select_rtx_section (machine_mode mode,
6208 rtx x,
6209 unsigned HOST_WIDE_INT align)
6211 if (aarch64_can_use_per_function_literal_pools_p ())
6212 return function_section (current_function_decl);
6214 return default_elf_select_rtx_section (mode, x, align);
6217 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6218 void
6219 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6220 HOST_WIDE_INT offset)
6222 /* When using per-function literal pools, we must ensure that any code
6223 section is aligned to the minimal instruction length, lest we get
6224 errors from the assembler re "unaligned instructions". */
6225 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6226 ASM_OUTPUT_ALIGN (f, 2);
6229 /* Costs. */
6231 /* Helper function for rtx cost calculation. Strip a shift expression
6232 from X. Returns the inner operand if successful, or the original
6233 expression on failure. */
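/* For example, both (ashift X (const_int 3)) and (mult X (const_int 8))
   strip down to X, since a multiply by a power of two is output as a
   shift. */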
6234 static rtx
6235 aarch64_strip_shift (rtx x)
6237 rtx op = x;
6239 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6240 we can convert both to ROR during final output. */
6241 if ((GET_CODE (op) == ASHIFT
6242 || GET_CODE (op) == ASHIFTRT
6243 || GET_CODE (op) == LSHIFTRT
6244 || GET_CODE (op) == ROTATERT
6245 || GET_CODE (op) == ROTATE)
6246 && CONST_INT_P (XEXP (op, 1)))
6247 return XEXP (op, 0);
6249 if (GET_CODE (op) == MULT
6250 && CONST_INT_P (XEXP (op, 1))
6251 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6252 return XEXP (op, 0);
6254 return x;
6257 /* Helper function for rtx cost calculation. Strip an extend
6258 expression from X. Returns the inner operand if successful, or the
6259 original expression on failure. We deal with a number of possible
6260 canonicalization variations here. If STRIP_SHIFT is true, then
6261 we can strip off a shift also. */
6262 static rtx
6263 aarch64_strip_extend (rtx x, bool strip_shift)
6265 scalar_int_mode mode;
6266 rtx op = x;
6268 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6269 return op;
6271 /* Zero and sign extraction of a widened value. */
6272 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6273 && XEXP (op, 2) == const0_rtx
6274 && GET_CODE (XEXP (op, 0)) == MULT
6275 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6276 XEXP (op, 1)))
6277 return XEXP (XEXP (op, 0), 0);
6279 /* It can also be represented (for zero-extend) as an AND with an
6280 immediate. */
6281 if (GET_CODE (op) == AND
6282 && GET_CODE (XEXP (op, 0)) == MULT
6283 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6284 && CONST_INT_P (XEXP (op, 1))
6285 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6286 INTVAL (XEXP (op, 1))) != 0)
6287 return XEXP (XEXP (op, 0), 0);
6289 /* Now handle extended register, as this may also have an optional
6290 left shift by 1..4. */
6291 if (strip_shift
6292 && GET_CODE (op) == ASHIFT
6293 && CONST_INT_P (XEXP (op, 1))
6294 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6295 op = XEXP (op, 0);
6297 if (GET_CODE (op) == ZERO_EXTEND
6298 || GET_CODE (op) == SIGN_EXTEND)
6299 op = XEXP (op, 0);
6301 if (op != x)
6302 return op;
6304 return x;
6307 /* Return true iff CODE is a shift supported in combination
6308 with arithmetic instructions. */
6310 static bool
6311 aarch64_shift_p (enum rtx_code code)
6313 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6317 /* Return true iff X is a cheap shift without a sign extend. */
6319 static bool
6320 aarch64_cheap_mult_shift_p (rtx x)
6322 rtx op0, op1;
6324 op0 = XEXP (x, 0);
6325 op1 = XEXP (x, 1);
6327 if (!(aarch64_tune_params.extra_tuning_flags
6328 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6329 return false;
6331 if (GET_CODE (op0) == SIGN_EXTEND)
6332 return false;
6334 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6335 && UINTVAL (op1) <= 4)
6336 return true;
6338 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6339 return false;
6341 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6343 if (l2 > 0 && l2 <= 4)
6344 return true;
6346 return false;
6349 /* Helper function for rtx cost calculation. Calculate the cost of
6350 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6351 Return the calculated cost of the expression, recursing manually into
6352 operands where needed. */
6354 static int
6355 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6357 rtx op0, op1;
6358 const struct cpu_cost_table *extra_cost
6359 = aarch64_tune_params.insn_extra_cost;
6360 int cost = 0;
6361 bool compound_p = (outer == PLUS || outer == MINUS);
6362 machine_mode mode = GET_MODE (x);
6364 gcc_checking_assert (code == MULT);
6366 op0 = XEXP (x, 0);
6367 op1 = XEXP (x, 1);
6369 if (VECTOR_MODE_P (mode))
6370 mode = GET_MODE_INNER (mode);
6372 /* Integer multiply/fma. */
6373 if (GET_MODE_CLASS (mode) == MODE_INT)
6375 /* The multiply will be canonicalized as a shift, cost it as such. */
6376 if (aarch64_shift_p (GET_CODE (x))
6377 || (CONST_INT_P (op1)
6378 && exact_log2 (INTVAL (op1)) > 0))
6380 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6381 || GET_CODE (op0) == SIGN_EXTEND;
6382 if (speed)
6384 if (compound_p)
6386 /* If the shift is considered cheap,
6387 then don't add any cost. */
6388 if (aarch64_cheap_mult_shift_p (x))
6390 else if (REG_P (op1))
6391 /* ARITH + shift-by-register. */
6392 cost += extra_cost->alu.arith_shift_reg;
6393 else if (is_extend)
6394 /* ARITH + extended register. We don't have a cost field
6395 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6396 cost += extra_cost->alu.extend_arith;
6397 else
6398 /* ARITH + shift-by-immediate. */
6399 cost += extra_cost->alu.arith_shift;
6401 else
6402 /* LSL (immediate). */
6403 cost += extra_cost->alu.shift;
6406 /* Strip extends as we will have costed them in the case above. */
6407 if (is_extend)
6408 op0 = aarch64_strip_extend (op0, true);
6410 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6412 return cost;
6415 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6416 compound and let the below cases handle it. After all, MNEG is a
6417 special-case alias of MSUB. */
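/* For example, (mult (neg X) Y) is output as MNEG, which is simply
   MSUB with XZR as the accumulator. */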
6418 if (GET_CODE (op0) == NEG)
6420 op0 = XEXP (op0, 0);
6421 compound_p = true;
6424 /* Integer multiplies or FMAs have zero/sign extending variants. */
6425 if ((GET_CODE (op0) == ZERO_EXTEND
6426 && GET_CODE (op1) == ZERO_EXTEND)
6427 || (GET_CODE (op0) == SIGN_EXTEND
6428 && GET_CODE (op1) == SIGN_EXTEND))
6430 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6431 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6433 if (speed)
6435 if (compound_p)
6436 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6437 cost += extra_cost->mult[0].extend_add;
6438 else
6439 /* MUL/SMULL/UMULL. */
6440 cost += extra_cost->mult[0].extend;
6443 return cost;
6446 /* This is either an integer multiply or a MADD. In both cases
6447 we want to recurse and cost the operands. */
6448 cost += rtx_cost (op0, mode, MULT, 0, speed);
6449 cost += rtx_cost (op1, mode, MULT, 1, speed);
6451 if (speed)
6453 if (compound_p)
6454 /* MADD/MSUB. */
6455 cost += extra_cost->mult[mode == DImode].add;
6456 else
6457 /* MUL. */
6458 cost += extra_cost->mult[mode == DImode].simple;
6461 return cost;
6463 else
6465 if (speed)
6467 /* Floating-point FMA/FMUL can also support negations of the
6468 operands, unless the rounding mode is upward or downward, in
6469 which case FNMUL differs from FMUL with operand negation. */
6470 bool neg0 = GET_CODE (op0) == NEG;
6471 bool neg1 = GET_CODE (op1) == NEG;
6472 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6474 if (neg0)
6475 op0 = XEXP (op0, 0);
6476 if (neg1)
6477 op1 = XEXP (op1, 0);
6480 if (compound_p)
6481 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6482 cost += extra_cost->fp[mode == DFmode].fma;
6483 else
6484 /* FMUL/FNMUL. */
6485 cost += extra_cost->fp[mode == DFmode].mult;
6488 cost += rtx_cost (op0, mode, MULT, 0, speed);
6489 cost += rtx_cost (op1, mode, MULT, 1, speed);
6490 return cost;
6494 static int
6495 aarch64_address_cost (rtx x,
6496 machine_mode mode,
6497 addr_space_t as ATTRIBUTE_UNUSED,
6498 bool speed)
6500 enum rtx_code c = GET_CODE (x);
6501 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6502 struct aarch64_address_info info;
6503 int cost = 0;
6504 info.shift = 0;
6506 if (!aarch64_classify_address (&info, x, mode, c, false))
6508 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6510 /* This is a CONST or SYMBOL ref which will be split
6511 in a different way depending on the code model in use.
6512 Cost it through the generic infrastructure. */
6513 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6514 /* Divide through by the cost of one instruction to
6515 bring it to the same units as the address costs. */
6516 cost_symbol_ref /= COSTS_N_INSNS (1);
6517 /* The cost is then the cost of preparing the address,
6518 followed by an immediate (possibly 0) offset. */
6519 return cost_symbol_ref + addr_cost->imm_offset;
6521 else
6523 /* This is most likely a jump table from a case
6524 statement. */
6525 return addr_cost->register_offset;
6529 switch (info.type)
6531 case ADDRESS_LO_SUM:
6532 case ADDRESS_SYMBOLIC:
6533 case ADDRESS_REG_IMM:
6534 cost += addr_cost->imm_offset;
6535 break;
6537 case ADDRESS_REG_WB:
6538 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6539 cost += addr_cost->pre_modify;
6540 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6541 cost += addr_cost->post_modify;
6542 else
6543 gcc_unreachable ();
6545 break;
6547 case ADDRESS_REG_REG:
6548 cost += addr_cost->register_offset;
6549 break;
6551 case ADDRESS_REG_SXTW:
6552 cost += addr_cost->register_sextend;
6553 break;
6555 case ADDRESS_REG_UXTW:
6556 cost += addr_cost->register_zextend;
6557 break;
6559 default:
6560 gcc_unreachable ();
6564 if (info.shift > 0)
6566 /* For the sake of calculating the cost of the shifted register
6567 component, we can treat same sized modes in the same way. */
6568 switch (GET_MODE_BITSIZE (mode))
6570 case 16:
6571 cost += addr_cost->addr_scale_costs.hi;
6572 break;
6574 case 32:
6575 cost += addr_cost->addr_scale_costs.si;
6576 break;
6578 case 64:
6579 cost += addr_cost->addr_scale_costs.di;
6580 break;
6582 /* We can't tell, or this is a 128-bit vector. */
6583 default:
6584 cost += addr_cost->addr_scale_costs.ti;
6585 break;
6589 return cost;
6592 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6593 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6594 to be taken. */
6596 int
6597 aarch64_branch_cost (bool speed_p, bool predictable_p)
6599 /* When optimizing for speed, use the cost of unpredictable branches. */
6600 const struct cpu_branch_cost *branch_costs =
6601 aarch64_tune_params.branch_costs;
6603 if (!speed_p || predictable_p)
6604 return branch_costs->predictable;
6605 else
6606 return branch_costs->unpredictable;
6609 /* Return true if the RTX X in mode MODE is a zero or sign extract
6610 usable in an ADD or SUB (extended register) instruction. */
6611 static bool
6612 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6614 /* Catch add with a sign extract.
6615 This is add_<optab><mode>_multp2. */
6616 if (GET_CODE (x) == SIGN_EXTRACT
6617 || GET_CODE (x) == ZERO_EXTRACT)
6619 rtx op0 = XEXP (x, 0);
6620 rtx op1 = XEXP (x, 1);
6621 rtx op2 = XEXP (x, 2);
6623 if (GET_CODE (op0) == MULT
6624 && CONST_INT_P (op1)
6625 && op2 == const0_rtx
6626 && CONST_INT_P (XEXP (op0, 1))
6627 && aarch64_is_extend_from_extract (mode,
6628 XEXP (op0, 1),
6629 op1))
6631 return true;
6634 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6635 No shift. */
6636 else if (GET_CODE (x) == SIGN_EXTEND
6637 || GET_CODE (x) == ZERO_EXTEND)
6638 return REG_P (XEXP (x, 0));
6640 return false;
6643 static bool
6644 aarch64_frint_unspec_p (unsigned int u)
6646 switch (u)
6648 case UNSPEC_FRINTZ:
6649 case UNSPEC_FRINTP:
6650 case UNSPEC_FRINTM:
6651 case UNSPEC_FRINTA:
6652 case UNSPEC_FRINTN:
6653 case UNSPEC_FRINTX:
6654 case UNSPEC_FRINTI:
6655 return true;
6657 default:
6658 return false;
6662 /* Return true iff X is an rtx that will match an extr instruction
6663 i.e. as described in the *extr<mode>5_insn family of patterns.
6664 OP0 and OP1 will be set to the operands of the shifts involved
6665 on success and will be NULL_RTX otherwise. */
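/* For example, in SImode (ior (ashift X (const_int 3))
   (lshiftrt Y (const_int 29))) matches because 3 + 29 == 32; *RES_OP0
   is set to X and *RES_OP1 to Y. */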
6667 static bool
6668 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6670 rtx op0, op1;
6671 scalar_int_mode mode;
6672 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6673 return false;
6675 *res_op0 = NULL_RTX;
6676 *res_op1 = NULL_RTX;
6678 if (GET_CODE (x) != IOR)
6679 return false;
6681 op0 = XEXP (x, 0);
6682 op1 = XEXP (x, 1);
6684 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6685 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6687 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6688 if (GET_CODE (op1) == ASHIFT)
6689 std::swap (op0, op1);
6691 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6692 return false;
6694 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6695 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6697 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6698 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6700 *res_op0 = XEXP (op0, 0);
6701 *res_op1 = XEXP (op1, 0);
6702 return true;
6706 return false;
6709 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6710 storing it in *COST. Result is true if the total cost of the operation
6711 has now been calculated. */
6712 static bool
6713 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6715 rtx inner;
6716 rtx comparator;
6717 enum rtx_code cmpcode;
6719 if (COMPARISON_P (op0))
6721 inner = XEXP (op0, 0);
6722 comparator = XEXP (op0, 1);
6723 cmpcode = GET_CODE (op0);
6725 else
6727 inner = op0;
6728 comparator = const0_rtx;
6729 cmpcode = NE;
6732 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6734 /* Conditional branch. */
6735 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6736 return true;
6737 else
6739 if (cmpcode == NE || cmpcode == EQ)
6741 if (comparator == const0_rtx)
6743 /* TBZ/TBNZ/CBZ/CBNZ. */
6744 if (GET_CODE (inner) == ZERO_EXTRACT)
6745 /* TBZ/TBNZ. */
6746 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6747 ZERO_EXTRACT, 0, speed);
6748 else
6749 /* CBZ/CBNZ. */
6750 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6752 return true;
6755 else if (cmpcode == LT || cmpcode == GE)
6757 /* TBZ/TBNZ. */
6758 if (comparator == const0_rtx)
6759 return true;
6763 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6765 /* CCMP. */
6766 if (GET_CODE (op1) == COMPARE)
6768 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6769 if (XEXP (op1, 1) == const0_rtx)
6770 *cost += 1;
6771 if (speed)
6773 machine_mode mode = GET_MODE (XEXP (op1, 0));
6774 const struct cpu_cost_table *extra_cost
6775 = aarch64_tune_params.insn_extra_cost;
6777 if (GET_MODE_CLASS (mode) == MODE_INT)
6778 *cost += extra_cost->alu.arith;
6779 else
6780 *cost += extra_cost->fp[mode == DFmode].compare;
6782 return true;
6785 /* It's a conditional operation based on the status flags,
6786 so it must be some flavor of CSEL. */
6788 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6789 if (GET_CODE (op1) == NEG
6790 || GET_CODE (op1) == NOT
6791 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6792 op1 = XEXP (op1, 0);
6793 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6795 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6796 op1 = XEXP (op1, 0);
6797 op2 = XEXP (op2, 0);
6800 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6801 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6802 return true;
6805 /* We don't know what this is, cost all operands. */
6806 return false;
6809 /* Check whether X is a bitfield operation of the form shift + extend that
6810 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6811 operand to which the bitfield operation is applied. Otherwise return
6812 NULL_RTX. */
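/* For example, (zero_extend:SI (lshiftrt:HI X (const_int 3))) maps to a
   UBFX and the function returns X. */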
6814 static rtx
6815 aarch64_extend_bitfield_pattern_p (rtx x)
6817 rtx_code outer_code = GET_CODE (x);
6818 machine_mode outer_mode = GET_MODE (x);
6820 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6821 && outer_mode != SImode && outer_mode != DImode)
6822 return NULL_RTX;
6824 rtx inner = XEXP (x, 0);
6825 rtx_code inner_code = GET_CODE (inner);
6826 machine_mode inner_mode = GET_MODE (inner);
6827 rtx op = NULL_RTX;
6829 switch (inner_code)
6831 case ASHIFT:
6832 if (CONST_INT_P (XEXP (inner, 1))
6833 && (inner_mode == QImode || inner_mode == HImode))
6834 op = XEXP (inner, 0);
6835 break;
6836 case LSHIFTRT:
6837 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6838 && (inner_mode == QImode || inner_mode == HImode))
6839 op = XEXP (inner, 0);
6840 break;
6841 case ASHIFTRT:
6842 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6843 && (inner_mode == QImode || inner_mode == HImode))
6844 op = XEXP (inner, 0);
6845 break;
6846 default:
6847 break;
6850 return op;
6853 /* Return true if the mask and a shift amount from an RTX of the form
6854 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6855 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
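/* For example, in SImode a shift amount of 8 with mask 0x00ffff00 is
   accepted: (0x00ffff00 >> 8) + 1 == 0x10000 is a power of two and none
   of the mask bits overlap the low 8 bits, so the combination becomes a
   UBFIZ with lsb 8 and width 16. */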
6857 bool
6858 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6859 rtx shft_amnt)
6861 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6862 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6863 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6864 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6867 /* Calculate the cost of calculating X, storing it in *COST. Result
6868 is true if the total cost of the operation has now been calculated. */
6869 static bool
6870 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6871 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6873 rtx op0, op1, op2;
6874 const struct cpu_cost_table *extra_cost
6875 = aarch64_tune_params.insn_extra_cost;
6876 int code = GET_CODE (x);
6877 scalar_int_mode int_mode;
6879 /* By default, assume that everything has equivalent cost to the
6880 cheapest instruction. Any additional costs are applied as a delta
6881 above this default. */
6882 *cost = COSTS_N_INSNS (1);
6884 switch (code)
6886 case SET:
6887 /* The cost depends entirely on the operands to SET. */
6888 *cost = 0;
6889 op0 = SET_DEST (x);
6890 op1 = SET_SRC (x);
6892 switch (GET_CODE (op0))
6894 case MEM:
6895 if (speed)
6897 rtx address = XEXP (op0, 0);
6898 if (VECTOR_MODE_P (mode))
6899 *cost += extra_cost->ldst.storev;
6900 else if (GET_MODE_CLASS (mode) == MODE_INT)
6901 *cost += extra_cost->ldst.store;
6902 else if (mode == SFmode)
6903 *cost += extra_cost->ldst.storef;
6904 else if (mode == DFmode)
6905 *cost += extra_cost->ldst.stored;
6907 *cost +=
6908 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6909 0, speed));
6912 *cost += rtx_cost (op1, mode, SET, 1, speed);
6913 return true;
6915 case SUBREG:
6916 if (! REG_P (SUBREG_REG (op0)))
6917 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6919 /* Fall through. */
6920 case REG:
6921 /* The cost is one per vector-register copied. */
6922 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6924 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6925 / GET_MODE_SIZE (V4SImode);
6926 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6928 /* const0_rtx is in general free, but we will use an
6929 instruction to set a register to 0. */
6930 else if (REG_P (op1) || op1 == const0_rtx)
6932 /* The cost is 1 per register copied. */
6933 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6934 / UNITS_PER_WORD;
6935 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6937 else
6938 /* Cost is just the cost of the RHS of the set. */
6939 *cost += rtx_cost (op1, mode, SET, 1, speed);
6940 return true;
6942 case ZERO_EXTRACT:
6943 case SIGN_EXTRACT:
6944 /* Bit-field insertion. Strip any redundant widening of
6945 the RHS to meet the width of the target. */
6946 if (GET_CODE (op1) == SUBREG)
6947 op1 = SUBREG_REG (op1);
6948 if ((GET_CODE (op1) == ZERO_EXTEND
6949 || GET_CODE (op1) == SIGN_EXTEND)
6950 && CONST_INT_P (XEXP (op0, 1))
6951 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6952 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6953 op1 = XEXP (op1, 0);
6955 if (CONST_INT_P (op1))
6957 /* MOV immediate is assumed to always be cheap. */
6958 *cost = COSTS_N_INSNS (1);
6960 else
6962 /* BFM. */
6963 if (speed)
6964 *cost += extra_cost->alu.bfi;
6965 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6968 return true;
6970 default:
6971 /* We can't make sense of this, assume default cost. */
6972 *cost = COSTS_N_INSNS (1);
6973 return false;
6975 return false;
6977 case CONST_INT:
6978 /* If an instruction can incorporate a constant within the
6979 instruction, the instruction's expression avoids calling
6980 rtx_cost() on the constant. If rtx_cost() is called on a
6981 constant, then it is usually because the constant must be
6982 moved into a register by one or more instructions.
6984 The exception is constant 0, which can be expressed
6985 as XZR/WZR and is therefore free. The exception to this is
6986 if we have (set (reg) (const0_rtx)) in which case we must cost
6987 the move. However, we can catch that when we cost the SET, so
6988 we don't need to consider that here. */
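/* For example, 0 is free via XZR/WZR, 0x1234 takes a single MOV, and
   0x12345678 takes a MOV plus a MOVK and is therefore costed as two
   instructions. */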
6989 if (x == const0_rtx)
6990 *cost = 0;
6991 else
6993 /* To an approximation, the cost of building any other constant is
6994 proportional to the number of instructions required to build
6995 that constant. This is true whether we are compiling for SPEED
6996 or otherwise. */
6997 if (!is_a <scalar_int_mode> (mode, &int_mode))
6998 int_mode = word_mode;
6999 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7000 (NULL_RTX, x, false, int_mode));
7002 return true;
7004 case CONST_DOUBLE:
7006 /* First determine number of instructions to do the move
7007 as an integer constant. */
7008 if (!aarch64_float_const_representable_p (x)
7009 && !aarch64_can_const_movi_rtx_p (x, mode)
7010 && aarch64_float_const_rtx_p (x))
7012 unsigned HOST_WIDE_INT ival;
7013 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7014 gcc_assert (succeed);
7016 scalar_int_mode imode = (mode == HFmode
7017 ? SImode
7018 : int_mode_for_mode (mode).require ());
7019 int ncost = aarch64_internal_mov_immediate
7020 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7021 *cost += COSTS_N_INSNS (ncost);
7022 return true;
7025 if (speed)
7027 /* mov[df,sf]_aarch64. */
7028 if (aarch64_float_const_representable_p (x))
7029 /* FMOV (scalar immediate). */
7030 *cost += extra_cost->fp[mode == DFmode].fpconst;
7031 else if (!aarch64_float_const_zero_rtx_p (x))
7033 /* This will be a load from memory. */
7034 if (mode == DFmode)
7035 *cost += extra_cost->ldst.loadd;
7036 else
7037 *cost += extra_cost->ldst.loadf;
7039 else
7040 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7041 or MOV v0.s[0], wzr - neither of which is modeled by the
7042 cost tables. Just use the default cost. */
7047 return true;
7049 case MEM:
7050 if (speed)
7052 /* For loads we want the base cost of a load, plus an
7053 approximation for the additional cost of the addressing
7054 mode. */
7055 rtx address = XEXP (x, 0);
7056 if (VECTOR_MODE_P (mode))
7057 *cost += extra_cost->ldst.loadv;
7058 else if (GET_MODE_CLASS (mode) == MODE_INT)
7059 *cost += extra_cost->ldst.load;
7060 else if (mode == SFmode)
7061 *cost += extra_cost->ldst.loadf;
7062 else if (mode == DFmode)
7063 *cost += extra_cost->ldst.loadd;
7065 *cost +=
7066 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7067 0, speed));
7070 return true;
7072 case NEG:
7073 op0 = XEXP (x, 0);
7075 if (VECTOR_MODE_P (mode))
7077 if (speed)
7079 /* FNEG. */
7080 *cost += extra_cost->vect.alu;
7082 return false;
7085 if (GET_MODE_CLASS (mode) == MODE_INT)
7087 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7088 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7090 /* CSETM. */
7091 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7092 return true;
7095 /* Cost this as SUB wzr, X. */
7096 op0 = CONST0_RTX (mode);
7097 op1 = XEXP (x, 0);
7098 goto cost_minus;
7101 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7103 /* Support (neg(fma...)) as a single instruction only if
7104 sign of zeros is unimportant. This matches the decision
7105 making in aarch64.md. */
7106 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7108 /* FNMADD. */
7109 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7110 return true;
7112 if (GET_CODE (op0) == MULT)
7114 /* FNMUL. */
7115 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7116 return true;
7118 if (speed)
7119 /* FNEG. */
7120 *cost += extra_cost->fp[mode == DFmode].neg;
7121 return false;
7124 return false;
7126 case CLRSB:
7127 case CLZ:
7128 if (speed)
7130 if (VECTOR_MODE_P (mode))
7131 *cost += extra_cost->vect.alu;
7132 else
7133 *cost += extra_cost->alu.clz;
7136 return false;
7138 case COMPARE:
7139 op0 = XEXP (x, 0);
7140 op1 = XEXP (x, 1);
7142 if (op1 == const0_rtx
7143 && GET_CODE (op0) == AND)
7145 x = op0;
7146 mode = GET_MODE (op0);
7147 goto cost_logic;
7150 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7152 /* TODO: A write to the CC flags possibly costs extra; this
7153 needs encoding in the cost tables. */
7155 mode = GET_MODE (op0);
7156 /* ANDS. */
7157 if (GET_CODE (op0) == AND)
7159 x = op0;
7160 goto cost_logic;
7163 if (GET_CODE (op0) == PLUS)
7165 /* ADDS (and CMN alias). */
7166 x = op0;
7167 goto cost_plus;
7170 if (GET_CODE (op0) == MINUS)
7172 /* SUBS. */
7173 x = op0;
7174 goto cost_minus;
7177 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7178 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7179 && CONST_INT_P (XEXP (op0, 2)))
7181 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7182 Handle it here directly rather than going to cost_logic
7183 since we know the immediate generated for the TST is valid
7184 so we can avoid creating an intermediate rtx for it only
7185 for costing purposes. */
7186 if (speed)
7187 *cost += extra_cost->alu.logical;
7189 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7190 ZERO_EXTRACT, 0, speed);
7191 return true;
7194 if (GET_CODE (op1) == NEG)
7196 /* CMN. */
7197 if (speed)
7198 *cost += extra_cost->alu.arith;
7200 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7201 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7202 return true;
7205 /* CMP.
7207 Compare can freely swap the order of operands, and
7208 canonicalization puts the more complex operation first.
7209 But the integer MINUS logic expects the shift/extend
7210 operation in op1. */
7211 if (! (REG_P (op0)
7212 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7214 op0 = XEXP (x, 1);
7215 op1 = XEXP (x, 0);
7217 goto cost_minus;
7220 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7222 /* FCMP. */
7223 if (speed)
7224 *cost += extra_cost->fp[mode == DFmode].compare;
7226 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7228 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7229 /* FCMP supports constant 0.0 for no extra cost. */
7230 return true;
7232 return false;
7235 if (VECTOR_MODE_P (mode))
7237 /* Vector compare. */
7238 if (speed)
7239 *cost += extra_cost->vect.alu;
7241 if (aarch64_float_const_zero_rtx_p (op1))
7243 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7244 cost. */
7245 return true;
7247 return false;
7249 return false;
7251 case MINUS:
7253 op0 = XEXP (x, 0);
7254 op1 = XEXP (x, 1);
7256 cost_minus:
7257 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7259 /* Detect valid immediates. */
7260 if ((GET_MODE_CLASS (mode) == MODE_INT
7261 || (GET_MODE_CLASS (mode) == MODE_CC
7262 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7263 && CONST_INT_P (op1)
7264 && aarch64_uimm12_shift (INTVAL (op1)))
7266 if (speed)
7267 /* SUB(S) (immediate). */
7268 *cost += extra_cost->alu.arith;
7269 return true;
7272 /* Look for SUB (extended register). */
7273 if (is_a <scalar_int_mode> (mode, &int_mode)
7274 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7276 if (speed)
7277 *cost += extra_cost->alu.extend_arith;
7279 op1 = aarch64_strip_extend (op1, true);
7280 *cost += rtx_cost (op1, VOIDmode,
7281 (enum rtx_code) GET_CODE (op1), 0, speed);
7282 return true;
7285 rtx new_op1 = aarch64_strip_extend (op1, false);
7287 /* Cost this as an FMA-alike operation. */
7288 if ((GET_CODE (new_op1) == MULT
7289 || aarch64_shift_p (GET_CODE (new_op1)))
7290 && code != COMPARE)
7292 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7293 (enum rtx_code) code,
7294 speed);
7295 return true;
7298 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7300 if (speed)
7302 if (VECTOR_MODE_P (mode))
7304 /* Vector SUB. */
7305 *cost += extra_cost->vect.alu;
7307 else if (GET_MODE_CLASS (mode) == MODE_INT)
7309 /* SUB(S). */
7310 *cost += extra_cost->alu.arith;
7312 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7314 /* FSUB. */
7315 *cost += extra_cost->fp[mode == DFmode].addsub;
7318 return true;
7321 case PLUS:
7323 rtx new_op0;
7325 op0 = XEXP (x, 0);
7326 op1 = XEXP (x, 1);
7328 cost_plus:
7329 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7330 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7332 /* CSINC. */
7333 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7334 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7335 return true;
7338 if (GET_MODE_CLASS (mode) == MODE_INT
7339 && CONST_INT_P (op1)
7340 && aarch64_uimm12_shift (INTVAL (op1)))
7342 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7344 if (speed)
7345 /* ADD (immediate). */
7346 *cost += extra_cost->alu.arith;
7347 return true;
7350 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7352 /* Look for ADD (extended register). */
7353 if (is_a <scalar_int_mode> (mode, &int_mode)
7354 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7356 if (speed)
7357 *cost += extra_cost->alu.extend_arith;
7359 op0 = aarch64_strip_extend (op0, true);
7360 *cost += rtx_cost (op0, VOIDmode,
7361 (enum rtx_code) GET_CODE (op0), 0, speed);
7362 return true;
7365 /* Strip any extend, leave shifts behind as we will
7366 cost them through mult_cost. */
7367 new_op0 = aarch64_strip_extend (op0, false);
7369 if (GET_CODE (new_op0) == MULT
7370 || aarch64_shift_p (GET_CODE (new_op0)))
7372 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7373 speed);
7374 return true;
7377 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7379 if (speed)
7381 if (VECTOR_MODE_P (mode))
7383 /* Vector ADD. */
7384 *cost += extra_cost->vect.alu;
7386 else if (GET_MODE_CLASS (mode) == MODE_INT)
7388 /* ADD. */
7389 *cost += extra_cost->alu.arith;
7391 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7393 /* FADD. */
7394 *cost += extra_cost->fp[mode == DFmode].addsub;
7397 return true;
7400 case BSWAP:
7401 *cost = COSTS_N_INSNS (1);
7403 if (speed)
7405 if (VECTOR_MODE_P (mode))
7406 *cost += extra_cost->vect.alu;
7407 else
7408 *cost += extra_cost->alu.rev;
7410 return false;
7412 case IOR:
7413 if (aarch_rev16_p (x))
7415 *cost = COSTS_N_INSNS (1);
7417 if (speed)
7419 if (VECTOR_MODE_P (mode))
7420 *cost += extra_cost->vect.alu;
7421 else
7422 *cost += extra_cost->alu.rev;
7424 return true;
7427 if (aarch64_extr_rtx_p (x, &op0, &op1))
7429 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7430 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7431 if (speed)
7432 *cost += extra_cost->alu.shift;
7434 return true;
7436 /* Fall through. */
7437 case XOR:
7438 case AND:
7439 cost_logic:
7440 op0 = XEXP (x, 0);
7441 op1 = XEXP (x, 1);
7443 if (VECTOR_MODE_P (mode))
7445 if (speed)
7446 *cost += extra_cost->vect.alu;
7447 return true;
7450 if (code == AND
7451 && GET_CODE (op0) == MULT
7452 && CONST_INT_P (XEXP (op0, 1))
7453 && CONST_INT_P (op1)
7454 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7455 INTVAL (op1)) != 0)
7457 /* This is a UBFM/SBFM. */
7458 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7459 if (speed)
7460 *cost += extra_cost->alu.bfx;
7461 return true;
7464 if (is_int_mode (mode, &int_mode))
7466 if (CONST_INT_P (op1))
7468 /* We have a mask + shift version of a UBFIZ
7469 i.e. the *andim_ashift<mode>_bfiz pattern. */
7470 if (GET_CODE (op0) == ASHIFT
7471 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7472 XEXP (op0, 1)))
7474 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7475 (enum rtx_code) code, 0, speed);
7476 if (speed)
7477 *cost += extra_cost->alu.bfx;
7479 return true;
7481 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7483 /* We possibly get the immediate for free; this is not
7484 modelled. */
7485 *cost += rtx_cost (op0, int_mode,
7486 (enum rtx_code) code, 0, speed);
7487 if (speed)
7488 *cost += extra_cost->alu.logical;
7490 return true;
7493 else
7495 rtx new_op0 = op0;
7497 /* Handle ORN, EON, or BIC. */
7498 if (GET_CODE (op0) == NOT)
7499 op0 = XEXP (op0, 0);
7501 new_op0 = aarch64_strip_shift (op0);
7503 /* If we had a shift on op0 then this is a logical-shift-
7504 by-register/immediate operation. Otherwise, this is just
7505 a logical operation. */
7506 if (speed)
7508 if (new_op0 != op0)
7510 /* Shift by immediate. */
7511 if (CONST_INT_P (XEXP (op0, 1)))
7512 *cost += extra_cost->alu.log_shift;
7513 else
7514 *cost += extra_cost->alu.log_shift_reg;
7516 else
7517 *cost += extra_cost->alu.logical;
7520 /* In both cases we want to cost both operands. */
7521 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7522 0, speed);
7523 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7524 1, speed);
7526 return true;
7529 return false;
7531 case NOT:
7532 x = XEXP (x, 0);
7533 op0 = aarch64_strip_shift (x);
7535 if (VECTOR_MODE_P (mode))
7537 /* Vector NOT. */
7538 *cost += extra_cost->vect.alu;
7539 return false;
7542 /* MVN-shifted-reg. */
7543 if (op0 != x)
7545 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7547 if (speed)
7548 *cost += extra_cost->alu.log_shift;
7550 return true;
7552 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7553 Handle the second form here, taking care that 'a' in the above can
7554 be a shift. */
7555 else if (GET_CODE (op0) == XOR)
7557 rtx newop0 = XEXP (op0, 0);
7558 rtx newop1 = XEXP (op0, 1);
7559 rtx op0_stripped = aarch64_strip_shift (newop0);
7561 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7562 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7564 if (speed)
7566 if (op0_stripped != newop0)
7567 *cost += extra_cost->alu.log_shift;
7568 else
7569 *cost += extra_cost->alu.logical;
7572 return true;
7574 /* MVN. */
7575 if (speed)
7576 *cost += extra_cost->alu.logical;
7578 return false;
7580 case ZERO_EXTEND:
7582 op0 = XEXP (x, 0);
7583 /* If a value is written in SI mode, then zero extended to DI
7584 mode, the operation will in general be free as a write to
7585 a 'w' register implicitly zeroes the upper bits of an 'x'
7586 register. However, if this is
7588 (set (reg) (zero_extend (reg)))
7590 we must cost the explicit register move. */
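/* For example, (zero_extend:DI (plus:SI A B)) inside a SET costs only
   the 32-bit PLUS: the W-register form of the ADD already zeroes
   bits 32-63. */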
7591 if (mode == DImode
7592 && GET_MODE (op0) == SImode
7593 && outer == SET)
7595 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7597 /* If OP_COST is non-zero, then the cost of the zero extend
7598 is effectively the cost of the inner operation. Otherwise
7599 we have a MOV instruction and we take the cost from the MOV
7600 itself. This is true independently of whether we are
7601 optimizing for space or time. */
7602 if (op_cost)
7603 *cost = op_cost;
7605 return true;
7607 else if (MEM_P (op0))
7609 /* All loads can zero extend to any size for free. */
7610 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7611 return true;
7614 op0 = aarch64_extend_bitfield_pattern_p (x);
7615 if (op0)
7617 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7618 if (speed)
7619 *cost += extra_cost->alu.bfx;
7620 return true;
7623 if (speed)
7625 if (VECTOR_MODE_P (mode))
7627 /* UMOV. */
7628 *cost += extra_cost->vect.alu;
7630 else
7632 /* We generate an AND instead of UXTB/UXTH. */
7633 *cost += extra_cost->alu.logical;
7636 return false;
7638 case SIGN_EXTEND:
7639 if (MEM_P (XEXP (x, 0)))
7641 /* LDRSH. */
7642 if (speed)
7644 rtx address = XEXP (XEXP (x, 0), 0);
7645 *cost += extra_cost->ldst.load_sign_extend;
7647 *cost +=
7648 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7649 0, speed));
7651 return true;
7654 op0 = aarch64_extend_bitfield_pattern_p (x);
7655 if (op0)
7657 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7658 if (speed)
7659 *cost += extra_cost->alu.bfx;
7660 return true;
7663 if (speed)
7665 if (VECTOR_MODE_P (mode))
7666 *cost += extra_cost->vect.alu;
7667 else
7668 *cost += extra_cost->alu.extend;
7670 return false;
7672 case ASHIFT:
7673 op0 = XEXP (x, 0);
7674 op1 = XEXP (x, 1);
7676 if (CONST_INT_P (op1))
7678 if (speed)
7680 if (VECTOR_MODE_P (mode))
7682 /* Vector shift (immediate). */
7683 *cost += extra_cost->vect.alu;
7685 else
7687 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7688 aliases. */
7689 *cost += extra_cost->alu.shift;
7693 /* We can incorporate zero/sign extend for free. */
7694 if (GET_CODE (op0) == ZERO_EXTEND
7695 || GET_CODE (op0) == SIGN_EXTEND)
7696 op0 = XEXP (op0, 0);
7698 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7699 return true;
7701 else
7703 if (VECTOR_MODE_P (mode))
7705 if (speed)
7706 /* Vector shift (register). */
7707 *cost += extra_cost->vect.alu;
7709 else
7711 if (speed)
7712 /* LSLV. */
7713 *cost += extra_cost->alu.shift_reg;
7715 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7716 && CONST_INT_P (XEXP (op1, 1))
7717 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7719 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7720 /* We already demanded XEXP (op1, 0) to be REG_P, so
7721 don't recurse into it. */
7722 return true;
7725 return false; /* All arguments need to be in registers. */
7728 case ROTATE:
7729 case ROTATERT:
7730 case LSHIFTRT:
7731 case ASHIFTRT:
7732 op0 = XEXP (x, 0);
7733 op1 = XEXP (x, 1);
7735 if (CONST_INT_P (op1))
7737 /* ASR (immediate) and friends. */
7738 if (speed)
7740 if (VECTOR_MODE_P (mode))
7741 *cost += extra_cost->vect.alu;
7742 else
7743 *cost += extra_cost->alu.shift;
7746 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7747 return true;
7749 else
7751 if (VECTOR_MODE_P (mode))
7753 if (speed)
7754 /* Vector shift (register). */
7755 *cost += extra_cost->vect.alu;
7757 else
7759 if (speed)
7760 /* ASR (register) and friends. */
7761 *cost += extra_cost->alu.shift_reg;
7763 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7764 && CONST_INT_P (XEXP (op1, 1))
7765 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7767 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7768 /* We already demanded XEXP (op1, 0) to be REG_P, so
7769 don't recurse into it. */
7770 return true;
7773 return false; /* All arguments need to be in registers. */
7776 case SYMBOL_REF:
7778 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7779 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7781 /* LDR. */
7782 if (speed)
7783 *cost += extra_cost->ldst.load;
7785 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7786 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7788 /* ADRP, followed by ADD. */
7789 *cost += COSTS_N_INSNS (1);
7790 if (speed)
7791 *cost += 2 * extra_cost->alu.arith;
7793 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7794 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7796 /* ADR. */
7797 if (speed)
7798 *cost += extra_cost->alu.arith;
7801 if (flag_pic)
7803 /* One extra load instruction, after accessing the GOT. */
7804 *cost += COSTS_N_INSNS (1);
7805 if (speed)
7806 *cost += extra_cost->ldst.load;
7808 return true;
7810 case HIGH:
7811 case LO_SUM:
7812 /* ADRP/ADD (immediate). */
7813 if (speed)
7814 *cost += extra_cost->alu.arith;
7815 return true;
7817 case ZERO_EXTRACT:
7818 case SIGN_EXTRACT:
7819 /* UBFX/SBFX. */
7820 if (speed)
7822 if (VECTOR_MODE_P (mode))
7823 *cost += extra_cost->vect.alu;
7824 else
7825 *cost += extra_cost->alu.bfx;
7828 /* We can trust that the immediates used will be correct (there
7829 are no by-register forms), so we need only cost op0. */
7830 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7831 return true;
7833 case MULT:
7834 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7835 /* aarch64_rtx_mult_cost always handles recursion to its
7836 operands. */
7837 return true;
7839 case MOD:
7840 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7841 ANDs and a CSNEG. Assume here that CSNEG costs the same as an
7842 unconditional negate. This case should only ever be reached through
7843 the set_smod_pow2_cheap check in expmed.c. */
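/* As an illustration, the expansion of a signed X % 8 is roughly:
     negs  w1, w0
     and   w0, w0, 7
     and   w1, w1, 7
     csneg w0, w0, w1, mi
   with the register choices here being arbitrary. */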
7844 if (CONST_INT_P (XEXP (x, 1))
7845 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7846 && (mode == SImode || mode == DImode))
7848 /* We expand to 4 instructions. Reset the baseline. */
7849 *cost = COSTS_N_INSNS (4);
7851 if (speed)
7852 *cost += 2 * extra_cost->alu.logical
7853 + 2 * extra_cost->alu.arith;
7855 return true;
7858 /* Fall-through. */
7859 case UMOD:
7860 if (speed)
7862 /* Slightly prefer UMOD over SMOD. */
7863 if (VECTOR_MODE_P (mode))
7864 *cost += extra_cost->vect.alu;
7865 else if (GET_MODE_CLASS (mode) == MODE_INT)
7866 *cost += (extra_cost->mult[mode == DImode].add
7867 + extra_cost->mult[mode == DImode].idiv
7868 + (code == MOD ? 1 : 0));
7870 return false; /* All arguments need to be in registers. */
7872 case DIV:
7873 case UDIV:
7874 case SQRT:
7875 if (speed)
7877 if (VECTOR_MODE_P (mode))
7878 *cost += extra_cost->vect.alu;
7879 else if (GET_MODE_CLASS (mode) == MODE_INT)
7880 /* There is no integer SQRT, so only DIV and UDIV can get
7881 here. */
7882 *cost += (extra_cost->mult[mode == DImode].idiv
7883 /* Slightly prefer UDIV over SDIV. */
7884 + (code == DIV ? 1 : 0));
7885 else
7886 *cost += extra_cost->fp[mode == DFmode].div;
7888 return false; /* All arguments need to be in registers. */
7890 case IF_THEN_ELSE:
7891 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7892 XEXP (x, 2), cost, speed);
7894 case EQ:
7895 case NE:
7896 case GT:
7897 case GTU:
7898 case LT:
7899 case LTU:
7900 case GE:
7901 case GEU:
7902 case LE:
7903 case LEU:
7905 return false; /* All arguments must be in registers. */
7907 case FMA:
7908 op0 = XEXP (x, 0);
7909 op1 = XEXP (x, 1);
7910 op2 = XEXP (x, 2);
7912 if (speed)
7914 if (VECTOR_MODE_P (mode))
7915 *cost += extra_cost->vect.alu;
7916 else
7917 *cost += extra_cost->fp[mode == DFmode].fma;
7920 /* FMSUB, FNMADD, and FNMSUB are free. */
7921 if (GET_CODE (op0) == NEG)
7922 op0 = XEXP (op0, 0);
7924 if (GET_CODE (op2) == NEG)
7925 op2 = XEXP (op2, 0);
7927 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7928 and the by-element operand as operand 0. */
7929 if (GET_CODE (op1) == NEG)
7930 op1 = XEXP (op1, 0);
7932 /* Catch vector-by-element operations. The by-element operand can
7933 either be (vec_duplicate (vec_select (x))) or just
7934 (vec_select (x)), depending on whether we are multiplying by
7935 a vector or a scalar.
7937 Canonicalization is not very good in these cases: FMA4 will put the
7938 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7939 if (GET_CODE (op0) == VEC_DUPLICATE)
7940 op0 = XEXP (op0, 0);
7941 else if (GET_CODE (op1) == VEC_DUPLICATE)
7942 op1 = XEXP (op1, 0);
7944 if (GET_CODE (op0) == VEC_SELECT)
7945 op0 = XEXP (op0, 0);
7946 else if (GET_CODE (op1) == VEC_SELECT)
7947 op1 = XEXP (op1, 0);
7949 /* If the remaining parameters are not registers,
7950 get the cost to put them into registers. */
7951 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7952 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7953 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7954 return true;
7956 case FLOAT:
7957 case UNSIGNED_FLOAT:
7958 if (speed)
7959 *cost += extra_cost->fp[mode == DFmode].fromint;
7960 return false;
7962 case FLOAT_EXTEND:
7963 if (speed)
7965 if (VECTOR_MODE_P (mode))
7967 /* Vector widen. */
7968 *cost += extra_cost->vect.alu;
7970 else
7971 *cost += extra_cost->fp[mode == DFmode].widen;
7973 return false;
7975 case FLOAT_TRUNCATE:
7976 if (speed)
7978 if (VECTOR_MODE_P (mode))
7980 /* Vector conversion. */
7981 *cost += extra_cost->vect.alu;
7983 else
7984 *cost += extra_cost->fp[mode == DFmode].narrow;
7986 return false;
7988 case FIX:
7989 case UNSIGNED_FIX:
7990 x = XEXP (x, 0);
7991 /* Strip the rounding part. They will all be implemented
7992 by the fcvt* family of instructions anyway. */
7993 if (GET_CODE (x) == UNSPEC)
7995 unsigned int uns_code = XINT (x, 1);
7997 if (uns_code == UNSPEC_FRINTA
7998 || uns_code == UNSPEC_FRINTM
7999 || uns_code == UNSPEC_FRINTN
8000 || uns_code == UNSPEC_FRINTP
8001 || uns_code == UNSPEC_FRINTZ)
8002 x = XVECEXP (x, 0, 0);
8005 if (speed)
8007 if (VECTOR_MODE_P (mode))
8008 *cost += extra_cost->vect.alu;
8009 else
8010 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8013 /* We can combine fmul by a power of 2 followed by a fcvt into a single
8014 fixed-point fcvt. */
8015 if (GET_CODE (x) == MULT
8016 && ((VECTOR_MODE_P (mode)
8017 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8018 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8020 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8021 0, speed);
8022 return true;
8025 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8026 return true;
8028 case ABS:
8029 if (VECTOR_MODE_P (mode))
8031 /* ABS (vector). */
8032 if (speed)
8033 *cost += extra_cost->vect.alu;
8035 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8037 op0 = XEXP (x, 0);
8039 /* FABD, which is analogous to FADD. */
8040 if (GET_CODE (op0) == MINUS)
8042 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8043 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8044 if (speed)
8045 *cost += extra_cost->fp[mode == DFmode].addsub;
8047 return true;
8049 /* Simple FABS is analogous to FNEG. */
8050 if (speed)
8051 *cost += extra_cost->fp[mode == DFmode].neg;
8053 else
8055 /* Integer ABS will either be split into
8056 two arithmetic instructions, or will be an ABS
8057 (scalar), which we don't model. */
8058 *cost = COSTS_N_INSNS (2);
8059 if (speed)
8060 *cost += 2 * extra_cost->alu.arith;
8062 return false;
8064 case SMAX:
8065 case SMIN:
8066 if (speed)
8068 if (VECTOR_MODE_P (mode))
8069 *cost += extra_cost->vect.alu;
8070 else
8072 /* FMAXNM/FMINNM/FMAX/FMIN.
8073 TODO: This may not be accurate for all implementations, but
8074 we do not model this in the cost tables. */
8075 *cost += extra_cost->fp[mode == DFmode].addsub;
8078 return false;
8080 case UNSPEC:
8081 /* The floating point round to integer frint* instructions. */
8082 if (aarch64_frint_unspec_p (XINT (x, 1)))
8084 if (speed)
8085 *cost += extra_cost->fp[mode == DFmode].roundint;
8087 return false;
8090 if (XINT (x, 1) == UNSPEC_RBIT)
8092 if (speed)
8093 *cost += extra_cost->alu.rev;
8095 return false;
8097 break;
8099 case TRUNCATE:
8101 /* Decompose <su>muldi3_highpart. */
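/* That is, match the combined pattern
(truncate:DI
(lshiftrt:TI
(mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
(const_int 64)))
which the piecewise comments below spell out operand by operand. */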
8102 if (/* (truncate:DI */
8103 mode == DImode
8104 /* (lshiftrt:TI */
8105 && GET_MODE (XEXP (x, 0)) == TImode
8106 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8107 /* (mult:TI */
8108 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8109 /* (ANY_EXTEND:TI (reg:DI))
8110 (ANY_EXTEND:TI (reg:DI))) */
8111 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8112 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8113 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8114 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8115 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8116 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8117 /* (const_int 64) */
8118 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8119 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8121 /* UMULH/SMULH. */
8122 if (speed)
8123 *cost += extra_cost->mult[mode == DImode].extend;
8124 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8125 mode, MULT, 0, speed);
8126 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8127 mode, MULT, 1, speed);
8128 return true;
8131 /* Fall through. */
8132 default:
8133 break;
8136 if (dump_file
8137 && flag_aarch64_verbose_cost)
8138 fprintf (dump_file,
8139 "\nFailed to cost RTX. Assuming default cost.\n");
8141 return true;
8144 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8145 calculated for X. This cost is stored in *COST. Returns true
8146 if the total cost of X was calculated. */
8147 static bool
8148 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8149 int param, int *cost, bool speed)
8151 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8153 if (dump_file
8154 && flag_aarch64_verbose_cost)
8156 print_rtl_single (dump_file, x);
8157 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8158 speed ? "Hot" : "Cold",
8159 *cost, result ? "final" : "partial");
8162 return result;
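/* Implement TARGET_REGISTER_MOVE_COST: return the cost of moving a value
of mode MODE from a register of class FROM_I to one of class TO_I. */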
8165 static int
8166 aarch64_register_move_cost (machine_mode mode,
8167 reg_class_t from_i, reg_class_t to_i)
8169 enum reg_class from = (enum reg_class) from_i;
8170 enum reg_class to = (enum reg_class) to_i;
8171 const struct cpu_regmove_cost *regmove_cost
8172 = aarch64_tune_params.regmove_cost;
8174 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8175 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8176 to = GENERAL_REGS;
8178 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8179 from = GENERAL_REGS;
8181 /* Moving between GPR and stack cost is the same as GP2GP. */
8182 if ((from == GENERAL_REGS && to == STACK_REG)
8183 || (to == GENERAL_REGS && from == STACK_REG))
8184 return regmove_cost->GP2GP;
8186 /* To/From the stack register, we move via the gprs. */
8187 if (to == STACK_REG || from == STACK_REG)
8188 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8189 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8191 if (GET_MODE_SIZE (mode) == 16)
8193 /* 128-bit operations on general registers require 2 instructions. */
8194 if (from == GENERAL_REGS && to == GENERAL_REGS)
8195 return regmove_cost->GP2GP * 2;
8196 else if (from == GENERAL_REGS)
8197 return regmove_cost->GP2FP * 2;
8198 else if (to == GENERAL_REGS)
8199 return regmove_cost->FP2GP * 2;
8201 /* When AdvSIMD instructions are disabled it is not possible to move
8202 a 128-bit value directly between Q registers. This is handled in
8203 secondary reload. A general register is used as a scratch to move
8204 the upper DI value and the lower DI value is moved directly,
8205 hence the cost is the sum of three moves. */
8206 if (! TARGET_SIMD)
8207 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8209 return regmove_cost->FP2FP;
8212 if (from == GENERAL_REGS && to == GENERAL_REGS)
8213 return regmove_cost->GP2GP;
8214 else if (from == GENERAL_REGS)
8215 return regmove_cost->GP2FP;
8216 else if (to == GENERAL_REGS)
8217 return regmove_cost->FP2GP;
8219 return regmove_cost->FP2FP;
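/* Implement TARGET_MEMORY_MOVE_COST. */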
8222 static int
8223 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8224 reg_class_t rclass ATTRIBUTE_UNUSED,
8225 bool in ATTRIBUTE_UNUSED)
8227 return aarch64_tune_params.memmov_cost;
8230 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8231 to optimize 1.0/sqrt. */
8233 static bool
8234 use_rsqrt_p (machine_mode mode)
8236 return (!flag_trapping_math
8237 && flag_unsafe_math_optimizations
8238 && ((aarch64_tune_params.approx_modes->recip_sqrt
8239 & AARCH64_APPROX_MODE (mode))
8240 || flag_mrecip_low_precision_sqrt));
8243 /* Function to decide when to use the approximate reciprocal square root
8244 builtin. */
8246 static tree
8247 aarch64_builtin_reciprocal (tree fndecl)
8249 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8251 if (!use_rsqrt_p (mode))
8252 return NULL_TREE;
8253 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8256 typedef rtx (*rsqrte_type) (rtx, rtx);
8258 /* Select reciprocal square root initial estimate insn depending on machine
8259 mode. */
8261 static rsqrte_type
8262 get_rsqrte_type (machine_mode mode)
8264 switch (mode)
8266 case E_DFmode: return gen_aarch64_rsqrtedf;
8267 case E_SFmode: return gen_aarch64_rsqrtesf;
8268 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8269 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8270 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8271 default: gcc_unreachable ();
8275 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8277 /* Select reciprocal square root series step insn depending on machine mode. */
8279 static rsqrts_type
8280 get_rsqrts_type (machine_mode mode)
8282 switch (mode)
8284 case E_DFmode: return gen_aarch64_rsqrtsdf;
8285 case E_SFmode: return gen_aarch64_rsqrtssf;
8286 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8287 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8288 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8289 default: gcc_unreachable ();
8293 /* Emit instruction sequence to compute either the approximate square root
8294 or its approximate reciprocal, depending on the flag RECP, and return
8295 whether the sequence was emitted or not. */
8297 bool
8298 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8300 machine_mode mode = GET_MODE (dst);
8302 if (GET_MODE_INNER (mode) == HFmode)
8304 gcc_assert (!recp);
8305 return false;
8308 if (!recp)
8310 if (!(flag_mlow_precision_sqrt
8311 || (aarch64_tune_params.approx_modes->sqrt
8312 & AARCH64_APPROX_MODE (mode))))
8313 return false;
8315 if (flag_finite_math_only
8316 || flag_trapping_math
8317 || !flag_unsafe_math_optimizations
8318 || optimize_function_for_size_p (cfun))
8319 return false;
8321 else
8322 /* Caller assumes we cannot fail. */
8323 gcc_assert (use_rsqrt_p (mode));
8325 machine_mode mmsk = mode_for_int_vector (mode).require ();
8326 rtx xmsk = gen_reg_rtx (mmsk);
8327 if (!recp)
8328 /* When calculating the approximate square root, compare the
8329 argument with 0.0 and create a mask. */
8330 emit_insn (gen_rtx_SET (xmsk,
8331 gen_rtx_NEG (mmsk,
8332 gen_rtx_EQ (mmsk, src,
8333 CONST0_RTX (mode)))));
8335 /* Estimate the approximate reciprocal square root. */
8336 rtx xdst = gen_reg_rtx (mode);
8337 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8339 /* Iterate over the series twice for SF and thrice for DF. */
8340 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8342 /* Optionally iterate over the series once less for faster performance,
8343 at the cost of some accuracy. */
8344 if ((recp && flag_mrecip_low_precision_sqrt)
8345 || (!recp && flag_mlow_precision_sqrt))
8346 iterations--;
8348 /* Iterate over the series to calculate the approximate reciprocal square
8349 root. */
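/* Each FRSQRTS step yields (3 - a * b) / 2, so X1 = FRSQRTS (SRC, XDST * XDST)
and the multiply XDST * X1 performs one Newton-Raphson refinement
towards 1/sqrt (SRC). */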
8350 rtx x1 = gen_reg_rtx (mode);
8351 while (iterations--)
8353 rtx x2 = gen_reg_rtx (mode);
8354 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8356 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8358 if (iterations > 0)
8359 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8362 if (!recp)
8364 /* Qualify the approximate reciprocal square root when the argument is
8365 0.0 by squashing the intermediate result to 0.0. */
8366 rtx xtmp = gen_reg_rtx (mmsk);
8367 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8368 gen_rtx_SUBREG (mmsk, xdst, 0)));
8369 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8371 /* Calculate the approximate square root. */
8372 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8375 /* Finalize the approximation. */
8376 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8378 return true;
8381 typedef rtx (*recpe_type) (rtx, rtx);
8383 /* Select reciprocal initial estimate insn depending on machine mode. */
8385 static recpe_type
8386 get_recpe_type (machine_mode mode)
8388 switch (mode)
8390 case E_SFmode: return (gen_aarch64_frecpesf);
8391 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8392 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8393 case E_DFmode: return (gen_aarch64_frecpedf);
8394 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8395 default: gcc_unreachable ();
8399 typedef rtx (*recps_type) (rtx, rtx, rtx);
8401 /* Select reciprocal series step insn depending on machine mode. */
8403 static recps_type
8404 get_recps_type (machine_mode mode)
8406 switch (mode)
8408 case E_SFmode: return (gen_aarch64_frecpssf);
8409 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8410 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8411 case E_DFmode: return (gen_aarch64_frecpsdf);
8412 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8413 default: gcc_unreachable ();
8417 /* Emit the instruction sequence to compute the approximation for the division
8418 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8420 bool
8421 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8423 machine_mode mode = GET_MODE (quo);
8425 if (GET_MODE_INNER (mode) == HFmode)
8426 return false;
8428 bool use_approx_division_p = (flag_mlow_precision_div
8429 || (aarch64_tune_params.approx_modes->division
8430 & AARCH64_APPROX_MODE (mode)));
8432 if (!flag_finite_math_only
8433 || flag_trapping_math
8434 || !flag_unsafe_math_optimizations
8435 || optimize_function_for_size_p (cfun)
8436 || !use_approx_division_p)
8437 return false;
8439 /* Estimate the approximate reciprocal. */
8440 rtx xrcp = gen_reg_rtx (mode);
8441 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8443 /* Iterate over the series twice for SF and thrice for DF. */
8444 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8446 /* Optionally iterate over the series once less for faster performance,
8447 at the cost of some accuracy. */
8448 if (flag_mlow_precision_div)
8449 iterations--;
8451 /* Iterate over the series to calculate the approximate reciprocal. */
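/* Each FRECPS step yields (2 - a * b), so XTMP = FRECPS (XRCP, DEN) and
the multiply XRCP * XTMP performs one Newton-Raphson refinement
towards 1/DEN. */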
8452 rtx xtmp = gen_reg_rtx (mode);
8453 while (iterations--)
8455 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8457 if (iterations > 0)
8458 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8461 if (num != CONST1_RTX (mode))
8463 /* As the approximate reciprocal of DEN is already calculated, only
8464 calculate the approximate division when NUM is not 1.0. */
8465 rtx xnum = force_reg (mode, num);
8466 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8469 /* Finalize the approximation. */
8470 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8471 return true;
8474 /* Return the number of instructions that can be issued per cycle. */
8475 static int
8476 aarch64_sched_issue_rate (void)
8478 return aarch64_tune_params.issue_rate;
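/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD. */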
8481 static int
8482 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8484 int issue_rate = aarch64_sched_issue_rate ();
8486 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8490 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8491 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8492 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8494 static int
8495 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8496 int ready_index)
8498 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8502 /* Vectorizer cost model target hooks. */
8504 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8505 static int
8506 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8507 tree vectype,
8508 int misalign ATTRIBUTE_UNUSED)
8510 unsigned elements;
8511 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8512 bool fp = false;
8514 if (vectype != NULL)
8515 fp = FLOAT_TYPE_P (vectype);
8517 switch (type_of_cost)
8519 case scalar_stmt:
8520 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8522 case scalar_load:
8523 return costs->scalar_load_cost;
8525 case scalar_store:
8526 return costs->scalar_store_cost;
8528 case vector_stmt:
8529 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8531 case vector_load:
8532 return costs->vec_align_load_cost;
8534 case vector_store:
8535 return costs->vec_store_cost;
8537 case vec_to_scalar:
8538 return costs->vec_to_scalar_cost;
8540 case scalar_to_vec:
8541 return costs->scalar_to_vec_cost;
8543 case unaligned_load:
8544 return costs->vec_unalign_load_cost;
8546 case unaligned_store:
8547 return costs->vec_unalign_store_cost;
8549 case cond_branch_taken:
8550 return costs->cond_taken_branch_cost;
8552 case cond_branch_not_taken:
8553 return costs->cond_not_taken_branch_cost;
8555 case vec_perm:
8556 return costs->vec_permute_cost;
8558 case vec_promote_demote:
8559 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8561 case vec_construct:
8562 elements = TYPE_VECTOR_SUBPARTS (vectype);
8563 return elements / 2 + 1;
8565 default:
8566 gcc_unreachable ();
8570 /* Implement targetm.vectorize.add_stmt_cost. */
8571 static unsigned
8572 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8573 struct _stmt_vec_info *stmt_info, int misalign,
8574 enum vect_cost_model_location where)
8576 unsigned *cost = (unsigned *) data;
8577 unsigned retval = 0;
8579 if (flag_vect_cost_model)
8581 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8582 int stmt_cost =
8583 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8585 /* Statements in an inner loop relative to the loop being
8586 vectorized are weighted more heavily. The value here is
8587 arbitrary and could potentially be improved with analysis. */
8588 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8589 count *= 50; /* FIXME */
8591 retval = (unsigned) (count * stmt_cost);
8592 cost[where] += retval;
8595 return retval;
8598 static void initialize_aarch64_code_model (struct gcc_options *);
8600 /* Parse the TO_PARSE string and put the architecture struct that it
8601 selects into RES and the architectural features into ISA_FLAGS.
8602 Return an aarch64_parse_opt_result describing the parse result.
8603 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8605 static enum aarch64_parse_opt_result
8606 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8607 unsigned long *isa_flags)
8609 char *ext;
8610 const struct processor *arch;
8611 char *str = (char *) alloca (strlen (to_parse) + 1);
8612 size_t len;
8614 strcpy (str, to_parse);
8616 ext = strchr (str, '+');
8618 if (ext != NULL)
8619 len = ext - str;
8620 else
8621 len = strlen (str);
8623 if (len == 0)
8624 return AARCH64_PARSE_MISSING_ARG;
8627 /* Loop through the list of supported ARCHes to find a match. */
8628 for (arch = all_architectures; arch->name != NULL; arch++)
8630 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8632 unsigned long isa_temp = arch->flags;
8634 if (ext != NULL)
8636 /* TO_PARSE string contains at least one extension. */
8637 enum aarch64_parse_opt_result ext_res
8638 = aarch64_parse_extension (ext, &isa_temp);
8640 if (ext_res != AARCH64_PARSE_OK)
8641 return ext_res;
8643 /* Extension parsing was successful. Confirm the result
8644 arch and ISA flags. */
8645 *res = arch;
8646 *isa_flags = isa_temp;
8647 return AARCH64_PARSE_OK;
8651 /* ARCH name not found in list. */
8652 return AARCH64_PARSE_INVALID_ARG;
8655 /* Parse the TO_PARSE string and put the cpu that it selects into RES and
8656 its ISA flags into ISA_FLAGS. Return an aarch64_parse_opt_result
8657 describing the parse result. If there is an error parsing, RES and
8658 ISA_FLAGS are left unchanged. */
8660 static enum aarch64_parse_opt_result
8661 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8662 unsigned long *isa_flags)
8664 char *ext;
8665 const struct processor *cpu;
8666 char *str = (char *) alloca (strlen (to_parse) + 1);
8667 size_t len;
8669 strcpy (str, to_parse);
8671 ext = strchr (str, '+');
8673 if (ext != NULL)
8674 len = ext - str;
8675 else
8676 len = strlen (str);
8678 if (len == 0)
8679 return AARCH64_PARSE_MISSING_ARG;
8682 /* Loop through the list of supported CPUs to find a match. */
8683 for (cpu = all_cores; cpu->name != NULL; cpu++)
8685 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8687 unsigned long isa_temp = cpu->flags;
8690 if (ext != NULL)
8692 /* TO_PARSE string contains at least one extension. */
8693 enum aarch64_parse_opt_result ext_res
8694 = aarch64_parse_extension (ext, &isa_temp);
8696 if (ext_res != AARCH64_PARSE_OK)
8697 return ext_res;
8699 /* Extension parsing was successful. Confirm the result
8700 cpu and ISA flags. */
8701 *res = cpu;
8702 *isa_flags = isa_temp;
8703 return AARCH64_PARSE_OK;
8707 /* CPU name not found in list. */
8708 return AARCH64_PARSE_INVALID_ARG;
8711 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8712 Return an aarch64_parse_opt_result describing the parse result.
8713 If the parsing fails, RES does not change. */
8715 static enum aarch64_parse_opt_result
8716 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8718 const struct processor *cpu;
8719 char *str = (char *) alloca (strlen (to_parse) + 1);
8721 strcpy (str, to_parse);
8723 /* Loop through the list of supported CPUs to find a match. */
8724 for (cpu = all_cores; cpu->name != NULL; cpu++)
8726 if (strcmp (cpu->name, str) == 0)
8728 *res = cpu;
8729 return AARCH64_PARSE_OK;
8733 /* CPU name not found in list. */
8734 return AARCH64_PARSE_INVALID_ARG;
8737 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8738 described in FLAG. If it is, return the index bit for that fusion type.
8739 If not, error (printing OPTION_NAME) and return zero. */
8741 static unsigned int
8742 aarch64_parse_one_option_token (const char *token,
8743 size_t length,
8744 const struct aarch64_flag_desc *flag,
8745 const char *option_name)
8747 for (; flag->name != NULL; flag++)
8749 if (length == strlen (flag->name)
8750 && !strncmp (flag->name, token, length))
8751 return flag->flag;
8754 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8755 return 0;
8758 /* Parse OPTION which is a comma-separated list of flags to enable.
8759 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8760 default state we inherit from the CPU tuning structures. OPTION_NAME
8761 gives the top-level option we are parsing in the -moverride string,
8762 for use in error messages. */
8764 static unsigned int
8765 aarch64_parse_boolean_options (const char *option,
8766 const struct aarch64_flag_desc *flags,
8767 unsigned int initial_state,
8768 const char *option_name)
8770 const char separator = '.';
8771 const char* specs = option;
8772 const char* ntoken = option;
8773 unsigned int found_flags = initial_state;
8775 while ((ntoken = strchr (specs, separator)))
8777 size_t token_length = ntoken - specs;
8778 unsigned token_ops = aarch64_parse_one_option_token (specs,
8779 token_length,
8780 flags,
8781 option_name);
8782 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8783 in the token stream, reset the supported operations. So:
8785 adrp+add.cmp+branch.none.adrp+add
8787 would have the result of turning on only adrp+add fusion. */
8788 if (!token_ops)
8789 found_flags = 0;
8791 found_flags |= token_ops;
8792 specs = ++ntoken;
8795 /* The string was empty or ended with a trailing separator; complain. */
8796 if (!(*specs))
8798 error ("%s string ill-formed\n", option_name);
8799 return 0;
8802 /* We still have one more token to parse. */
8803 size_t token_length = strlen (specs);
8804 unsigned token_ops = aarch64_parse_one_option_token (specs,
8805 token_length,
8806 flags,
8807 option_name);
8808 if (!token_ops)
8809 found_flags = 0;
8811 found_flags |= token_ops;
8812 return found_flags;
8815 /* Support for overriding instruction fusion. */
8817 static void
8818 aarch64_parse_fuse_string (const char *fuse_string,
8819 struct tune_params *tune)
8821 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8822 aarch64_fusible_pairs,
8823 tune->fusible_ops,
8824 "fuse=");
8827 /* Support for overriding other tuning flags. */
8829 static void
8830 aarch64_parse_tune_string (const char *tune_string,
8831 struct tune_params *tune)
8833 tune->extra_tuning_flags
8834 = aarch64_parse_boolean_options (tune_string,
8835 aarch64_tuning_flags,
8836 tune->extra_tuning_flags,
8837 "tune=");
8840 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8841 we understand. If it is, extract the option string and handoff to
8842 the appropriate function. */
8844 void
8845 aarch64_parse_one_override_token (const char* token,
8846 size_t length,
8847 struct tune_params *tune)
8849 const struct aarch64_tuning_override_function *fn
8850 = aarch64_tuning_override_functions;
8852 const char *option_part = strchr (token, '=');
8853 if (!option_part)
8855 error ("tuning string missing in option (%s)", token);
8856 return;
8859 /* Get the length of the option name. */
8860 length = option_part - token;
8861 /* Skip the '=' to get to the option string. */
8862 option_part++;
8864 for (; fn->name != NULL; fn++)
8866 if (!strncmp (fn->name, token, length))
8868 fn->parse_override (option_part, tune);
8869 return;
8873 error ("unknown tuning option (%s)", token);
8874 return;
8877 /* Set the default TLS size and clamp it to the limit imposed by the
selected code model. */
8879 static void
8880 initialize_aarch64_tls_size (struct gcc_options *opts)
8882 if (aarch64_tls_size == 0)
8883 aarch64_tls_size = 24;
8885 switch (opts->x_aarch64_cmodel_var)
8887 case AARCH64_CMODEL_TINY:
8888 /* Both the default and maximum TLS sizes allowed under tiny are 1M, which
8889 needs two instructions to address, so we clamp the size to 24. */
8890 if (aarch64_tls_size > 24)
8891 aarch64_tls_size = 24;
8892 break;
8893 case AARCH64_CMODEL_SMALL:
8894 /* The maximum TLS size allowed under small is 4G. */
8895 if (aarch64_tls_size > 32)
8896 aarch64_tls_size = 32;
8897 break;
8898 case AARCH64_CMODEL_LARGE:
8899 /* The maximum TLS size allowed under large is 16E.
8900 FIXME: 16E should be 64 bits; we only support a 48-bit offset now. */
8901 if (aarch64_tls_size > 48)
8902 aarch64_tls_size = 48;
8903 break;
8904 default:
8905 gcc_unreachable ();
8908 return;
8911 /* Parse STRING looking for options in the format:
8912 string :: option:string
8913 option :: name=substring
8914 name :: {a-z}
8915 substring :: defined by option. */
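/* For example, "fuse=adrp+add.cmp+branch" enables the adrp+add and
cmp+branch fusion pairs: options are separated by ':' and the boolean
flags within an option by '.'. */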
8917 static void
8918 aarch64_parse_override_string (const char* input_string,
8919 struct tune_params* tune)
8921 const char separator = ':';
8922 size_t string_length = strlen (input_string) + 1;
8923 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8924 char *string = string_root;
8925 strncpy (string, input_string, string_length);
8926 string[string_length - 1] = '\0';
8928 char* ntoken = string;
8930 while ((ntoken = strchr (string, separator)))
8932 size_t token_length = ntoken - string;
8933 /* Make this substring look like a string. */
8934 *ntoken = '\0';
8935 aarch64_parse_one_override_token (string, token_length, tune);
8936 string = ++ntoken;
8939 /* One last option to parse. */
8940 aarch64_parse_one_override_token (string, strlen (string), tune);
8941 free (string_root);
8945 static void
8946 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8948 /* The logic here is that if we are disabling all frame pointer generation
8949 then we do not need to disable leaf frame pointer generation as a
8950 separate operation. But if we are *only* disabling leaf frame pointer
8951 generation then we set flag_omit_frame_pointer to true, but in
8952 aarch64_frame_pointer_required we return false only for leaf functions.
8954 PR 70044: We have to be careful about being called multiple times for the
8955 same function. Once we have decided to set flag_omit_frame_pointer just
8956 so that we can omit leaf frame pointers, we must then not interpret a
8957 second call as meaning that all frame pointer generation should be
8958 omitted. We do this by setting flag_omit_frame_pointer to a special,
8959 non-zero value. */
8960 if (opts->x_flag_omit_frame_pointer == 2)
8961 opts->x_flag_omit_frame_pointer = 0;
8963 if (opts->x_flag_omit_frame_pointer)
8964 opts->x_flag_omit_leaf_frame_pointer = false;
8965 else if (opts->x_flag_omit_leaf_frame_pointer)
8966 opts->x_flag_omit_frame_pointer = 2;
8968 /* If not optimizing for size, set the default
8969 alignment to what the target wants. */
8970 if (!opts->x_optimize_size)
8972 if (opts->x_align_loops <= 0)
8973 opts->x_align_loops = aarch64_tune_params.loop_align;
8974 if (opts->x_align_jumps <= 0)
8975 opts->x_align_jumps = aarch64_tune_params.jump_align;
8976 if (opts->x_align_functions <= 0)
8977 opts->x_align_functions = aarch64_tune_params.function_align;
8980 /* We default to no pc-relative literal loads. */
8982 aarch64_pcrelative_literal_loads = false;
8984 /* If -mpc-relative-literal-loads is set on the command line, this
8985 implies that the user asked for PC relative literal loads. */
8986 if (opts->x_pcrelative_literal_loads == 1)
8987 aarch64_pcrelative_literal_loads = true;
8989 /* This is PR70113. When building the Linux kernel with
8990 CONFIG_ARM64_ERRATUM_843419, support for relocations
8991 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8992 removed from the kernel to avoid loading objects with possibly
8993 offending sequences. Without -mpc-relative-literal-loads we would
8994 generate such relocations, preventing the kernel build from
8995 succeeding. */
8996 if (opts->x_pcrelative_literal_loads == 2
8997 && TARGET_FIX_ERR_A53_843419)
8998 aarch64_pcrelative_literal_loads = true;
9000 /* In the tiny memory model it makes no sense to disallow PC relative
9001 literal pool loads. */
9002 if (aarch64_cmodel == AARCH64_CMODEL_TINY
9003 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9004 aarch64_pcrelative_literal_loads = true;
9006 /* When enabling the lower precision Newton series for the square root, also
9007 enable it for the reciprocal square root, since the latter is an
9008 intermediary step for the former. */
9009 if (flag_mlow_precision_sqrt)
9010 flag_mrecip_low_precision_sqrt = true;
9013 /* 'Unpack' up the internal tuning structs and update the options
9014 in OPTS. The caller must have set up selected_tune and selected_arch
9015 as all the other target-specific codegen decisions are
9016 derived from them. */
9018 void
9019 aarch64_override_options_internal (struct gcc_options *opts)
9021 aarch64_tune_flags = selected_tune->flags;
9022 aarch64_tune = selected_tune->sched_core;
9023 /* Make a copy of the tuning parameters attached to the core, which
9024 we may later overwrite. */
9025 aarch64_tune_params = *(selected_tune->tune);
9026 aarch64_architecture_version = selected_arch->architecture_version;
9028 if (opts->x_aarch64_override_tune_string)
9029 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9030 &aarch64_tune_params);
9032 /* This target defaults to strict volatile bitfields. */
9033 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9034 opts->x_flag_strict_volatile_bitfields = 1;
9036 initialize_aarch64_code_model (opts);
9037 initialize_aarch64_tls_size (opts);
9039 int queue_depth = 0;
9040 switch (aarch64_tune_params.autoprefetcher_model)
9042 case tune_params::AUTOPREFETCHER_OFF:
9043 queue_depth = -1;
9044 break;
9045 case tune_params::AUTOPREFETCHER_WEAK:
9046 queue_depth = 0;
9047 break;
9048 case tune_params::AUTOPREFETCHER_STRONG:
9049 queue_depth = max_insn_queue_index + 1;
9050 break;
9051 default:
9052 gcc_unreachable ();
9055 /* We don't mind passing in global_options_set here as we don't use
9056 the *options_set structs anyway. */
9057 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9058 queue_depth,
9059 opts->x_param_values,
9060 global_options_set.x_param_values);
9062 /* Set up parameters to be used in prefetching algorithm. Do not
9063 override the defaults unless we are tuning for a core we have
9064 researched values for. */
9065 if (aarch64_tune_params.prefetch->num_slots > 0)
9066 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9067 aarch64_tune_params.prefetch->num_slots,
9068 opts->x_param_values,
9069 global_options_set.x_param_values);
9070 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9071 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9072 aarch64_tune_params.prefetch->l1_cache_size,
9073 opts->x_param_values,
9074 global_options_set.x_param_values);
9075 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9076 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9077 aarch64_tune_params.prefetch->l1_cache_line_size,
9078 opts->x_param_values,
9079 global_options_set.x_param_values);
9080 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9081 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9082 aarch64_tune_params.prefetch->l2_cache_size,
9083 opts->x_param_values,
9084 global_options_set.x_param_values);
9086 /* Enable software prefetching at the specified optimization level for
9087 CPUs that have prefetch. Lower the optimization level threshold by 1
9088 when profiling is enabled. */
9089 if (opts->x_flag_prefetch_loop_arrays < 0
9090 && !opts->x_optimize_size
9091 && aarch64_tune_params.prefetch->default_opt_level >= 0
9092 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9093 opts->x_flag_prefetch_loop_arrays = 1;
9095 aarch64_override_options_after_change_1 (opts);
9098 /* Print a hint with a suggestion for a core or architecture name that
9099 most closely resembles what the user passed in STR. ARCH is true if
9100 the user is asking for an architecture name. ARCH is false if the user
9101 is asking for a core name. */
9103 static void
9104 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9106 auto_vec<const char *> candidates;
9107 const struct processor *entry = arch ? all_architectures : all_cores;
9108 for (; entry->name != NULL; entry++)
9109 candidates.safe_push (entry->name);
9110 char *s;
9111 const char *hint = candidates_list_and_hint (str, s, candidates);
9112 if (hint)
9113 inform (input_location, "valid arguments are: %s;"
9114 " did you mean %qs?", s, hint);
9115 XDELETEVEC (s);
9118 /* Print a hint with a suggestion for a core name that most closely resembles
9119 what the user passed in STR. */
9121 inline static void
9122 aarch64_print_hint_for_core (const char *str)
9124 aarch64_print_hint_for_core_or_arch (str, false);
9127 /* Print a hint with a suggestion for an architecture name that most closely
9128 resembles what the user passed in STR. */
9130 inline static void
9131 aarch64_print_hint_for_arch (const char *str)
9133 aarch64_print_hint_for_core_or_arch (str, true);
9136 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9137 specified in STR and throw errors if appropriate. Put the results if
9138 they are valid in RES and ISA_FLAGS. Return whether the option is
9139 valid. */
9141 static bool
9142 aarch64_validate_mcpu (const char *str, const struct processor **res,
9143 unsigned long *isa_flags)
9145 enum aarch64_parse_opt_result parse_res
9146 = aarch64_parse_cpu (str, res, isa_flags);
9148 if (parse_res == AARCH64_PARSE_OK)
9149 return true;
9151 switch (parse_res)
9153 case AARCH64_PARSE_MISSING_ARG:
9154 error ("missing cpu name in %<-mcpu=%s%>", str);
9155 break;
9156 case AARCH64_PARSE_INVALID_ARG:
9157 error ("unknown value %qs for -mcpu", str);
9158 aarch64_print_hint_for_core (str);
9159 break;
9160 case AARCH64_PARSE_INVALID_FEATURE:
9161 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9162 break;
9163 default:
9164 gcc_unreachable ();
9167 return false;
9170 /* Validate a command-line -march option. Parse the arch and extensions
9171 (if any) specified in STR and throw errors if appropriate. Put the
9172 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9173 option is valid. */
9175 static bool
9176 aarch64_validate_march (const char *str, const struct processor **res,
9177 unsigned long *isa_flags)
9179 enum aarch64_parse_opt_result parse_res
9180 = aarch64_parse_arch (str, res, isa_flags);
9182 if (parse_res == AARCH64_PARSE_OK)
9183 return true;
9185 switch (parse_res)
9187 case AARCH64_PARSE_MISSING_ARG:
9188 error ("missing arch name in %<-march=%s%>", str);
9189 break;
9190 case AARCH64_PARSE_INVALID_ARG:
9191 error ("unknown value %qs for -march", str);
9192 aarch64_print_hint_for_arch (str);
9193 break;
9194 case AARCH64_PARSE_INVALID_FEATURE:
9195 error ("invalid feature modifier in %<-march=%s%>", str);
9196 break;
9197 default:
9198 gcc_unreachable ();
9201 return false;
9204 /* Validate a command-line -mtune option. Parse the cpu
9205 specified in STR and throw errors if appropriate. Put the
9206 result, if it is valid, in RES. Return whether the option is
9207 valid. */
9209 static bool
9210 aarch64_validate_mtune (const char *str, const struct processor **res)
9212 enum aarch64_parse_opt_result parse_res
9213 = aarch64_parse_tune (str, res);
9215 if (parse_res == AARCH64_PARSE_OK)
9216 return true;
9218 switch (parse_res)
9220 case AARCH64_PARSE_MISSING_ARG:
9221 error ("missing cpu name in %<-mtune=%s%>", str);
9222 break;
9223 case AARCH64_PARSE_INVALID_ARG:
9224 error ("unknown value %qs for -mtune", str);
9225 aarch64_print_hint_for_core (str);
9226 break;
9227 default:
9228 gcc_unreachable ();
9230 return false;
9233 /* Return the CPU corresponding to the enum CPU.
9234 If it doesn't specify a cpu, return the default. */
9236 static const struct processor *
9237 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9239 if (cpu != aarch64_none)
9240 return &all_cores[cpu];
9242 /* The & 0x3f is to extract the bottom 6 bits that encode the
9243 default cpu as selected by the --with-cpu GCC configure option
9244 in config.gcc.
9245 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9246 flags mechanism should be reworked to make it more sane. */
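/* (The default ISA flags live in the remaining upper bits; see the
"TARGET_CPU_DEFAULT >> 6" in aarch64_override_options below.) */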
9247 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9250 /* Return the architecture corresponding to the enum ARCH.
9251 If it doesn't specify a valid architecture, return the default. */
9253 static const struct processor *
9254 aarch64_get_arch (enum aarch64_arch arch)
9256 if (arch != aarch64_no_arch)
9257 return &all_architectures[arch];
9259 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9261 return &all_architectures[cpu->arch];
9264 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9265 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9266 tuning structs. In particular it must set selected_tune and
9267 aarch64_isa_flags that define the available ISA features and tuning
9268 decisions. It must also set selected_arch as this will be used to
9269 output the .arch asm tags for each function. */
9271 static void
9272 aarch64_override_options (void)
9274 unsigned long cpu_isa = 0;
9275 unsigned long arch_isa = 0;
9276 aarch64_isa_flags = 0;
9278 bool valid_cpu = true;
9279 bool valid_tune = true;
9280 bool valid_arch = true;
9282 selected_cpu = NULL;
9283 selected_arch = NULL;
9284 selected_tune = NULL;
9286 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9287 If either of -march or -mtune is given, they override their
9288 respective component of -mcpu. */
9289 if (aarch64_cpu_string)
9290 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9291 &cpu_isa);
9293 if (aarch64_arch_string)
9294 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9295 &arch_isa);
9297 if (aarch64_tune_string)
9298 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9300 /* If the user did not specify a processor, choose the default
9301 one for them. This will be the CPU set during configuration using
9302 --with-cpu, otherwise it is "generic". */
9303 if (!selected_cpu)
9305 if (selected_arch)
9307 selected_cpu = &all_cores[selected_arch->ident];
9308 aarch64_isa_flags = arch_isa;
9309 explicit_arch = selected_arch->arch;
9311 else
9313 /* Get default configure-time CPU. */
9314 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9315 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9318 if (selected_tune)
9319 explicit_tune_core = selected_tune->ident;
9321 /* If both -mcpu and -march are specified check that they are architecturally
9322 compatible, warn if they're not and prefer the -march ISA flags. */
9323 else if (selected_arch)
9325 if (selected_arch->arch != selected_cpu->arch)
9327 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9328 all_architectures[selected_cpu->arch].name,
9329 selected_arch->name);
9331 aarch64_isa_flags = arch_isa;
9332 explicit_arch = selected_arch->arch;
9333 explicit_tune_core = selected_tune ? selected_tune->ident
9334 : selected_cpu->ident;
9336 else
9338 /* -mcpu but no -march. */
9339 aarch64_isa_flags = cpu_isa;
9340 explicit_tune_core = selected_tune ? selected_tune->ident
9341 : selected_cpu->ident;
9342 gcc_assert (selected_cpu);
9343 selected_arch = &all_architectures[selected_cpu->arch];
9344 explicit_arch = selected_arch->arch;
9347 /* Set the arch as well, as we will need it when outputting
9348 the .arch directive in assembly. */
9349 if (!selected_arch)
9351 gcc_assert (selected_cpu);
9352 selected_arch = &all_architectures[selected_cpu->arch];
9355 if (!selected_tune)
9356 selected_tune = selected_cpu;
9358 #ifndef HAVE_AS_MABI_OPTION
9359 /* The compiler may have been configured with 2.23.* binutils, which does
9360 not have support for ILP32. */
9361 if (TARGET_ILP32)
9362 error ("Assembler does not support -mabi=ilp32");
9363 #endif
9365 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9366 sorry ("Return address signing is only supported for -mabi=lp64");
9368 /* Make sure we properly set up the explicit options. */
9369 if ((aarch64_cpu_string && valid_cpu)
9370 || (aarch64_tune_string && valid_tune))
9371 gcc_assert (explicit_tune_core != aarch64_none);
9373 if ((aarch64_cpu_string && valid_cpu)
9374 || (aarch64_arch_string && valid_arch))
9375 gcc_assert (explicit_arch != aarch64_no_arch);
9377 aarch64_override_options_internal (&global_options);
9379 /* Save these options as the default ones in case we push and pop them later
9380 while processing functions with potential target attributes. */
9381 target_option_default_node = target_option_current_node
9382 = build_target_option_node (&global_options);
9385 /* Implement targetm.override_options_after_change. */
9387 static void
9388 aarch64_override_options_after_change (void)
9390 aarch64_override_options_after_change_1 (&global_options);
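/* Allocate and return a new, zeroed machine_function struct; registered as
the init_machine_status hook by aarch64_init_expanders below. */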
9393 static struct machine_function *
9394 aarch64_init_machine_status (void)
9396 struct machine_function *machine;
9397 machine = ggc_cleared_alloc<machine_function> ();
9398 return machine;
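/* Register aarch64_init_machine_status as the allocator for per-function
machine state. */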
9401 void
9402 aarch64_init_expanders (void)
9404 init_machine_status = aarch64_init_machine_status;
9407 /* Determine the final code model from the -mcmodel= setting and the
PIC flags. */
9408 static void
9409 initialize_aarch64_code_model (struct gcc_options *opts)
9411 if (opts->x_flag_pic)
9413 switch (opts->x_aarch64_cmodel_var)
9415 case AARCH64_CMODEL_TINY:
9416 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9417 break;
9418 case AARCH64_CMODEL_SMALL:
9419 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9420 aarch64_cmodel = (flag_pic == 2
9421 ? AARCH64_CMODEL_SMALL_PIC
9422 : AARCH64_CMODEL_SMALL_SPIC);
9423 #else
9424 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9425 #endif
9426 break;
9427 case AARCH64_CMODEL_LARGE:
9428 sorry ("code model %qs with -f%s", "large",
9429 opts->x_flag_pic > 1 ? "PIC" : "pic");
9430 break;
9431 default:
9432 gcc_unreachable ();
9435 else
9436 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9439 /* Implement TARGET_OPTION_SAVE. */
9441 static void
9442 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9444 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9447 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9448 using the information saved in PTR. */
9450 static void
9451 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9453 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9454 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9455 opts->x_explicit_arch = ptr->x_explicit_arch;
9456 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9457 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9459 aarch64_override_options_internal (opts);
9462 /* Implement TARGET_OPTION_PRINT. */
9464 static void
9465 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9467 const struct processor *cpu
9468 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9469 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9470 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9471 std::string extension
9472 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9474 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9475 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9476 arch->name, extension.c_str ());
9479 static GTY(()) tree aarch64_previous_fndecl;
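/* Forget the previously-seen function decl, so that the next call to
aarch64_set_current_function re-applies the target state. */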
9481 void
9482 aarch64_reset_previous_fndecl (void)
9484 aarch64_previous_fndecl = NULL;
9487 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9488 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9489 make sure optab availability predicates are recomputed when necessary. */
9491 void
9492 aarch64_save_restore_target_globals (tree new_tree)
9494 if (TREE_TARGET_GLOBALS (new_tree))
9495 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9496 else if (new_tree == target_option_default_node)
9497 restore_target_globals (&default_target_globals);
9498 else
9499 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9502 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9503 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9504 of the function, if such exists. This function may be called multiple
9505 times on a single function so use aarch64_previous_fndecl to avoid
9506 setting up identical state. */
9508 static void
9509 aarch64_set_current_function (tree fndecl)
9511 if (!fndecl || fndecl == aarch64_previous_fndecl)
9512 return;
9514 tree old_tree = (aarch64_previous_fndecl
9515 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9516 : NULL_TREE);
9518 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9520 /* If current function has no attributes but the previous one did,
9521 use the default node. */
9522 if (!new_tree && old_tree)
9523 new_tree = target_option_default_node;
9525 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9526 the default have been handled by aarch64_save_restore_target_globals from
9527 aarch64_pragma_target_parse. */
9528 if (old_tree == new_tree)
9529 return;
9531 aarch64_previous_fndecl = fndecl;
9533 /* First set the target options. */
9534 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9536 aarch64_save_restore_target_globals (new_tree);
9539 /* Enum describing the various ways we can handle attributes.
9540 In many cases we can reuse the generic option handling machinery. */
9542 enum aarch64_attr_opt_type
9544 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9545 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9546 aarch64_attr_enum, /* Attribute sets an enum variable. */
9547 aarch64_attr_custom /* Attribute requires a custom handling function. */
9550 /* All the information needed to handle a target attribute.
9551 NAME is the name of the attribute.
9552 ATTR_TYPE specifies the type of behavior of the attribute as described
9553 in the definition of enum aarch64_attr_opt_type.
9554 ALLOW_NEG is true if the attribute supports a "no-" form.
9555 HANDLER is the function that takes the attribute string and whether
9556 it is a pragma or attribute and handles the option. It is needed only
9557 when the ATTR_TYPE is aarch64_attr_custom.
9558 OPT_NUM is the enum specifying the option that the attribute modifies.
9559 This is needed for attributes that mirror the behavior of a command-line
9560 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9561 aarch64_attr_enum. */
9563 struct aarch64_attribute_info
9565 const char *name;
9566 enum aarch64_attr_opt_type attr_type;
9567 bool allow_neg;
9568 bool (*handler) (const char *, const char *);
9569 enum opt_code opt_num;
9572 /* Handle the ARCH_STR argument to the arch= target attribute.
9573 PRAGMA_OR_ATTR is used in potential error messages. */
9575 static bool
9576 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9578 const struct processor *tmp_arch = NULL;
9579 enum aarch64_parse_opt_result parse_res
9580 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9582 if (parse_res == AARCH64_PARSE_OK)
9584 gcc_assert (tmp_arch);
9585 selected_arch = tmp_arch;
9586 explicit_arch = selected_arch->arch;
9587 return true;
9590 switch (parse_res)
9592 case AARCH64_PARSE_MISSING_ARG:
9593 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9594 break;
9595 case AARCH64_PARSE_INVALID_ARG:
9596 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9597 aarch64_print_hint_for_arch (str);
9598 break;
9599 case AARCH64_PARSE_INVALID_FEATURE:
9600 error ("invalid feature modifier %qs for 'arch' target %s",
9601 str, pragma_or_attr);
9602 break;
9603 default:
9604 gcc_unreachable ();
9607 return false;
9610 /* Handle the argument CPU_STR to the cpu= target attribute.
9611 PRAGMA_OR_ATTR is used in potential error messages. */
9613 static bool
9614 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9616 const struct processor *tmp_cpu = NULL;
9617 enum aarch64_parse_opt_result parse_res
9618 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9620 if (parse_res == AARCH64_PARSE_OK)
9622 gcc_assert (tmp_cpu);
9623 selected_tune = tmp_cpu;
9624 explicit_tune_core = selected_tune->ident;
9626 selected_arch = &all_architectures[tmp_cpu->arch];
9627 explicit_arch = selected_arch->arch;
9628 return true;
9631 switch (parse_res)
9633 case AARCH64_PARSE_MISSING_ARG:
9634 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9635 break;
9636 case AARCH64_PARSE_INVALID_ARG:
9637 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9638 aarch64_print_hint_for_core (str);
9639 break;
9640 case AARCH64_PARSE_INVALID_FEATURE:
9641 error ("invalid feature modifier %qs for 'cpu' target %s",
9642 str, pragma_or_attr);
9643 break;
9644 default:
9645 gcc_unreachable ();
9648 return false;
9651 /* Handle the argument STR to the tune= target attribute.
9652 PRAGMA_OR_ATTR is used in potential error messages. */
9654 static bool
9655 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9657 const struct processor *tmp_tune = NULL;
9658 enum aarch64_parse_opt_result parse_res
9659 = aarch64_parse_tune (str, &tmp_tune);
9661 if (parse_res == AARCH64_PARSE_OK)
9663 gcc_assert (tmp_tune);
9664 selected_tune = tmp_tune;
9665 explicit_tune_core = selected_tune->ident;
9666 return true;
9669 switch (parse_res)
9671 case AARCH64_PARSE_INVALID_ARG:
9672 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9673 aarch64_print_hint_for_core (str);
9674 break;
9675 default:
9676 gcc_unreachable ();
9679 return false;
9682 /* Parse an architecture extensions target attribute string specified in STR.
9683 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9684 if successful. Update aarch64_isa_flags to reflect the ISA features
9685 modified.
9686 PRAGMA_OR_ATTR is used in potential error messages. */
9688 static bool
9689 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9691 enum aarch64_parse_opt_result parse_res;
9692 unsigned long isa_flags = aarch64_isa_flags;
9694 /* We allow "+nothing" at the beginning to clear out all architectural
9695 features if the user wants to handpick specific features. */
9696 if (strncmp ("+nothing", str, 8) == 0)
9698 isa_flags = 0;
9699 str += 8;
9702 parse_res = aarch64_parse_extension (str, &isa_flags);
9704 if (parse_res == AARCH64_PARSE_OK)
9706 aarch64_isa_flags = isa_flags;
9707 return true;
9710 switch (parse_res)
9712 case AARCH64_PARSE_MISSING_ARG:
9713 error ("missing feature modifier in target %s %qs",
9714 pragma_or_attr, str);
9715 break;
9717 case AARCH64_PARSE_INVALID_FEATURE:
9718 error ("invalid feature modifier in target %s %qs",
9719 pragma_or_attr, str);
9720 break;
9722 default:
9723 gcc_unreachable ();
9726 return false;
9729 /* The target attributes that we support. On top of these we also support just
9730 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9731 handled explicitly in aarch64_process_one_target_attr. */
9733 static const struct aarch64_attribute_info aarch64_attributes[] =
9735 { "general-regs-only", aarch64_attr_mask, false, NULL,
9736 OPT_mgeneral_regs_only },
9737 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9738 OPT_mfix_cortex_a53_835769 },
9739 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9740 OPT_mfix_cortex_a53_843419 },
9741 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9742 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9743 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9744 OPT_momit_leaf_frame_pointer },
9745 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9746 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9747 OPT_march_ },
9748 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9749 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9750 OPT_mtune_ },
9751 { "sign-return-address", aarch64_attr_enum, false, NULL,
9752 OPT_msign_return_address_ },
9753 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9756 /* Parse ARG_STR which contains the definition of one target attribute.
9757 Show appropriate errors if any or return true if the attribute is valid.
9758 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9759 we're processing a target attribute or pragma. */
9761 static bool
9762 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9764 bool invert = false;
9766 size_t len = strlen (arg_str);
9768 if (len == 0)
9770 error ("malformed target %s", pragma_or_attr);
9771 return false;
9774 char *str_to_check = (char *) alloca (len + 1);
9775 strcpy (str_to_check, arg_str);
9777 /* Skip leading whitespace. */
9778 while (*str_to_check == ' ' || *str_to_check == '\t')
9779 str_to_check++;
9781 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9782 It is easier to detect and handle it explicitly here rather than going
9783 through the machinery for the rest of the target attributes in this
9784 function. */
9785 if (*str_to_check == '+')
9786 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9788 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9790 invert = true;
9791 str_to_check += 3;
9793 char *arg = strchr (str_to_check, '=');
9795 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9796 and point ARG to "foo". */
9797 if (arg)
9799 *arg = '\0';
9800 arg++;
9802 const struct aarch64_attribute_info *p_attr;
9803 bool found = false;
9804 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9806 /* If the names don't match up, or the user has given an argument
9807 to an attribute that doesn't accept one, or didn't give an argument
9808 to an attribute that expects one, fail to match. */
9809 if (strcmp (str_to_check, p_attr->name) != 0)
9810 continue;
9812 found = true;
9813 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9814 || p_attr->attr_type == aarch64_attr_enum;
9816 if (attr_need_arg_p ^ (arg != NULL))
9818 error ("target %s %qs does not accept an argument",
9819 pragma_or_attr, str_to_check);
9820 return false;
9823 /* If the name matches but the attribute does not allow "no-" versions
9824 then we can't match. */
9825 if (invert && !p_attr->allow_neg)
9827 error ("target %s %qs does not allow a negated form",
9828 pragma_or_attr, str_to_check);
9829 return false;
9832 switch (p_attr->attr_type)
9834 /* Has a custom handler registered.
9835 For example, cpu=, arch=, tune=. */
9836 case aarch64_attr_custom:
9837 gcc_assert (p_attr->handler);
9838 if (!p_attr->handler (arg, pragma_or_attr))
9839 return false;
9840 break;
9842 /* Either set or unset a boolean option. */
9843 case aarch64_attr_bool:
9845 struct cl_decoded_option decoded;
9847 generate_option (p_attr->opt_num, NULL, !invert,
9848 CL_TARGET, &decoded);
9849 aarch64_handle_option (&global_options, &global_options_set,
9850 &decoded, input_location);
9851 break;
9853 /* Set or unset a bit in the target_flags. aarch64_handle_option
9854 should know what mask to apply given the option number. */
9855 case aarch64_attr_mask:
9857 struct cl_decoded_option decoded;
9858 /* We only need to specify the option number.
9859 aarch64_handle_option will know which mask to apply. */
9860 decoded.opt_index = p_attr->opt_num;
9861 decoded.value = !invert;
9862 aarch64_handle_option (&global_options, &global_options_set,
9863 &decoded, input_location);
9864 break;
9866 /* Use the option setting machinery to set an option to an enum. */
9867 case aarch64_attr_enum:
9869 gcc_assert (arg);
9870 bool valid;
9871 int value;
9872 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9873 &value, CL_TARGET);
9874 if (valid)
9876 set_option (&global_options, NULL, p_attr->opt_num, value,
9877 NULL, DK_UNSPECIFIED, input_location,
9878 global_dc);
9880 else
9882 error ("target %s %s=%s is not valid",
9883 pragma_or_attr, str_to_check, arg);
9885 break;
9887 default:
9888 gcc_unreachable ();
9892 /* If we reached here we either have found an attribute and validated
9893 it or didn't match any. If we matched an attribute but its arguments
9894 were malformed we will have returned false already. */
9895 return found;
9898 /* Count how many times the character C appears in
9899 NULL-terminated string STR. */
9901 static unsigned int
9902 num_occurences_in_str (char c, char *str)
9904 unsigned int res = 0;
9905 while (*str != '\0')
9907 if (*str == c)
9908 res++;
9910 str++;
9913 return res;
9916 /* Parse the tree in ARGS that contains the target attribute information
9917 and update the global target options space. PRAGMA_OR_ATTR is a string
9918 to be used in error messages, specifying whether this is processing
9919 a target attribute or a target pragma. */
9921 bool
9922 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9924 if (TREE_CODE (args) == TREE_LIST)
9928 tree head = TREE_VALUE (args);
9929 if (head)
9931 if (!aarch64_process_target_attr (head, pragma_or_attr))
9932 return false;
9934 args = TREE_CHAIN (args);
9935 } while (args);
9937 return true;
9940 if (TREE_CODE (args) != STRING_CST)
9942 error ("attribute %<target%> argument not a string");
9943 return false;
9946 size_t len = strlen (TREE_STRING_POINTER (args));
9947 char *str_to_check = (char *) alloca (len + 1);
9948 strcpy (str_to_check, TREE_STRING_POINTER (args));
9950 if (len == 0)
9952 error ("malformed target %s value", pragma_or_attr);
9953 return false;
9956 /* Used to catch empty tokens between commas, e.g.
9957 attribute ((target ("attr1,,attr2"))). */
9958 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9960 /* Handle multiple target attributes separated by ','. */
9961 char *token = strtok (str_to_check, ",");
9963 unsigned int num_attrs = 0;
9964 while (token)
9966 num_attrs++;
9967 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9969 error ("target %s %qs is invalid", pragma_or_attr, token);
9970 return false;
9973 token = strtok (NULL, ",");
9976 if (num_attrs != num_commas + 1)
9978 error ("malformed target %s list %qs",
9979 pragma_or_attr, TREE_STRING_POINTER (args));
9980 return false;
9983 return true;
9986 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9987 process attribute ((target ("..."))). */
9989 static bool
9990 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9992 struct cl_target_option cur_target;
9993 bool ret;
9994 tree old_optimize;
9995 tree new_target, new_optimize;
9996 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9998 /* If what we're processing is the current pragma string then the
9999 target option node is already stored in target_option_current_node
10000 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
10001 having to re-parse the string. This is especially useful to keep
10002 arm_neon.h compile times down since that header contains a lot
10003 of intrinsics enclosed in pragmas. */
10004 if (!existing_target && args == current_target_pragma)
10006 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
10007 return true;
10009 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10011 old_optimize = build_optimization_node (&global_options);
10012 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10014 /* If the function changed the optimization levels as well as setting
10015 target options, start with the optimizations specified. */
10016 if (func_optimize && func_optimize != old_optimize)
10017 cl_optimization_restore (&global_options,
10018 TREE_OPTIMIZATION (func_optimize));
10020 /* Save the current target options to restore at the end. */
10021 cl_target_option_save (&cur_target, &global_options);
10023 /* If fndecl already has some target attributes applied to it, unpack
10024 them so that we add this attribute on top of them, rather than
10025 overwriting them. */
10026 if (existing_target)
10028 struct cl_target_option *existing_options
10029 = TREE_TARGET_OPTION (existing_target);
10031 if (existing_options)
10032 cl_target_option_restore (&global_options, existing_options);
10034 else
10035 cl_target_option_restore (&global_options,
10036 TREE_TARGET_OPTION (target_option_current_node));
10039 ret = aarch64_process_target_attr (args, "attribute");
10041 /* Set up any additional state. */
10042 if (ret)
10044 aarch64_override_options_internal (&global_options);
10045 /* Initialize SIMD builtins if we haven't already.
10046 Set current_target_pragma to NULL for the duration so that
10047 the builtin initialization code doesn't try to tag the functions
10048 being built with the attributes specified by any current pragma, thus
10049 going into an infinite recursion. */
10050 if (TARGET_SIMD)
10052 tree saved_current_target_pragma = current_target_pragma;
10053 current_target_pragma = NULL;
10054 aarch64_init_simd_builtins ();
10055 current_target_pragma = saved_current_target_pragma;
10057 new_target = build_target_option_node (&global_options);
10059 else
10060 new_target = NULL;
10062 new_optimize = build_optimization_node (&global_options);
10064 if (fndecl && ret)
10066 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10068 if (old_optimize != new_optimize)
10069 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10072 cl_target_option_restore (&global_options, &cur_target);
10074 if (old_optimize != new_optimize)
10075 cl_optimization_restore (&global_options,
10076 TREE_OPTIMIZATION (old_optimize));
10077 return ret;
10080 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10081 tri-bool options (yes, no, don't care) and the default value is
10082 DEF, determine whether to reject inlining. */
10084 static bool
10085 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10086 int dont_care, int def)
10088 /* If the callee doesn't care, always allow inlining. */
10089 if (callee == dont_care)
10090 return true;
10092 /* If the caller doesn't care, always allow inlining. */
10093 if (caller == dont_care)
10094 return true;
10096 /* Otherwise, allow inlining if either the callee and caller values
10097 agree, or if the callee is using the default value. */
10098 return (callee == caller || callee == def);
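/* Worked example (illustrative): with DONT_CARE == 2 and DEF == 0, a caller
   value of 1 and a callee value of 2 permit inlining (the callee doesn't
   care), whereas caller == 0 with callee == 1 rejects it, since the values
   disagree and the callee is not using the default.  */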
10101 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10102 to inline CALLEE into CALLER based on target-specific info.
10103 Make sure that the caller and callee have compatible architectural
10104 features. Then go through the other possible target attributes
10105 and see if they can block inlining. Try not to reject always_inline
10106 callees unless they are incompatible architecturally. */
10108 static bool
10109 aarch64_can_inline_p (tree caller, tree callee)
10111 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10112 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10114 /* If callee has no option attributes, then it is ok to inline. */
10115 if (!callee_tree)
10116 return true;
10118 struct cl_target_option *caller_opts
10119 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10120 : target_option_default_node);
10122 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10125 /* Callee's ISA flags should be a subset of the caller's. */
10126 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10127 != callee_opts->x_aarch64_isa_flags)
10128 return false;
10130 /* Allow a non-strict-aligned function to be inlined into a
10131 strict-aligned one. */
10132 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10133 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10134 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10135 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10136 return false;
10138 bool always_inline = lookup_attribute ("always_inline",
10139 DECL_ATTRIBUTES (callee));
10141 /* If the architectural features match up and the callee is always_inline
10142 then the other attributes don't matter. */
10143 if (always_inline)
10144 return true;
10146 if (caller_opts->x_aarch64_cmodel_var
10147 != callee_opts->x_aarch64_cmodel_var)
10148 return false;
10150 if (caller_opts->x_aarch64_tls_dialect
10151 != callee_opts->x_aarch64_tls_dialect)
10152 return false;
10154 /* Honour explicit requests to work around errata. */
10155 if (!aarch64_tribools_ok_for_inlining_p (
10156 caller_opts->x_aarch64_fix_a53_err835769,
10157 callee_opts->x_aarch64_fix_a53_err835769,
10158 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10159 return false;
10161 if (!aarch64_tribools_ok_for_inlining_p (
10162 caller_opts->x_aarch64_fix_a53_err843419,
10163 callee_opts->x_aarch64_fix_a53_err843419,
10164 2, TARGET_FIX_ERR_A53_843419))
10165 return false;
10167 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10168 caller and callee and they don't match up, reject inlining. */
10169 if (!aarch64_tribools_ok_for_inlining_p (
10170 caller_opts->x_flag_omit_leaf_frame_pointer,
10171 callee_opts->x_flag_omit_leaf_frame_pointer,
10172 2, 1))
10173 return false;
10175 /* If the callee has specific tuning overrides, respect them. */
10176 if (callee_opts->x_aarch64_override_tune_string != NULL
10177 && caller_opts->x_aarch64_override_tune_string == NULL)
10178 return false;
10180 /* If the user specified tuning override strings for the
10181 caller and callee and they don't match up, reject inlining.
10182 We just do a string compare here, we don't analyze the meaning
10183 of the string, as it would be too costly for little gain. */
10184 if (callee_opts->x_aarch64_override_tune_string
10185 && caller_opts->x_aarch64_override_tune_string
10186 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10187 caller_opts->x_aarch64_override_tune_string) != 0))
10188 return false;
10190 return true;
10193 /* Return true if SYMBOL_REF X binds locally. */
10195 static bool
10196 aarch64_symbol_binds_local_p (const_rtx x)
10198 return (SYMBOL_REF_DECL (x)
10199 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10200 : SYMBOL_REF_LOCAL_P (x));
10203 /* Return true if SYMBOL_REF X is thread-local. */
10204 static bool
10205 aarch64_tls_symbol_p (rtx x)
10207 if (! TARGET_HAVE_TLS)
10208 return false;
10210 if (GET_CODE (x) != SYMBOL_REF)
10211 return false;
10213 return SYMBOL_REF_TLS_MODEL (x) != 0;
10216 /* Classify a TLS symbol into one of the TLS kinds. */
10217 enum aarch64_symbol_type
10218 aarch64_classify_tls_symbol (rtx x)
10220 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10222 switch (tls_kind)
10224 case TLS_MODEL_GLOBAL_DYNAMIC:
10225 case TLS_MODEL_LOCAL_DYNAMIC:
10226 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10228 case TLS_MODEL_INITIAL_EXEC:
10229 switch (aarch64_cmodel)
10231 case AARCH64_CMODEL_TINY:
10232 case AARCH64_CMODEL_TINY_PIC:
10233 return SYMBOL_TINY_TLSIE;
10234 default:
10235 return SYMBOL_SMALL_TLSIE;
10238 case TLS_MODEL_LOCAL_EXEC:
10239 if (aarch64_tls_size == 12)
10240 return SYMBOL_TLSLE12;
10241 else if (aarch64_tls_size == 24)
10242 return SYMBOL_TLSLE24;
10243 else if (aarch64_tls_size == 32)
10244 return SYMBOL_TLSLE32;
10245 else if (aarch64_tls_size == 48)
10246 return SYMBOL_TLSLE48;
10247 else
10248 gcc_unreachable ();
10250 case TLS_MODEL_EMULATED:
10251 case TLS_MODEL_NONE:
10252 return SYMBOL_FORCE_TO_MEM;
10254 default:
10255 gcc_unreachable ();
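/* For instance (illustrative), a local-exec TLS symbol compiled with
   -mtls-size=24 is classified as SYMBOL_TLSLE24 above, while an
   initial-exec symbol under the tiny code model becomes SYMBOL_TINY_TLSIE.  */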
10259 /* Return the method that should be used to access SYMBOL_REF or
10260 LABEL_REF X. */
10262 enum aarch64_symbol_type
10263 aarch64_classify_symbol (rtx x, rtx offset)
10265 if (GET_CODE (x) == LABEL_REF)
10267 switch (aarch64_cmodel)
10269 case AARCH64_CMODEL_LARGE:
10270 return SYMBOL_FORCE_TO_MEM;
10272 case AARCH64_CMODEL_TINY_PIC:
10273 case AARCH64_CMODEL_TINY:
10274 return SYMBOL_TINY_ABSOLUTE;
10276 case AARCH64_CMODEL_SMALL_SPIC:
10277 case AARCH64_CMODEL_SMALL_PIC:
10278 case AARCH64_CMODEL_SMALL:
10279 return SYMBOL_SMALL_ABSOLUTE;
10281 default:
10282 gcc_unreachable ();
10286 if (GET_CODE (x) == SYMBOL_REF)
10288 if (aarch64_tls_symbol_p (x))
10289 return aarch64_classify_tls_symbol (x);
10291 switch (aarch64_cmodel)
10293 case AARCH64_CMODEL_TINY:
10294 /* When we retrieve symbol + offset address, we have to make sure
10295 the offset does not cause overflow of the final address. But
10296 we have no way of knowing the address of symbol at compile time
10297 so we can't accurately say if the distance between the PC and
10298 symbol + offset is outside the addressable range of +/-1M in the
10299 TINY code model. So we rely on images not being greater than
10300 1M, cap the offset at 1M, and require anything beyond 1M to
10301 be loaded using an alternative mechanism. Furthermore, if the
10302 symbol is a weak reference to something that isn't known to
10303 resolve to a symbol in this module, then force to memory. */
10304 if ((SYMBOL_REF_WEAK (x)
10305 && !aarch64_symbol_binds_local_p (x))
10306 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10307 return SYMBOL_FORCE_TO_MEM;
10308 return SYMBOL_TINY_ABSOLUTE;
10310 case AARCH64_CMODEL_SMALL:
10311 /* Same reasoning as the tiny code model, but the offset cap here is
10312 4G. */
10313 if ((SYMBOL_REF_WEAK (x)
10314 && !aarch64_symbol_binds_local_p (x))
10315 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10316 HOST_WIDE_INT_C (4294967264)))
10317 return SYMBOL_FORCE_TO_MEM;
10318 return SYMBOL_SMALL_ABSOLUTE;
10320 case AARCH64_CMODEL_TINY_PIC:
10321 if (!aarch64_symbol_binds_local_p (x))
10322 return SYMBOL_TINY_GOT;
10323 return SYMBOL_TINY_ABSOLUTE;
10325 case AARCH64_CMODEL_SMALL_SPIC:
10326 case AARCH64_CMODEL_SMALL_PIC:
10327 if (!aarch64_symbol_binds_local_p (x))
10328 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10329 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10330 return SYMBOL_SMALL_ABSOLUTE;
10332 case AARCH64_CMODEL_LARGE:
10333 /* This is alright even in PIC code as the constant
10334 pool reference is always PC relative and within
10335 the same translation unit. */
10336 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10337 return SYMBOL_SMALL_ABSOLUTE;
10338 else
10339 return SYMBOL_FORCE_TO_MEM;
10341 default:
10342 gcc_unreachable ();
10346 /* By default push everything into the constant pool. */
10347 return SYMBOL_FORCE_TO_MEM;
10350 bool
10351 aarch64_constant_address_p (rtx x)
10353 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10356 bool
10357 aarch64_legitimate_pic_operand_p (rtx x)
10359 if (GET_CODE (x) == SYMBOL_REF
10360 || (GET_CODE (x) == CONST
10361 && GET_CODE (XEXP (x, 0)) == PLUS
10362 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10363 return false;
10365 return true;
10368 /* Return true if X holds either a quarter-precision or
10369 floating-point +0.0 constant. */
10370 static bool
10371 aarch64_valid_floating_const (rtx x)
10373 if (!CONST_DOUBLE_P (x))
10374 return false;
10376 /* This call determines which constants can be used in mov<mode>
10377 as integer moves instead of constant loads. */
10378 if (aarch64_float_const_rtx_p (x))
10379 return true;
10381 return aarch64_float_const_representable_p (x);
10384 static bool
10385 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10387 /* Do not allow vector struct mode constants. We could support
10388 0 and -1 easily, but they need support in aarch64-simd.md. */
10389 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10390 return false;
10392 /* For these cases we never want to use a literal load.
10393 As such we have to prevent the compiler from forcing these
10394 to memory. */
10395 if ((GET_CODE (x) == CONST_VECTOR
10396 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10397 || CONST_INT_P (x)
10398 || aarch64_valid_floating_const (x)
10399 || aarch64_can_const_movi_rtx_p (x, mode)
10400 || aarch64_float_const_rtx_p (x))
10401 return !targetm.cannot_force_const_mem (mode, x);
10403 if (GET_CODE (x) == HIGH
10404 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10405 return true;
10407 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10408 so spilling them is better than rematerialization. */
10409 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10410 return true;
10412 return aarch64_constant_address_p (x);
10416 aarch64_load_tp (rtx target)
10418 if (!target
10419 || GET_MODE (target) != Pmode
10420 || !register_operand (target, Pmode))
10421 target = gen_reg_rtx (Pmode);
10423 /* Can return in any reg. */
10424 emit_insn (gen_aarch64_load_tp_hard (target));
10425 return target;
10428 /* On AAPCS systems, this is the "struct __va_list". */
10429 static GTY(()) tree va_list_type;
10431 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10432 Return the type to use as __builtin_va_list.
10434 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10436 struct __va_list
10438 void *__stack;
10439 void *__gr_top;
10440 void *__vr_top;
10441 int __gr_offs;
10442 int __vr_offs;
10443 }; */
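/* A sketch of the resulting layout (illustrative, LP64 assumptions): three
   8-byte pointers followed by two 4-byte offsets, i.e. a 32-byte record
   with 8-byte alignment.  */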
10445 static tree
10446 aarch64_build_builtin_va_list (void)
10448 tree va_list_name;
10449 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10451 /* Create the type. */
10452 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10453 /* Give it the required name. */
10454 va_list_name = build_decl (BUILTINS_LOCATION,
10455 TYPE_DECL,
10456 get_identifier ("__va_list"),
10457 va_list_type);
10458 DECL_ARTIFICIAL (va_list_name) = 1;
10459 TYPE_NAME (va_list_type) = va_list_name;
10460 TYPE_STUB_DECL (va_list_type) = va_list_name;
10462 /* Create the fields. */
10463 f_stack = build_decl (BUILTINS_LOCATION,
10464 FIELD_DECL, get_identifier ("__stack"),
10465 ptr_type_node);
10466 f_grtop = build_decl (BUILTINS_LOCATION,
10467 FIELD_DECL, get_identifier ("__gr_top"),
10468 ptr_type_node);
10469 f_vrtop = build_decl (BUILTINS_LOCATION,
10470 FIELD_DECL, get_identifier ("__vr_top"),
10471 ptr_type_node);
10472 f_groff = build_decl (BUILTINS_LOCATION,
10473 FIELD_DECL, get_identifier ("__gr_offs"),
10474 integer_type_node);
10475 f_vroff = build_decl (BUILTINS_LOCATION,
10476 FIELD_DECL, get_identifier ("__vr_offs"),
10477 integer_type_node);
10479 /* Tell tree-stdarg pass about our internal offset fields.
10480 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10481 purposes, to identify whether the code is updating the va_list internal
10482 offset fields in an irregular way. */
10483 va_list_gpr_counter_field = f_groff;
10484 va_list_fpr_counter_field = f_vroff;
10486 DECL_ARTIFICIAL (f_stack) = 1;
10487 DECL_ARTIFICIAL (f_grtop) = 1;
10488 DECL_ARTIFICIAL (f_vrtop) = 1;
10489 DECL_ARTIFICIAL (f_groff) = 1;
10490 DECL_ARTIFICIAL (f_vroff) = 1;
10492 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10493 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10494 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10495 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10496 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10498 TYPE_FIELDS (va_list_type) = f_stack;
10499 DECL_CHAIN (f_stack) = f_grtop;
10500 DECL_CHAIN (f_grtop) = f_vrtop;
10501 DECL_CHAIN (f_vrtop) = f_groff;
10502 DECL_CHAIN (f_groff) = f_vroff;
10504 /* Compute its layout. */
10505 layout_type (va_list_type);
10507 return va_list_type;
10510 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10511 static void
10512 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10514 const CUMULATIVE_ARGS *cum;
10515 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10516 tree stack, grtop, vrtop, groff, vroff;
10517 tree t;
10518 int gr_save_area_size = cfun->va_list_gpr_size;
10519 int vr_save_area_size = cfun->va_list_fpr_size;
10520 int vr_offset;
10522 cum = &crtl->args.info;
10523 if (cfun->va_list_gpr_size)
10524 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10525 cfun->va_list_gpr_size);
10526 if (cfun->va_list_fpr_size)
10527 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10528 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10530 if (!TARGET_FLOAT)
10532 gcc_assert (cum->aapcs_nvrn == 0);
10533 vr_save_area_size = 0;
10536 f_stack = TYPE_FIELDS (va_list_type_node);
10537 f_grtop = DECL_CHAIN (f_stack);
10538 f_vrtop = DECL_CHAIN (f_grtop);
10539 f_groff = DECL_CHAIN (f_vrtop);
10540 f_vroff = DECL_CHAIN (f_groff);
10542 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10543 NULL_TREE);
10544 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10545 NULL_TREE);
10546 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10547 NULL_TREE);
10548 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10549 NULL_TREE);
10550 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10551 NULL_TREE);
10553 /* Emit code to initialize STACK, which points to the next varargs stack
10554 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10555 by named arguments. STACK is 8-byte aligned. */
10556 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10557 if (cum->aapcs_stack_size > 0)
10558 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10559 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10560 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10562 /* Emit code to initialize GRTOP, the top of the GR save area.
10563 virtual_incoming_args_rtx should have been 16 byte aligned. */
10564 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10565 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10566 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10568 /* Emit code to initialize VRTOP, the top of the VR save area.
10569 This address is gr_save_area_size bytes below GRTOP, rounded
10570 down to the next 16-byte boundary. */
10571 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10572 vr_offset = ROUND_UP (gr_save_area_size,
10573 STACK_BOUNDARY / BITS_PER_UNIT);
10575 if (vr_offset)
10576 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10577 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10578 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10580 /* Emit code to initialize GROFF, the offset from GRTOP of the
10581 next GPR argument. */
10582 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10583 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10584 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10586 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10587 of the next VR argument. */
10588 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10589 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10590 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10593 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10595 static tree
10596 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10597 gimple_seq *post_p ATTRIBUTE_UNUSED)
10599 tree addr;
10600 bool indirect_p;
10601 bool is_ha; /* is HFA or HVA. */
10602 bool dw_align; /* double-word align. */
10603 machine_mode ag_mode = VOIDmode;
10604 int nregs;
10605 machine_mode mode;
10607 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10608 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10609 HOST_WIDE_INT size, rsize, adjust, align;
10610 tree t, u, cond1, cond2;
10612 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10613 if (indirect_p)
10614 type = build_pointer_type (type);
10616 mode = TYPE_MODE (type);
10618 f_stack = TYPE_FIELDS (va_list_type_node);
10619 f_grtop = DECL_CHAIN (f_stack);
10620 f_vrtop = DECL_CHAIN (f_grtop);
10621 f_groff = DECL_CHAIN (f_vrtop);
10622 f_vroff = DECL_CHAIN (f_groff);
10624 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10625 f_stack, NULL_TREE);
10626 size = int_size_in_bytes (type);
10627 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10629 dw_align = false;
10630 adjust = 0;
10631 if (aarch64_vfp_is_call_or_return_candidate (mode,
10632 type,
10633 &ag_mode,
10634 &nregs,
10635 &is_ha))
10637 /* TYPE passed in fp/simd registers. */
10638 if (!TARGET_FLOAT)
10639 aarch64_err_no_fpadvsimd (mode, "varargs");
10641 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10642 unshare_expr (valist), f_vrtop, NULL_TREE);
10643 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10644 unshare_expr (valist), f_vroff, NULL_TREE);
10646 rsize = nregs * UNITS_PER_VREG;
10648 if (is_ha)
10650 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10651 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10653 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10654 && size < UNITS_PER_VREG)
10656 adjust = UNITS_PER_VREG - size;
10659 else
10661 /* TYPE passed in general registers. */
10662 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10663 unshare_expr (valist), f_grtop, NULL_TREE);
10664 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10665 unshare_expr (valist), f_groff, NULL_TREE);
10666 rsize = ROUND_UP (size, UNITS_PER_WORD);
10667 nregs = rsize / UNITS_PER_WORD;
10669 if (align > 8)
10670 dw_align = true;
10672 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10673 && size < UNITS_PER_WORD)
10675 adjust = UNITS_PER_WORD - size;
10679 /* Get a local temporary for the field value. */
10680 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10682 /* Emit code to branch if off >= 0. */
10683 t = build2 (GE_EXPR, boolean_type_node, off,
10684 build_int_cst (TREE_TYPE (off), 0));
10685 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10687 if (dw_align)
10689 /* Emit: offs = (offs + 15) & -16. */
10690 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10691 build_int_cst (TREE_TYPE (off), 15));
10692 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10693 build_int_cst (TREE_TYPE (off), -16));
10694 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10696 else
10697 roundup = NULL;
10699 /* Update ap.__[g|v]r_offs */
10700 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10701 build_int_cst (TREE_TYPE (off), rsize));
10702 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10704 /* String up. */
10705 if (roundup)
10706 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10708 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10709 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10710 build_int_cst (TREE_TYPE (f_off), 0));
10711 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10713 /* String up: make sure the assignment happens before the use. */
10714 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10715 COND_EXPR_ELSE (cond1) = t;
10717 /* Prepare the trees handling the argument that is passed on the stack;
10718 the top-level node will be stored in ON_STACK. */
10719 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10720 if (align > 8)
10722 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10723 t = fold_convert (intDI_type_node, arg);
10724 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10725 build_int_cst (TREE_TYPE (t), 15));
10726 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10727 build_int_cst (TREE_TYPE (t), -16));
10728 t = fold_convert (TREE_TYPE (arg), t);
10729 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10731 else
10732 roundup = NULL;
10733 /* Advance ap.__stack */
10734 t = fold_convert (intDI_type_node, arg);
10735 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10736 build_int_cst (TREE_TYPE (t), size + 7));
10737 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10738 build_int_cst (TREE_TYPE (t), -8));
10739 t = fold_convert (TREE_TYPE (arg), t);
10740 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10741 /* String up roundup and advance. */
10742 if (roundup)
10743 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10744 /* String up with arg */
10745 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10746 /* Big-endianness related address adjustment. */
10747 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10748 && size < UNITS_PER_WORD)
10750 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10751 size_int (UNITS_PER_WORD - size));
10752 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10755 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10756 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10758 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10759 t = off;
10760 if (adjust)
10761 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10762 build_int_cst (TREE_TYPE (off), adjust));
10764 t = fold_convert (sizetype, t);
10765 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10767 if (is_ha)
10769 /* type ha; // treat as "struct {ftype field[n];}"
10770 ... [computing offs]
10771 for (i = 0; i <nregs; ++i, offs += 16)
10772 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10773 return ha; */
10774 int i;
10775 tree tmp_ha, field_t, field_ptr_t;
10777 /* Declare a local variable. */
10778 tmp_ha = create_tmp_var_raw (type, "ha");
10779 gimple_add_tmp_var (tmp_ha);
10781 /* Establish the base type. */
10782 switch (ag_mode)
10784 case E_SFmode:
10785 field_t = float_type_node;
10786 field_ptr_t = float_ptr_type_node;
10787 break;
10788 case E_DFmode:
10789 field_t = double_type_node;
10790 field_ptr_t = double_ptr_type_node;
10791 break;
10792 case E_TFmode:
10793 field_t = long_double_type_node;
10794 field_ptr_t = long_double_ptr_type_node;
10795 break;
10796 case E_HFmode:
10797 field_t = aarch64_fp16_type_node;
10798 field_ptr_t = aarch64_fp16_ptr_type_node;
10799 break;
10800 case E_V2SImode:
10801 case E_V4SImode:
10803 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10804 field_t = build_vector_type_for_mode (innertype, ag_mode);
10805 field_ptr_t = build_pointer_type (field_t);
10807 break;
10808 default:
10809 gcc_assert (0);
10812 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10813 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10814 addr = t;
10815 t = fold_convert (field_ptr_t, addr);
10816 t = build2 (MODIFY_EXPR, field_t,
10817 build1 (INDIRECT_REF, field_t, tmp_ha),
10818 build1 (INDIRECT_REF, field_t, t));
10820 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10821 for (i = 1; i < nregs; ++i)
10823 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10824 u = fold_convert (field_ptr_t, addr);
10825 u = build2 (MODIFY_EXPR, field_t,
10826 build2 (MEM_REF, field_t, tmp_ha,
10827 build_int_cst (field_ptr_t,
10828 (i *
10829 int_size_in_bytes (field_t)))),
10830 build1 (INDIRECT_REF, field_t, u));
10831 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10834 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10835 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10838 COND_EXPR_ELSE (cond2) = t;
10839 addr = fold_convert (build_pointer_type (type), cond1);
10840 addr = build_va_arg_indirect_ref (addr);
10842 if (indirect_p)
10843 addr = build_va_arg_indirect_ref (addr);
10845 return addr;
10848 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10850 static void
10851 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10852 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10853 int no_rtl)
10855 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10856 CUMULATIVE_ARGS local_cum;
10857 int gr_saved = cfun->va_list_gpr_size;
10858 int vr_saved = cfun->va_list_fpr_size;
10860 /* The caller has advanced CUM up to, but not beyond, the last named
10861 argument. Advance a local copy of CUM past the last "real" named
10862 argument, to find out how many registers are left over. */
10863 local_cum = *cum;
10864 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10866 /* Find out how many registers we need to save.
10867 Honor the tree-stdarg analysis results. */
10868 if (cfun->va_list_gpr_size)
10869 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10870 cfun->va_list_gpr_size / UNITS_PER_WORD);
10871 if (cfun->va_list_fpr_size)
10872 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10873 cfun->va_list_fpr_size / UNITS_PER_VREG);
10875 if (!TARGET_FLOAT)
10877 gcc_assert (local_cum.aapcs_nvrn == 0);
10878 vr_saved = 0;
10881 if (!no_rtl)
10883 if (gr_saved > 0)
10885 rtx ptr, mem;
10887 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10888 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10889 - gr_saved * UNITS_PER_WORD);
10890 mem = gen_frame_mem (BLKmode, ptr);
10891 set_mem_alias_set (mem, get_varargs_alias_set ());
10893 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10894 mem, gr_saved);
10896 if (vr_saved > 0)
10898 /* We can't use move_block_from_reg, because it will use
10899 the wrong mode, storing D regs only. */
10900 machine_mode mode = TImode;
10901 int off, i, vr_start;
10903 /* Set OFF to the offset from virtual_incoming_args_rtx of
10904 the first vector register. The VR save area lies below
10905 the GR one, and is aligned to 16 bytes. */
10906 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10907 STACK_BOUNDARY / BITS_PER_UNIT);
10908 off -= vr_saved * UNITS_PER_VREG;
10910 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10911 for (i = 0; i < vr_saved; ++i)
10913 rtx ptr, mem;
10915 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10916 mem = gen_frame_mem (mode, ptr);
10917 set_mem_alias_set (mem, get_varargs_alias_set ());
10918 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10919 off += UNITS_PER_VREG;
10924 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10925 any complication of having crtl->args.pretend_args_size changed. */
10926 cfun->machine->frame.saved_varargs_size
10927 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10928 STACK_BOUNDARY / BITS_PER_UNIT)
10929 + vr_saved * UNITS_PER_VREG);
10932 static void
10933 aarch64_conditional_register_usage (void)
10935 int i;
10936 if (!TARGET_FLOAT)
10938 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10940 fixed_regs[i] = 1;
10941 call_used_regs[i] = 1;
10946 /* Walk down the type tree of TYPE counting consecutive base elements.
10947 If *MODEP is VOIDmode, then set it to the first valid floating point
10948 type. If a non-floating point type is found, or if a floating point
10949 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10950 otherwise return the count in the sub-tree. */
10951 static int
10952 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10954 machine_mode mode;
10955 HOST_WIDE_INT size;
10957 switch (TREE_CODE (type))
10959 case REAL_TYPE:
10960 mode = TYPE_MODE (type);
10961 if (mode != DFmode && mode != SFmode
10962 && mode != TFmode && mode != HFmode)
10963 return -1;
10965 if (*modep == VOIDmode)
10966 *modep = mode;
10968 if (*modep == mode)
10969 return 1;
10971 break;
10973 case COMPLEX_TYPE:
10974 mode = TYPE_MODE (TREE_TYPE (type));
10975 if (mode != DFmode && mode != SFmode
10976 && mode != TFmode && mode != HFmode)
10977 return -1;
10979 if (*modep == VOIDmode)
10980 *modep = mode;
10982 if (*modep == mode)
10983 return 2;
10985 break;
10987 case VECTOR_TYPE:
10988 /* Use V2SImode and V4SImode as representatives of all 64-bit
10989 and 128-bit vector types. */
10990 size = int_size_in_bytes (type);
10991 switch (size)
10993 case 8:
10994 mode = V2SImode;
10995 break;
10996 case 16:
10997 mode = V4SImode;
10998 break;
10999 default:
11000 return -1;
11003 if (*modep == VOIDmode)
11004 *modep = mode;
11006 /* Vector modes are considered to be opaque: two vectors are
11007 equivalent for the purposes of being homogeneous aggregates
11008 if they are the same size. */
11009 if (*modep == mode)
11010 return 1;
11012 break;
11014 case ARRAY_TYPE:
11016 int count;
11017 tree index = TYPE_DOMAIN (type);
11019 /* Can't handle incomplete types or sizes that are not
11020 fixed. */
11021 if (!COMPLETE_TYPE_P (type)
11022 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11023 return -1;
11025 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11026 if (count == -1
11027 || !index
11028 || !TYPE_MAX_VALUE (index)
11029 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11030 || !TYPE_MIN_VALUE (index)
11031 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11032 || count < 0)
11033 return -1;
11035 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11036 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11038 /* There must be no padding. */
11039 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11040 return -1;
11042 return count;
11045 case RECORD_TYPE:
11047 int count = 0;
11048 int sub_count;
11049 tree field;
11051 /* Can't handle incomplete types or sizes that are not
11052 fixed. */
11053 if (!COMPLETE_TYPE_P (type)
11054 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11055 return -1;
11057 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11059 if (TREE_CODE (field) != FIELD_DECL)
11060 continue;
11062 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11063 if (sub_count < 0)
11064 return -1;
11065 count += sub_count;
11068 /* There must be no padding. */
11069 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11070 return -1;
11072 return count;
11075 case UNION_TYPE:
11076 case QUAL_UNION_TYPE:
11078 /* These aren't very interesting except in a degenerate case. */
11079 int count = 0;
11080 int sub_count;
11081 tree field;
11083 /* Can't handle incomplete types or sizes that are not
11084 fixed. */
11085 if (!COMPLETE_TYPE_P (type)
11086 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11087 return -1;
11089 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11091 if (TREE_CODE (field) != FIELD_DECL)
11092 continue;
11094 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11095 if (sub_count < 0)
11096 return -1;
11097 count = count > sub_count ? count : sub_count;
11100 /* There must be no padding. */
11101 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11102 return -1;
11104 return count;
11107 default:
11108 break;
11111 return -1;
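/* Illustrative examples of the walk above: struct { double x, y; } yields 2
   with *MODEP == DFmode; float[3] yields 3 with *MODEP == SFmode; a struct
   mixing float and double fields yields -1 because the element modes
   differ.  */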
11114 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11115 type as described in AAPCS64 \S 4.1.2.
11117 See the comment above aarch64_composite_type_p for the notes on MODE. */
11119 static bool
11120 aarch64_short_vector_p (const_tree type,
11121 machine_mode mode)
11123 HOST_WIDE_INT size = -1;
11125 if (type && TREE_CODE (type) == VECTOR_TYPE)
11126 size = int_size_in_bytes (type);
11127 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11128 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11129 size = GET_MODE_SIZE (mode);
11131 return (size == 8 || size == 16);
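/* Illustrative cases: an int32x4_t from arm_neon.h (16 bytes) or an 8-byte
   GNU vector counts as a short vector here, while a 32-byte generic vector
   does not.  */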
11134 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11135 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11136 array types. The C99 floating-point complex types are also considered
11137 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11138 types, which are GCC extensions and out of the scope of AAPCS64, are
11139 treated as composite types here as well.
11141 Note that MODE itself is not sufficient in determining whether a type
11142 is such a composite type or not. This is because
11143 stor-layout.c:compute_record_mode may have already changed the MODE
11144 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11145 structure with only one field may have its MODE set to the mode of the
11146 field. Also an integer mode whose size matches the size of the
11147 RECORD_TYPE type may be used to substitute the original mode
11148 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11149 solely relied on. */
11151 static bool
11152 aarch64_composite_type_p (const_tree type,
11153 machine_mode mode)
11155 if (aarch64_short_vector_p (type, mode))
11156 return false;
11158 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11159 return true;
11161 if (mode == BLKmode
11162 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11163 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11164 return true;
11166 return false;
11169 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11170 shall be passed or returned in simd/fp register(s) (providing these
11171 parameter passing registers are available).
11173 Upon successful return, *COUNT returns the number of needed registers,
11174 *BASE_MODE returns the mode of the individual register and, when IS_HA
11175 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11176 floating-point aggregate or a homogeneous short-vector aggregate. */
11178 static bool
11179 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11180 const_tree type,
11181 machine_mode *base_mode,
11182 int *count,
11183 bool *is_ha)
11185 machine_mode new_mode = VOIDmode;
11186 bool composite_p = aarch64_composite_type_p (type, mode);
11188 if (is_ha != NULL) *is_ha = false;
11190 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11191 || aarch64_short_vector_p (type, mode))
11193 *count = 1;
11194 new_mode = mode;
11196 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11198 if (is_ha != NULL) *is_ha = true;
11199 *count = 2;
11200 new_mode = GET_MODE_INNER (mode);
11202 else if (type && composite_p)
11204 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11206 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11208 if (is_ha != NULL) *is_ha = true;
11209 *count = ag_count;
11211 else
11212 return false;
11214 else
11215 return false;
11217 *base_mode = new_mode;
11218 return true;
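/* For example (illustrative), _Complex double takes the MODE_COMPLEX_FLOAT
   path above: *COUNT becomes 2, *BASE_MODE becomes DFmode and *IS_HA is set,
   so the value can be passed in a pair of FP registers when they are
   available.  */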
11221 /* Implement TARGET_STRUCT_VALUE_RTX. */
11223 static rtx
11224 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11225 int incoming ATTRIBUTE_UNUSED)
11227 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11230 /* Implements target hook vector_mode_supported_p. */
11231 static bool
11232 aarch64_vector_mode_supported_p (machine_mode mode)
11234 if (TARGET_SIMD
11235 && (mode == V4SImode || mode == V8HImode
11236 || mode == V16QImode || mode == V2DImode
11237 || mode == V2SImode || mode == V4HImode
11238 || mode == V8QImode || mode == V2SFmode
11239 || mode == V4SFmode || mode == V2DFmode
11240 || mode == V4HFmode || mode == V8HFmode
11241 || mode == V1DFmode))
11242 return true;
11244 return false;
11247 /* Return appropriate SIMD container
11248 for MODE within a vector of WIDTH bits. */
11249 static machine_mode
11250 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11252 gcc_assert (width == 64 || width == 128);
11253 if (TARGET_SIMD)
11255 if (width == 128)
11256 switch (mode)
11258 case E_DFmode:
11259 return V2DFmode;
11260 case E_SFmode:
11261 return V4SFmode;
11262 case E_HFmode:
11263 return V8HFmode;
11264 case E_SImode:
11265 return V4SImode;
11266 case E_HImode:
11267 return V8HImode;
11268 case E_QImode:
11269 return V16QImode;
11270 case E_DImode:
11271 return V2DImode;
11272 default:
11273 break;
11275 else
11276 switch (mode)
11278 case E_SFmode:
11279 return V2SFmode;
11280 case E_HFmode:
11281 return V4HFmode;
11282 case E_SImode:
11283 return V2SImode;
11284 case E_HImode:
11285 return V4HImode;
11286 case E_QImode:
11287 return V8QImode;
11288 default:
11289 break;
11292 return word_mode;
11295 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11296 static machine_mode
11297 aarch64_preferred_simd_mode (scalar_mode mode)
11299 return aarch64_simd_container_mode (mode, 128);
11302 /* Return the bitmask of possible vector sizes for the vectorizer
11303 to iterate over. */
11304 static unsigned int
11305 aarch64_autovectorize_vector_sizes (void)
11307 return (16 | 8);
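/* The return value is a bitmask of vector sizes in bytes, so 16 | 8 tells
   the vectorizer to consider 128-bit and then 64-bit vectors.  */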
11310 /* Implement TARGET_MANGLE_TYPE. */
11312 static const char *
11313 aarch64_mangle_type (const_tree type)
11315 /* The AArch64 ABI documents say that "__va_list" has to be
11316 mangled as if it were in the "std" namespace. */
11317 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11318 return "St9__va_list";
11320 /* Half-precision float. */
11321 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11322 return "Dh";
11324 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11325 builtin types. */
11326 if (TYPE_NAME (type) != NULL)
11327 return aarch64_mangle_builtin_type (type);
11329 /* Use the default mangling. */
11330 return NULL;
11333 /* Find the first rtx_insn before insn that will generate an assembly
11334 instruction. */
11336 static rtx_insn *
11337 aarch64_prev_real_insn (rtx_insn *insn)
11339 if (!insn)
11340 return NULL;
11344 insn = prev_real_insn (insn);
11346 while (insn && recog_memoized (insn) < 0);
11348 return insn;
11351 static bool
11352 is_madd_op (enum attr_type t1)
11354 unsigned int i;
11355 /* A number of these may be AArch32 only. */
11356 enum attr_type mlatypes[] = {
11357 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11358 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11359 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11362 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11364 if (t1 == mlatypes[i])
11365 return true;
11368 return false;
11371 /* Check if there is a register dependency between a load and the insn
11372 for which we hold recog_data. */
11374 static bool
11375 dep_between_memop_and_curr (rtx memop)
11377 rtx load_reg;
11378 int opno;
11380 gcc_assert (GET_CODE (memop) == SET);
11382 if (!REG_P (SET_DEST (memop)))
11383 return false;
11385 load_reg = SET_DEST (memop);
11386 for (opno = 1; opno < recog_data.n_operands; opno++)
11388 rtx operand = recog_data.operand[opno];
11389 if (REG_P (operand)
11390 && reg_overlap_mentioned_p (load_reg, operand))
11391 return true;
11394 return false;
11398 /* When working around the Cortex-A53 erratum 835769,
11399 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11400 instruction and has a preceding memory instruction such that a NOP
11401 should be inserted between them. */
11403 bool
11404 aarch64_madd_needs_nop (rtx_insn* insn)
11406 enum attr_type attr_type;
11407 rtx_insn *prev;
11408 rtx body;
11410 if (!TARGET_FIX_ERR_A53_835769)
11411 return false;
11413 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11414 return false;
11416 attr_type = get_attr_type (insn);
11417 if (!is_madd_op (attr_type))
11418 return false;
11420 prev = aarch64_prev_real_insn (insn);
11421 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11422 Restore recog state to INSN to avoid state corruption. */
11423 extract_constrain_insn_cached (insn);
11425 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11426 return false;
11428 body = single_set (prev);
11430 /* If the previous insn is a memory op and there is no dependency between
11431 it and the DImode madd, emit a NOP between them. If body is NULL then we
11432 have a complex memory operation, probably a load/store pair.
11433 Be conservative for now and emit a NOP. */
11434 if (GET_MODE (recog_data.operand[0]) == DImode
11435 && (!body || !dep_between_memop_and_curr (body)))
11436 return true;
11438 return false;
11443 /* Implement FINAL_PRESCAN_INSN. */
11445 void
11446 aarch64_final_prescan_insn (rtx_insn *insn)
11448 if (aarch64_madd_needs_nop (insn))
11449 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11453 /* Return the equivalent letter for size. */
11454 static char
11455 sizetochar (int size)
11457 switch (size)
11459 case 64: return 'd';
11460 case 32: return 's';
11461 case 16: return 'h';
11462 case 8 : return 'b';
11463 default: gcc_unreachable ();
11467 /* Return true iff x is a uniform vector of floating-point
11468 constants, and the constant can be represented in
11469 quarter-precision form. Note that, as aarch64_float_const_representable_p
11470 rejects both +0.0 and -0.0, this function rejects them as well. */
11471 static bool
11472 aarch64_vect_float_const_representable_p (rtx x)
11474 rtx elt;
11475 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11476 && const_vec_duplicate_p (x, &elt)
11477 && aarch64_float_const_representable_p (elt));
11480 /* Return true for valid and false for invalid. */
11481 bool
11482 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11483 struct simd_immediate_info *info)
11485 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11486 matches = 1; \
11487 for (i = 0; i < idx; i += (STRIDE)) \
11488 if (!(TEST)) \
11489 matches = 0; \
11490 if (matches) \
11492 immtype = (CLASS); \
11493 elsize = (ELSIZE); \
11494 eshift = (SHIFT); \
11495 emvn = (NEG); \
11496 break; \
11499 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11500 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11501 unsigned char bytes[16];
11502 int immtype = -1, matches;
11503 unsigned int invmask = inverse ? 0xff : 0;
11504 int eshift, emvn;
11506 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11508 if (! (aarch64_simd_imm_zero_p (op, mode)
11509 || aarch64_vect_float_const_representable_p (op)))
11510 return false;
11512 if (info)
11514 rtx elt = CONST_VECTOR_ELT (op, 0);
11515 scalar_float_mode elt_mode
11516 = as_a <scalar_float_mode> (GET_MODE (elt));
11518 info->value = elt;
11519 info->element_width = GET_MODE_BITSIZE (elt_mode);
11520 info->mvn = false;
11521 info->shift = 0;
11524 return true;
11527 /* Splat vector constant out into a byte vector. */
11528 for (i = 0; i < n_elts; i++)
11530 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11531 it must be laid out in the vector register in reverse order. */
11532 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11533 unsigned HOST_WIDE_INT elpart;
11535 gcc_assert (CONST_INT_P (el));
11536 elpart = INTVAL (el);
11538 for (unsigned int byte = 0; byte < innersize; byte++)
11540 bytes[idx++] = (elpart & 0xff) ^ invmask;
11541 elpart >>= BITS_PER_UNIT;
11546 /* Sanity check. */
11547 gcc_assert (idx == GET_MODE_SIZE (mode));
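/* Example of the pattern matching below (illustrative, assuming INVERSE is
   false): a V4SImode vector whose elements are all 0x00ab0000 splats to the
   per-element byte sequence 00 00 ab 00 (least-significant byte first),
   which satisfies the CLASS 2 check (only byte 2 significant) and is
   encoded as the value 0xab with a left shift of 16.  */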
11551 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11552 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11554 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11555 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11557 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11558 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11560 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11561 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11563 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11565 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11567 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11568 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11570 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11571 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11573 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11574 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11576 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11577 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11579 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11581 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11583 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11584 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11586 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11587 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11589 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11590 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11592 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11593 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11595 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11597 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11598 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11600 while (0);
11602 if (immtype == -1)
11603 return false;
11605 if (info)
11607 info->element_width = elsize;
11608 info->mvn = emvn != 0;
11609 info->shift = eshift;
11611 unsigned HOST_WIDE_INT imm = 0;
11613 if (immtype >= 12 && immtype <= 15)
11614 info->msl = true;
11616 /* Un-invert bytes of recognized vector, if necessary. */
11617 if (invmask != 0)
11618 for (i = 0; i < idx; i++)
11619 bytes[i] ^= invmask;
11621 if (immtype == 17)
11623 /* FIXME: Broken on 32-bit H_W_I hosts. */
11624 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11626 for (i = 0; i < 8; i++)
11627 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11628 << (i * BITS_PER_UNIT);
11631 info->value = GEN_INT (imm);
11633 else
11635 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11636 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11638 /* Construct 'abcdefgh' because the assembler cannot handle
11639 generic constants. */
11640 if (info->mvn)
11641 imm = ~imm;
11642 imm = (imm >> info->shift) & 0xff;
11643 info->value = GEN_INT (imm);
11647 return true;
11648 #undef CHECK
11651 /* Check whether immediate shift constants are within range. */
11652 bool
11653 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11655 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11656 if (left)
11657 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11658 else
11659 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
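/* Concretely (illustrative), for V4SImode the element width is 32 bits, so
   left-shift immediates must lie in [0, 31] and right-shift immediates in
   [1, 32].  */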
11662 /* Return true if X is a uniform vector where all elements
11663 are either the floating-point constant 0.0 or the
11664 integer constant 0. */
11665 bool
11666 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11668 return x == CONST0_RTX (mode);
11672 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11673 operation of width WIDTH at bit position POS. */
11676 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11678 gcc_assert (CONST_INT_P (width));
11679 gcc_assert (CONST_INT_P (pos));
11681 unsigned HOST_WIDE_INT mask
11682 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11683 return GEN_INT (mask << UINTVAL (pos));
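/* For example (illustrative), WIDTH == 8 and POS == 16 produce the mask
   0xff0000, i.e. ((1 << 8) - 1) << 16.  */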
11686 bool
11687 aarch64_mov_operand_p (rtx x, machine_mode mode)
11689 if (GET_CODE (x) == HIGH
11690 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11691 return true;
11693 if (CONST_INT_P (x))
11694 return true;
11696 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11697 return true;
11699 return aarch64_classify_symbolic_expression (x)
11700 == SYMBOL_TINY_ABSOLUTE;
11703 /* Return a const_int vector of VAL. */
11705 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11707 int nunits = GET_MODE_NUNITS (mode);
11708 rtvec v = rtvec_alloc (nunits);
11709 int i;
11711 rtx cache = GEN_INT (val);
11713 for (i=0; i < nunits; i++)
11714 RTVEC_ELT (v, i) = cache;
11716 return gen_rtx_CONST_VECTOR (mode, v);
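/* e.g. (illustrative) aarch64_simd_gen_const_vector_dup (V4SImode, 1)
   builds the CONST_VECTOR {1, 1, 1, 1}, sharing a single CONST_INT across
   all four lanes.  */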
11719 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11721 bool
11722 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11724 machine_mode vmode;
11726 vmode = aarch64_preferred_simd_mode (mode);
11727 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11728 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11731 /* Construct and return a PARALLEL RTX vector with elements numbering the
11732 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11733 the vector - from the perspective of the architecture. This does not
11734 line up with GCC's perspective on lane numbers, so we end up with
11735 different masks depending on our target endian-ness. The diagram
11736 below may help. We must draw the distinction when building masks
11737 which select one half of the vector. An instruction selecting
11738 architectural low-lanes for a big-endian target must be described using
11739 a mask selecting GCC high-lanes.
11741 Big-Endian Little-Endian
11743 GCC 0 1 2 3 3 2 1 0
11744 | x | x | x | x | | x | x | x | x |
11745 Architecture 3 2 1 0 3 2 1 0
11747 Low Mask: { 2, 3 } { 0, 1 }
11748 High Mask: { 0, 1 } { 2, 3 }
11752 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11754 int nunits = GET_MODE_NUNITS (mode);
11755 rtvec v = rtvec_alloc (nunits / 2);
11756 int high_base = nunits / 2;
11757 int low_base = 0;
11758 int base;
11759 rtx t1;
11760 int i;
11762 if (BYTES_BIG_ENDIAN)
11763 base = high ? low_base : high_base;
11764 else
11765 base = high ? high_base : low_base;
11767 for (i = 0; i < nunits / 2; i++)
11768 RTVEC_ELT (v, i) = GEN_INT (base + i);
11770 t1 = gen_rtx_PARALLEL (mode, v);
11771 return t1;
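/* For example (illustrative), for V4SImode a request for the high half
   yields the PARALLEL (2 3) on a little-endian target but (0 1) on a
   big-endian one, matching the table above.  */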
11774 /* Check OP for validity as a PARALLEL RTX vector with elements
11775 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11776 from the perspective of the architecture. See the diagram above
11777 aarch64_simd_vect_par_cnst_half for more details. */
11779 bool
11780 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11781 bool high)
11783 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11784 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11785 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11786 int i = 0;
11788 if (!VECTOR_MODE_P (mode))
11789 return false;
11791 if (count_op != count_ideal)
11792 return false;
11794 for (i = 0; i < count_ideal; i++)
11796 rtx elt_op = XVECEXP (op, 0, i);
11797 rtx elt_ideal = XVECEXP (ideal, 0, i);
11799 if (!CONST_INT_P (elt_op)
11800 || INTVAL (elt_ideal) != INTVAL (elt_op))
11801 return false;
11803 return true;
11806 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11807 HIGH (exclusive). */
11808 void
11809 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11810 const_tree exp)
11812 HOST_WIDE_INT lane;
11813 gcc_assert (CONST_INT_P (operand));
11814 lane = INTVAL (operand);
11816 if (lane < low || lane >= high)
11818 if (exp)
11819 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11820 else
11821 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11825 /* Return TRUE if OP is a valid vector addressing mode. */
11826 bool
11827 aarch64_simd_mem_operand_p (rtx op)
11829 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11830 || REG_P (XEXP (op, 0)));
11833 /* Emit a register copy from operand to operand, taking care not to
11834 early-clobber source registers in the process.
11836 COUNT is the number of components into which the copy needs to be
11837 decomposed. */
11838 void
11839 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11840 unsigned int count)
11842 unsigned int i;
11843 int rdest = REGNO (operands[0]);
11844 int rsrc = REGNO (operands[1]);
11846 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11847 || rdest < rsrc)
11848 for (i = 0; i < count; i++)
11849 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11850 gen_rtx_REG (mode, rsrc + i));
11851 else
11852 for (i = 0; i < count; i++)
11853 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11854 gen_rtx_REG (mode, rsrc + count - i - 1));
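/* Illustrative sketch, not GCC code: the ordering rule used above, shown on
   a plain array standing in for the register file.  Copying in ascending
   order is safe when the destination starts below the source; otherwise
   copying in descending order avoids overwriting a source register before
   it has been read.  */
static void
copy_overlapping_regs (int *regfile, int rdest, int rsrc, unsigned int count)
{
  if (rdest < rsrc)
    for (unsigned int i = 0; i < count; i++)
      regfile[rdest + i] = regfile[rsrc + i];
  else
    for (unsigned int i = count; i-- > 0;)
      regfile[rdest + i] = regfile[rsrc + i];
}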
11857 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11858 one of VSTRUCT modes: OI, CI, or XI. */
11860 aarch64_simd_attr_length_rglist (machine_mode mode)
11862 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11865 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11866 alignment of a vector to 128 bits. */
11867 static HOST_WIDE_INT
11868 aarch64_simd_vector_alignment (const_tree type)
11870 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11871 return MIN (align, 128);
11874 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11875 static bool
11876 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11878 if (is_packed)
11879 return false;
11881 /* We guarantee alignment for vectors up to 128-bits. */
11882 if (tree_int_cst_compare (TYPE_SIZE (type),
11883 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11884 return false;
11886 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11887 return true;
11890 /* Return true if the vector misalignment factor is supported by the
11891 target. */
11892 static bool
11893 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11894 const_tree type, int misalignment,
11895 bool is_packed)
11897 if (TARGET_SIMD && STRICT_ALIGNMENT)
11899 /* Return if movmisalign pattern is not supported for this mode. */
11900 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11901 return false;
11903 if (misalignment == -1)
11905 /* Misalignment factor is unknown at compile time but we know
11906 it's word aligned. */
11907 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11909 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11911 if (element_size != 64)
11912 return true;
11914 return false;
11917 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11918 is_packed);
11921 /* If VALS is a vector constant that can be loaded into a register
11922 using DUP, generate instructions to do so and return an RTX to
11923 assign to the register. Otherwise return NULL_RTX. */
11924 static rtx
11925 aarch64_simd_dup_constant (rtx vals)
11927 machine_mode mode = GET_MODE (vals);
11928 machine_mode inner_mode = GET_MODE_INNER (mode);
11929 rtx x;
11931 if (!const_vec_duplicate_p (vals, &x))
11932 return NULL_RTX;
11934 /* We can load this constant by using DUP and a constant in a
11935 single ARM register. This will be cheaper than a vector
11936 load. */
11937 x = copy_to_mode_reg (inner_mode, x);
11938 return gen_rtx_VEC_DUPLICATE (mode, x);
11942 /* Generate code to load VALS, which is a PARALLEL containing only
11943 constants (for vec_init) or CONST_VECTOR, efficiently into a
11944 register. Returns an RTX to copy into the register, or NULL_RTX
11945 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11946 static rtx
11947 aarch64_simd_make_constant (rtx vals)
11949 machine_mode mode = GET_MODE (vals);
11950 rtx const_dup;
11951 rtx const_vec = NULL_RTX;
11952 int n_elts = GET_MODE_NUNITS (mode);
11953 int n_const = 0;
11954 int i;
11956 if (GET_CODE (vals) == CONST_VECTOR)
11957 const_vec = vals;
11958 else if (GET_CODE (vals) == PARALLEL)
11960 /* A CONST_VECTOR must contain only CONST_INTs and
11961 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11962 Only store valid constants in a CONST_VECTOR. */
11963 for (i = 0; i < n_elts; ++i)
11965 rtx x = XVECEXP (vals, 0, i);
11966 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11967 n_const++;
11969 if (n_const == n_elts)
11970 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11972 else
11973 gcc_unreachable ();
11975 if (const_vec != NULL_RTX
11976 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11977 /* Load using MOVI/MVNI. */
11978 return const_vec;
11979 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11980 /* Loaded using DUP. */
11981 return const_dup;
11982 else if (const_vec != NULL_RTX)
11983 /* Load from constant pool. We cannot take advantage of single-cycle
11984 LD1 because we need a PC-relative addressing mode. */
11985 return const_vec;
11986 else
11987 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11988 We cannot construct an initializer. */
11989 return NULL_RTX;
11992 /* Expand a vector initialisation sequence, such that TARGET is
11993 initialised to contain VALS. */
11995 void
11996 aarch64_expand_vector_init (rtx target, rtx vals)
11998 machine_mode mode = GET_MODE (target);
11999 scalar_mode inner_mode = GET_MODE_INNER (mode);
12000 /* The number of vector elements. */
12001 int n_elts = GET_MODE_NUNITS (mode);
12002 /* The number of vector elements which are not constant. */
12003 int n_var = 0;
12004 rtx any_const = NULL_RTX;
12005 /* The first element of vals. */
12006 rtx v0 = XVECEXP (vals, 0, 0);
12007 bool all_same = true;
12009 /* Count the number of variable elements to initialise. */
12010 for (int i = 0; i < n_elts; ++i)
12012 rtx x = XVECEXP (vals, 0, i);
12013 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12014 ++n_var;
12015 else
12016 any_const = x;
12018 all_same &= rtx_equal_p (x, v0);
12021 /* No variable elements, hand off to aarch64_simd_make_constant which knows
12022 how best to handle this. */
12023 if (n_var == 0)
12025 rtx constant = aarch64_simd_make_constant (vals);
12026 if (constant != NULL_RTX)
12028 emit_move_insn (target, constant);
12029 return;
12033 /* Splat a single non-constant element if we can. */
12034 if (all_same)
12036 rtx x = copy_to_mode_reg (inner_mode, v0);
12037 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12038 return;
12041 enum insn_code icode = optab_handler (vec_set_optab, mode);
12042 gcc_assert (icode != CODE_FOR_nothing);
12044 /* If there are only variable elements, try to optimize
12045 the insertion using dup for the most common element
12046 followed by insertions. */
12048 /* The algorithm will fill matches[*][0] with the earliest matching element,
12049 and matches[X][1] with the count of duplicate elements (if X is the
12050 earliest element which has duplicates). */
12052 if (n_var == n_elts && n_elts <= 16)
12054 int matches[16][2] = {0};
12055 for (int i = 0; i < n_elts; i++)
12057 for (int j = 0; j <= i; j++)
12059 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12061 matches[i][0] = j;
12062 matches[j][1]++;
12063 break;
12067 int maxelement = 0;
12068 int maxv = 0;
12069 for (int i = 0; i < n_elts; i++)
12070 if (matches[i][1] > maxv)
12072 maxelement = i;
12073 maxv = matches[i][1];
12076 /* Create a duplicate of the most common element. */
12077 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12078 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12080 /* Insert the rest. */
12081 for (int i = 0; i < n_elts; i++)
12083 rtx x = XVECEXP (vals, 0, i);
12084 if (matches[i][0] == maxelement)
12085 continue;
12086 x = copy_to_mode_reg (inner_mode, x);
12087 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12089 return;
12092 /* Initialise a vector which is part-variable. We want to first try
12093 to build those lanes which are constant in the most efficient way we
12094 can. */
12095 if (n_var != n_elts)
12097 rtx copy = copy_rtx (vals);
12099 /* Load constant part of vector. We really don't care what goes into the
12100 parts we will overwrite, but we're more likely to be able to load the
12101 constant efficiently if it has fewer, larger, repeating parts
12102 (see aarch64_simd_valid_immediate). */
12103 for (int i = 0; i < n_elts; i++)
12105 rtx x = XVECEXP (vals, 0, i);
12106 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12107 continue;
12108 rtx subst = any_const;
12109 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12111 /* Look in the copied vector, as more elements are const. */
12112 rtx test = XVECEXP (copy, 0, i ^ bit);
12113 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12115 subst = test;
12116 break;
12119 XVECEXP (copy, 0, i) = subst;
12121 aarch64_expand_vector_init (target, copy);
12124 /* Insert the variable lanes directly. */
12125 for (int i = 0; i < n_elts; i++)
12127 rtx x = XVECEXP (vals, 0, i);
12128 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12129 continue;
12130 x = copy_to_mode_reg (inner_mode, x);
12131 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
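/* Illustrative sketch, not GCC code: the duplicate-counting step used above
   when every lane is variable, on plain integer lane values (assuming at
   most 16 lanes, as the expander does).  The lane returned is the one worth
   materialising with DUP before inserting the remaining lanes one by one.  */
static int
most_duplicated_lane (const long long *lanes, int n_elts)
{
  int dup_count[16] = { 0 };

  /* dup_count[j] counts lanes whose earliest equal lane is j, mirroring
     matches[j][1] above.  */
  for (int i = 0; i < n_elts; i++)
    for (int j = 0; j <= i; j++)
      if (lanes[i] == lanes[j])
        {
          dup_count[j]++;
          break;
        }

  int best = 0, best_count = 0;
  for (int i = 0; i < n_elts; i++)
    if (dup_count[i] > best_count)
      {
        best = i;
        best_count = dup_count[i];
      }
  return best;
}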
12135 static unsigned HOST_WIDE_INT
12136 aarch64_shift_truncation_mask (machine_mode mode)
12138 return
12139 (!SHIFT_COUNT_TRUNCATED
12140 || aarch64_vector_mode_supported_p (mode)
12141 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12144 /* Select a format to encode pointers in exception handling data. */
12146 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12148 int type;
12149 switch (aarch64_cmodel)
12151 case AARCH64_CMODEL_TINY:
12152 case AARCH64_CMODEL_TINY_PIC:
12153 case AARCH64_CMODEL_SMALL:
12154 case AARCH64_CMODEL_SMALL_PIC:
12155 case AARCH64_CMODEL_SMALL_SPIC:
12156 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12157 for everything. */
12158 type = DW_EH_PE_sdata4;
12159 break;
12160 default:
12161 /* No assumptions here. 8-byte relocs required. */
12162 type = DW_EH_PE_sdata8;
12163 break;
12165 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12168 /* The last .arch and .tune assembly strings that we printed. */
12169 static std::string aarch64_last_printed_arch_string;
12170 static std::string aarch64_last_printed_tune_string;
12172 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12173 by the function fndecl. */
12175 void
12176 aarch64_declare_function_name (FILE *stream, const char* name,
12177 tree fndecl)
12179 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12181 struct cl_target_option *targ_options;
12182 if (target_parts)
12183 targ_options = TREE_TARGET_OPTION (target_parts);
12184 else
12185 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12186 gcc_assert (targ_options);
12188 const struct processor *this_arch
12189 = aarch64_get_arch (targ_options->x_explicit_arch);
12191 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12192 std::string extension
12193 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12194 this_arch->flags);
12195 /* Only update the assembler .arch string if it is distinct from the last
12196 such string we printed. */
12197 std::string to_print = this_arch->name + extension;
12198 if (to_print != aarch64_last_printed_arch_string)
12200 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12201 aarch64_last_printed_arch_string = to_print;
12204 /* Print the cpu name we're tuning for in the comments; it might be
12205 useful to readers of the generated asm. Do it only when it changes
12206 from function to function and verbose assembly is requested. */
12207 const struct processor *this_tune
12208 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12210 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12212 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12213 this_tune->name);
12214 aarch64_last_printed_tune_string = this_tune->name;
12217 /* Don't forget the type directive for ELF. */
12218 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12219 ASM_OUTPUT_LABEL (stream, name);
12222 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12224 static void
12225 aarch64_start_file (void)
12227 struct cl_target_option *default_options
12228 = TREE_TARGET_OPTION (target_option_default_node);
12230 const struct processor *default_arch
12231 = aarch64_get_arch (default_options->x_explicit_arch);
12232 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12233 std::string extension
12234 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12235 default_arch->flags);
12237 aarch64_last_printed_arch_string = default_arch->name + extension;
12238 aarch64_last_printed_tune_string = "";
12239 asm_fprintf (asm_out_file, "\t.arch %s\n",
12240 aarch64_last_printed_arch_string.c_str ());
12242 default_file_start ();
12245 /* Emit load exclusive. */
12247 static void
12248 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12249 rtx mem, rtx model_rtx)
12251 rtx (*gen) (rtx, rtx, rtx);
12253 switch (mode)
12255 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12256 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12257 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12258 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12259 default:
12260 gcc_unreachable ();
12263 emit_insn (gen (rval, mem, model_rtx));
12266 /* Emit store exclusive. */
12268 static void
12269 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12270 rtx rval, rtx mem, rtx model_rtx)
12272 rtx (*gen) (rtx, rtx, rtx, rtx);
12274 switch (mode)
12276 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12277 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12278 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12279 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12280 default:
12281 gcc_unreachable ();
12284 emit_insn (gen (bval, rval, mem, model_rtx));
12287 /* Mark the previous jump instruction as unlikely. */
12289 static void
12290 aarch64_emit_unlikely_jump (rtx insn)
12292 rtx_insn *jump = emit_jump_insn (insn);
12293 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12296 /* Expand a compare and swap pattern. */
12298 void
12299 aarch64_expand_compare_and_swap (rtx operands[])
12301 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12302 machine_mode mode, cmp_mode;
12303 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12304 int idx;
12305 gen_cas_fn gen;
12306 const gen_cas_fn split_cas[] =
12308 gen_aarch64_compare_and_swapqi,
12309 gen_aarch64_compare_and_swaphi,
12310 gen_aarch64_compare_and_swapsi,
12311 gen_aarch64_compare_and_swapdi
12313 const gen_cas_fn atomic_cas[] =
12315 gen_aarch64_compare_and_swapqi_lse,
12316 gen_aarch64_compare_and_swaphi_lse,
12317 gen_aarch64_compare_and_swapsi_lse,
12318 gen_aarch64_compare_and_swapdi_lse
12321 bval = operands[0];
12322 rval = operands[1];
12323 mem = operands[2];
12324 oldval = operands[3];
12325 newval = operands[4];
12326 is_weak = operands[5];
12327 mod_s = operands[6];
12328 mod_f = operands[7];
12329 mode = GET_MODE (mem);
12330 cmp_mode = mode;
12332 /* Normally the succ memory model must be stronger than fail, but in the
12333 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12334 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12336 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12337 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12338 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12340 switch (mode)
12342 case E_QImode:
12343 case E_HImode:
12344 /* For short modes, we're going to perform the comparison in SImode,
12345 so do the zero-extension now. */
12346 cmp_mode = SImode;
12347 rval = gen_reg_rtx (SImode);
12348 oldval = convert_modes (SImode, mode, oldval, true);
12349 /* Fall through. */
12351 case E_SImode:
12352 case E_DImode:
12353 /* Force the value into a register if needed. */
12354 if (!aarch64_plus_operand (oldval, mode))
12355 oldval = force_reg (cmp_mode, oldval);
12356 break;
12358 default:
12359 gcc_unreachable ();
12362 switch (mode)
12364 case E_QImode: idx = 0; break;
12365 case E_HImode: idx = 1; break;
12366 case E_SImode: idx = 2; break;
12367 case E_DImode: idx = 3; break;
12368 default:
12369 gcc_unreachable ();
12371 if (TARGET_LSE)
12372 gen = atomic_cas[idx];
12373 else
12374 gen = split_cas[idx];
12376 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12378 if (mode == QImode || mode == HImode)
12379 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12381 x = gen_rtx_REG (CCmode, CC_REGNUM);
12382 x = gen_rtx_EQ (SImode, x, const0_rtx);
12383 emit_insn (gen_rtx_SET (bval, x));
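/* Illustrative usage sketch, not part of this file: the kind of source
   construct the expander above services.  The success/failure orders are
   deliberately the unusual release/acquire pair that the comment above
   promotes to acquire-release.  */
static int
cas_release_acquire (long *p, long expected, long desired)
{
  return __atomic_compare_exchange_n (p, &expected, desired,
                                      /*weak=*/0,
                                      __ATOMIC_RELEASE, __ATOMIC_ACQUIRE);
}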
12386 /* Test whether the target supports using an atomic load-operate instruction
12387 for operation CODE. Returns FALSE if the operation isn't supported by the
12390 architecture. */
12392 bool
12393 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12395 if (!TARGET_LSE)
12396 return false;
12398 switch (code)
12400 case SET:
12401 case AND:
12402 case IOR:
12403 case XOR:
12404 case MINUS:
12405 case PLUS:
12406 return true;
12407 default:
12408 return false;
12412 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12413 sequence implementing an atomic operation. */
12415 static void
12416 aarch64_emit_post_barrier (enum memmodel model)
12418 const enum memmodel base_model = memmodel_base (model);
12420 if (is_mm_sync (model)
12421 && (base_model == MEMMODEL_ACQUIRE
12422 || base_model == MEMMODEL_ACQ_REL
12423 || base_model == MEMMODEL_SEQ_CST))
12425 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12429 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12430 for the data in memory. EXPECTED is the value expected to be in memory.
12431 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12432 is the memory ordering to use. */
12434 void
12435 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12436 rtx expected, rtx desired,
12437 rtx model)
12439 rtx (*gen) (rtx, rtx, rtx, rtx);
12440 machine_mode mode;
12442 mode = GET_MODE (mem);
12444 switch (mode)
12446 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12447 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12448 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12449 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12450 default:
12451 gcc_unreachable ();
12454 /* Move the expected value into the CAS destination register. */
12455 emit_insn (gen_rtx_SET (rval, expected));
12457 /* Emit the CAS. */
12458 emit_insn (gen (rval, mem, desired, model));
12460 /* Compare the expected value with the value loaded by the CAS, to establish
12461 whether the swap was made. */
12462 aarch64_gen_compare_reg (EQ, rval, expected);
12465 /* Split a compare and swap pattern. */
12467 void
12468 aarch64_split_compare_and_swap (rtx operands[])
12470 rtx rval, mem, oldval, newval, scratch;
12471 machine_mode mode;
12472 bool is_weak;
12473 rtx_code_label *label1, *label2;
12474 rtx x, cond;
12475 enum memmodel model;
12476 rtx model_rtx;
12478 rval = operands[0];
12479 mem = operands[1];
12480 oldval = operands[2];
12481 newval = operands[3];
12482 is_weak = (operands[4] != const0_rtx);
12483 model_rtx = operands[5];
12484 scratch = operands[7];
12485 mode = GET_MODE (mem);
12486 model = memmodel_from_int (INTVAL (model_rtx));
12488 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12489 loop:
12490 .label1:
12491 LD[A]XR rval, [mem]
12492 CBNZ rval, .label2
12493 ST[L]XR scratch, newval, [mem]
12494 CBNZ scratch, .label1
12495 .label2:
12496 CMP rval, 0. */
12497 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12499 label1 = NULL;
12500 if (!is_weak)
12502 label1 = gen_label_rtx ();
12503 emit_label (label1);
12505 label2 = gen_label_rtx ();
12507 /* The initial load can be relaxed for a __sync operation since a final
12508 barrier will be emitted to stop code hoisting. */
12509 if (is_mm_sync (model))
12510 aarch64_emit_load_exclusive (mode, rval, mem,
12511 GEN_INT (MEMMODEL_RELAXED));
12512 else
12513 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12515 if (strong_zero_p)
12517 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12518 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12519 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12520 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12522 else
12524 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12525 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12526 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12527 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12528 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12531 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12533 if (!is_weak)
12535 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12536 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12537 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12538 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12540 else
12542 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12543 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12544 emit_insn (gen_rtx_SET (cond, x));
12547 emit_label (label2);
12548 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12549 to set the condition flags. If this is not used it will be removed by
12550 later passes. */
12551 if (strong_zero_p)
12553 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12554 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12555 emit_insn (gen_rtx_SET (cond, x));
12557 /* Emit any final barrier needed for a __sync operation. */
12558 if (is_mm_sync (model))
12559 aarch64_emit_post_barrier (model);
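/* Illustrative sketch, not GCC code: a strong compare-and-swap against
   zero, which is the STRONG_ZERO_P shape above that admits the tighter
   CBNZ-based loop shown in the comment.  */
static int
try_acquire (long *lock)
{
  long expected = 0;
  return __atomic_compare_exchange_n (lock, &expected, 1, /*weak=*/0,
                                      __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}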
12562 /* Emit a BIC instruction. */
12564 static void
12565 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12567 rtx shift_rtx = GEN_INT (shift);
12568 rtx (*gen) (rtx, rtx, rtx, rtx);
12570 switch (mode)
12572 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12573 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12574 default:
12575 gcc_unreachable ();
12578 emit_insn (gen (dst, s2, shift_rtx, s1));
12581 /* Emit an atomic swap. */
12583 static void
12584 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12585 rtx mem, rtx model)
12587 rtx (*gen) (rtx, rtx, rtx, rtx);
12589 switch (mode)
12591 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12592 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12593 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12594 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12595 default:
12596 gcc_unreachable ();
12599 emit_insn (gen (dst, mem, value, model));
12602 /* Operations supported by aarch64_emit_atomic_load_op. */
12604 enum aarch64_atomic_load_op_code
12606 AARCH64_LDOP_PLUS, /* A + B */
12607 AARCH64_LDOP_XOR, /* A ^ B */
12608 AARCH64_LDOP_OR, /* A | B */
12609 AARCH64_LDOP_BIC /* A & ~B */
12612 /* Emit an atomic load-operate. */
12614 static void
12615 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12616 machine_mode mode, rtx dst, rtx src,
12617 rtx mem, rtx model)
12619 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12620 const aarch64_atomic_load_op_fn plus[] =
12622 gen_aarch64_atomic_loadaddqi,
12623 gen_aarch64_atomic_loadaddhi,
12624 gen_aarch64_atomic_loadaddsi,
12625 gen_aarch64_atomic_loadadddi
12627 const aarch64_atomic_load_op_fn eor[] =
12629 gen_aarch64_atomic_loadeorqi,
12630 gen_aarch64_atomic_loadeorhi,
12631 gen_aarch64_atomic_loadeorsi,
12632 gen_aarch64_atomic_loadeordi
12634 const aarch64_atomic_load_op_fn ior[] =
12636 gen_aarch64_atomic_loadsetqi,
12637 gen_aarch64_atomic_loadsethi,
12638 gen_aarch64_atomic_loadsetsi,
12639 gen_aarch64_atomic_loadsetdi
12641 const aarch64_atomic_load_op_fn bic[] =
12643 gen_aarch64_atomic_loadclrqi,
12644 gen_aarch64_atomic_loadclrhi,
12645 gen_aarch64_atomic_loadclrsi,
12646 gen_aarch64_atomic_loadclrdi
12648 aarch64_atomic_load_op_fn gen;
12649 int idx = 0;
12651 switch (mode)
12653 case E_QImode: idx = 0; break;
12654 case E_HImode: idx = 1; break;
12655 case E_SImode: idx = 2; break;
12656 case E_DImode: idx = 3; break;
12657 default:
12658 gcc_unreachable ();
12661 switch (code)
12663 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12664 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12665 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12666 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12667 default:
12668 gcc_unreachable ();
12671 emit_insn (gen (dst, mem, src, model));
12674 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12675 location to store the data read from memory. OUT_RESULT is the location to
12676 store the result of the operation. MEM is the memory location to read and
12677 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12678 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12679 be NULL. */
12681 void
12682 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12683 rtx mem, rtx value, rtx model_rtx)
12685 machine_mode mode = GET_MODE (mem);
12686 machine_mode wmode = (mode == DImode ? DImode : SImode);
12687 const bool short_mode = (mode < SImode);
12688 aarch64_atomic_load_op_code ldop_code;
12689 rtx src;
12690 rtx x;
12692 if (out_data)
12693 out_data = gen_lowpart (mode, out_data);
12695 if (out_result)
12696 out_result = gen_lowpart (mode, out_result);
12698 /* Make sure the value is in a register, putting it into a destination
12699 register if it needs to be manipulated. */
12700 if (!register_operand (value, mode)
12701 || code == AND || code == MINUS)
12703 src = out_result ? out_result : out_data;
12704 emit_move_insn (src, gen_lowpart (mode, value));
12706 else
12707 src = value;
12708 gcc_assert (register_operand (src, mode));
12710 /* Preprocess the data for the operation as necessary. If the operation is
12711 a SET then emit a swap instruction and finish. */
12712 switch (code)
12714 case SET:
12715 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12716 return;
12718 case MINUS:
12719 /* Negate the value and treat it as a PLUS. */
12721 rtx neg_src;
12723 /* Resize the value if necessary. */
12724 if (short_mode)
12725 src = gen_lowpart (wmode, src);
12727 neg_src = gen_rtx_NEG (wmode, src);
12728 emit_insn (gen_rtx_SET (src, neg_src));
12730 if (short_mode)
12731 src = gen_lowpart (mode, src);
12733 /* Fall-through. */
12734 case PLUS:
12735 ldop_code = AARCH64_LDOP_PLUS;
12736 break;
12738 case IOR:
12739 ldop_code = AARCH64_LDOP_OR;
12740 break;
12742 case XOR:
12743 ldop_code = AARCH64_LDOP_XOR;
12744 break;
12746 case AND:
12748 rtx not_src;
12750 /* Resize the value if necessary. */
12751 if (short_mode)
12752 src = gen_lowpart (wmode, src);
12754 not_src = gen_rtx_NOT (wmode, src);
12755 emit_insn (gen_rtx_SET (src, not_src));
12757 if (short_mode)
12758 src = gen_lowpart (mode, src);
12760 ldop_code = AARCH64_LDOP_BIC;
12761 break;
12763 default:
12764 /* The operation can't be done with atomic instructions. */
12765 gcc_unreachable ();
12768 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12770 /* If necessary, calculate the data in memory after the update by redoing the
12771 operation from values in registers. */
12772 if (!out_result)
12773 return;
12775 if (short_mode)
12777 src = gen_lowpart (wmode, src);
12778 out_data = gen_lowpart (wmode, out_data);
12779 out_result = gen_lowpart (wmode, out_result);
12782 x = NULL_RTX;
12784 switch (code)
12786 case MINUS:
12787 case PLUS:
12788 x = gen_rtx_PLUS (wmode, out_data, src);
12789 break;
12790 case IOR:
12791 x = gen_rtx_IOR (wmode, out_data, src);
12792 break;
12793 case XOR:
12794 x = gen_rtx_XOR (wmode, out_data, src);
12795 break;
12796 case AND:
12797 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12798 return;
12799 default:
12800 gcc_unreachable ();
12803 emit_set_insn (out_result, x);
12805 return;
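/* Illustrative sketch, not GCC code: source-level operations that reach the
   expander above.  Subtraction is handled as an atomic add of the negated
   value (the MINUS case) and AND as a clear of the complemented value (the
   AND case), matching the load-operate forms listed in
   aarch64_atomic_load_op_code.  */
static long
atomic_fetch_sub_example (long *p, long v)
{
  return __atomic_fetch_sub (p, v, __ATOMIC_SEQ_CST);
}

static long
atomic_fetch_and_example (long *p, long v)
{
  return __atomic_fetch_and (p, v, __ATOMIC_SEQ_CST);
}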
12808 /* Split an atomic operation. */
12810 void
12811 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12812 rtx value, rtx model_rtx, rtx cond)
12814 machine_mode mode = GET_MODE (mem);
12815 machine_mode wmode = (mode == DImode ? DImode : SImode);
12816 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12817 const bool is_sync = is_mm_sync (model);
12818 rtx_code_label *label;
12819 rtx x;
12821 /* Split the atomic operation into a sequence. */
12822 label = gen_label_rtx ();
12823 emit_label (label);
12825 if (new_out)
12826 new_out = gen_lowpart (wmode, new_out);
12827 if (old_out)
12828 old_out = gen_lowpart (wmode, old_out);
12829 else
12830 old_out = new_out;
12831 value = simplify_gen_subreg (wmode, value, mode, 0);
12833 /* The initial load can be relaxed for a __sync operation since a final
12834 barrier will be emitted to stop code hoisting. */
12835 if (is_sync)
12836 aarch64_emit_load_exclusive (mode, old_out, mem,
12837 GEN_INT (MEMMODEL_RELAXED));
12838 else
12839 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12841 switch (code)
12843 case SET:
12844 new_out = value;
12845 break;
12847 case NOT:
12848 x = gen_rtx_AND (wmode, old_out, value);
12849 emit_insn (gen_rtx_SET (new_out, x));
12850 x = gen_rtx_NOT (wmode, new_out);
12851 emit_insn (gen_rtx_SET (new_out, x));
12852 break;
12854 case MINUS:
12855 if (CONST_INT_P (value))
12857 value = GEN_INT (-INTVAL (value));
12858 code = PLUS;
12860 /* Fall through. */
12862 default:
12863 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12864 emit_insn (gen_rtx_SET (new_out, x));
12865 break;
12868 aarch64_emit_store_exclusive (mode, cond, mem,
12869 gen_lowpart (mode, new_out), model_rtx);
12871 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12872 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12873 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12874 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12876 /* Emit any final barrier needed for a __sync operation. */
12877 if (is_sync)
12878 aarch64_emit_post_barrier (model);
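/* Illustrative sketch, not GCC code: the retry structure of the split
   sequence above, expressed as a source-level compare-exchange loop.  A
   failed weak exchange refreshes OLD, much as a failed store-exclusive
   branches back to reload the value.  */
static long
fetch_or_with_loop (long *p, long v)
{
  long old = __atomic_load_n (p, __ATOMIC_RELAXED);
  while (!__atomic_compare_exchange_n (p, &old, old | v, /*weak=*/1,
                                       __ATOMIC_SEQ_CST, __ATOMIC_RELAXED))
    ;
  return old;
}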
12881 static void
12882 aarch64_init_libfuncs (void)
12884 /* Half-precision float operations. The compiler handles all operations
12885 with NULL libfuncs by converting to SFmode. */
12887 /* Conversions. */
12888 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12889 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12891 /* Arithmetic. */
12892 set_optab_libfunc (add_optab, HFmode, NULL);
12893 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12894 set_optab_libfunc (smul_optab, HFmode, NULL);
12895 set_optab_libfunc (neg_optab, HFmode, NULL);
12896 set_optab_libfunc (sub_optab, HFmode, NULL);
12898 /* Comparisons. */
12899 set_optab_libfunc (eq_optab, HFmode, NULL);
12900 set_optab_libfunc (ne_optab, HFmode, NULL);
12901 set_optab_libfunc (lt_optab, HFmode, NULL);
12902 set_optab_libfunc (le_optab, HFmode, NULL);
12903 set_optab_libfunc (ge_optab, HFmode, NULL);
12904 set_optab_libfunc (gt_optab, HFmode, NULL);
12905 set_optab_libfunc (unord_optab, HFmode, NULL);
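/* Illustrative sketch, not part of this file: with the HFmode arithmetic
   optabs cleared above, a half-precision operation is carried out in SFmode
   and narrowed again, roughly as below; the registered
   __gnu_h2f_ieee/__gnu_f2h_ieee helpers can cover the conversions when no
   FP16 hardware support is available.  */
static __fp16
add_half_precision (__fp16 a, __fp16 b)
{
  return (__fp16) ((float) a + (float) b);
}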
12908 /* Target hook for c_mode_for_suffix. */
12909 static machine_mode
12910 aarch64_c_mode_for_suffix (char suffix)
12912 if (suffix == 'q')
12913 return TFmode;
12915 return VOIDmode;
12918 /* We can only represent floating point constants which will fit in
12919 "quarter-precision" values. These values are characterised by
12920 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
12923 (-1)^s * (n/16) * 2^r
12925 Where:
12926 's' is the sign bit.
12927 'n' is an integer in the range 16 <= n <= 31.
12928 'r' is an integer in the range -3 <= r <= 4. */
12930 /* Return true iff X can be represented by a quarter-precision
12931 floating point immediate operand. Note, we cannot represent 0.0. */
12932 bool
12933 aarch64_float_const_representable_p (rtx x)
12935 /* This represents our current view of how many bits
12936 make up the mantissa. */
12937 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12938 int exponent;
12939 unsigned HOST_WIDE_INT mantissa, mask;
12940 REAL_VALUE_TYPE r, m;
12941 bool fail;
12943 if (!CONST_DOUBLE_P (x))
12944 return false;
12946 /* We don't support HFmode constants yet. */
12947 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12948 return false;
12950 r = *CONST_DOUBLE_REAL_VALUE (x);
12952 /* We cannot represent infinities, NaNs or +/-zero. We won't
12953 know if we have +zero until we analyse the mantissa, but we
12954 can reject the other invalid values. */
12955 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12956 || REAL_VALUE_MINUS_ZERO (r))
12957 return false;
12959 /* Extract exponent. */
12960 r = real_value_abs (&r);
12961 exponent = REAL_EXP (&r);
12963 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12964 highest (sign) bit, with a fixed binary point at bit point_pos.
12965 m1 holds the low part of the mantissa, m2 the high part.
12966 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12967 bits for the mantissa, this can fail (low bits will be lost). */
12968 real_ldexp (&m, &r, point_pos - exponent);
12969 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12971 /* If the low part of the mantissa has bits set we cannot represent
12972 the value. */
12973 if (w.ulow () != 0)
12974 return false;
12975 /* We have rejected the lower HOST_WIDE_INT, so update our
12976 understanding of how many bits lie in the mantissa and
12977 look only at the high HOST_WIDE_INT. */
12978 mantissa = w.elt (1);
12979 point_pos -= HOST_BITS_PER_WIDE_INT;
12981 /* We can only represent values with a mantissa of the form 1.xxxx. */
12982 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12983 if ((mantissa & mask) != 0)
12984 return false;
12986 /* Having filtered unrepresentable values, we may now remove all
12987 but the highest 5 bits. */
12988 mantissa >>= point_pos - 5;
12990 /* We cannot represent the value 0.0, so reject it. This is handled
12991 elsewhere. */
12992 if (mantissa == 0)
12993 return false;
12995 /* Then, as bit 4 is always set, we can mask it off, leaving
12996 the mantissa in the range [0, 15]. */
12997 mantissa &= ~(1 << 4);
12998 gcc_assert (mantissa <= 15);
13000 /* GCC internally does not use IEEE754-like encoding (where normalized
13001 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
13002 Our mantissa values are shifted 4 places to the left relative to
13003 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
13004 by 5 places to correct for GCC's representation. */
13005 exponent = 5 - exponent;
13007 return (exponent >= 0 && exponent <= 7);
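/* Illustrative sketch, not GCC code: a brute-force check of the
   quarter-precision form documented above, (-1)^s * (n/16) * 2^r with
   16 <= n <= 31 and -3 <= r <= 4.  It needs <math.h> for ldexp/isfinite;
   the function above reaches the same answer by inspecting the mantissa
   and exponent bits directly.  */
#include <math.h>

static int
fmov_immediate_candidate_p (double x)
{
  if (x == 0.0 || !isfinite (x))
    return 0;
  for (int n = 16; n <= 31; n++)
    for (int r = -3; r <= 4; r++)
      {
        double candidate = ldexp ((double) n / 16.0, r);
        if (x == candidate || x == -candidate)
          return 1;
      }
  return 0;
}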
13010 char*
13011 aarch64_output_simd_mov_immediate (rtx const_vector,
13012 machine_mode mode,
13013 unsigned width)
13015 bool is_valid;
13016 static char templ[40];
13017 const char *mnemonic;
13018 const char *shift_op;
13019 unsigned int lane_count = 0;
13020 char element_char;
13022 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13024 /* This will return true to show const_vector is legal for use as either
13025 an AdvSIMD MOVI instruction immediate or, implicitly, an MVNI immediate. It will
13026 also update INFO to show how the immediate should be generated. */
13027 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13028 gcc_assert (is_valid);
13030 element_char = sizetochar (info.element_width);
13031 lane_count = width / info.element_width;
13033 mode = GET_MODE_INNER (mode);
13034 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13036 gcc_assert (info.shift == 0 && ! info.mvn);
13037 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13038 move immediate path. */
13039 if (aarch64_float_const_zero_rtx_p (info.value))
13040 info.value = GEN_INT (0);
13041 else
13043 const unsigned int buf_size = 20;
13044 char float_buf[buf_size] = {'\0'};
13045 real_to_decimal_for_mode (float_buf,
13046 CONST_DOUBLE_REAL_VALUE (info.value),
13047 buf_size, buf_size, 1, mode);
13049 if (lane_count == 1)
13050 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13051 else
13052 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13053 lane_count, element_char, float_buf);
13054 return templ;
13058 mnemonic = info.mvn ? "mvni" : "movi";
13059 shift_op = info.msl ? "msl" : "lsl";
13061 gcc_assert (CONST_INT_P (info.value));
13062 if (lane_count == 1)
13063 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13064 mnemonic, UINTVAL (info.value));
13065 else if (info.shift)
13066 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13067 ", %s %d", mnemonic, lane_count, element_char,
13068 UINTVAL (info.value), shift_op, info.shift);
13069 else
13070 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13071 mnemonic, lane_count, element_char, UINTVAL (info.value));
13072 return templ;
13075 char*
13076 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13079 /* If a floating point number was passed and we desire to use it in an
13080 integer mode do the conversion to integer. */
13081 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13083 unsigned HOST_WIDE_INT ival;
13084 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13085 gcc_unreachable ();
13086 immediate = gen_int_mode (ival, mode);
13089 machine_mode vmode;
13090 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
13091 a 128-bit vector mode. */
13092 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13094 vmode = aarch64_simd_container_mode (mode, width);
13095 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13096 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13099 /* Split operands into moves from op[1] + op[2] into op[0]. */
13101 void
13102 aarch64_split_combinev16qi (rtx operands[3])
13104 unsigned int dest = REGNO (operands[0]);
13105 unsigned int src1 = REGNO (operands[1]);
13106 unsigned int src2 = REGNO (operands[2]);
13107 machine_mode halfmode = GET_MODE (operands[1]);
13108 unsigned int halfregs = REG_NREGS (operands[1]);
13109 rtx destlo, desthi;
13111 gcc_assert (halfmode == V16QImode);
13113 if (src1 == dest && src2 == dest + halfregs)
13115 /* No-op move. Can't split to nothing; emit something. */
13116 emit_note (NOTE_INSN_DELETED);
13117 return;
13120 /* Preserve register attributes for variable tracking. */
13121 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13122 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13123 GET_MODE_SIZE (halfmode));
13125 /* Special case of reversed high/low parts. */
13126 if (reg_overlap_mentioned_p (operands[2], destlo)
13127 && reg_overlap_mentioned_p (operands[1], desthi))
13129 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13130 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13131 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13133 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13135 /* Try to avoid unnecessary moves if part of the result
13136 is in the right place already. */
13137 if (src1 != dest)
13138 emit_move_insn (destlo, operands[1]);
13139 if (src2 != dest + halfregs)
13140 emit_move_insn (desthi, operands[2]);
13142 else
13144 if (src2 != dest + halfregs)
13145 emit_move_insn (desthi, operands[2]);
13146 if (src1 != dest)
13147 emit_move_insn (destlo, operands[1]);
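/* Illustrative sketch, not GCC code: the three-EOR exchange emitted above
   when the destination halves are exactly the two sources in swapped
   order, so the registers can be exchanged without a scratch.  Assumes A
   and B are distinct, as the two source registers are.  */
static void
xor_swap (unsigned int *a, unsigned int *b)
{
  *a ^= *b;
  *b ^= *a;
  *a ^= *b;
}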
13151 /* vec_perm support. */
13153 #define MAX_VECT_LEN 16
13155 struct expand_vec_perm_d
13157 rtx target, op0, op1;
13158 unsigned char perm[MAX_VECT_LEN];
13159 machine_mode vmode;
13160 unsigned char nelt;
13161 bool one_vector_p;
13162 bool testing_p;
13165 /* Generate a variable permutation. */
13167 static void
13168 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13170 machine_mode vmode = GET_MODE (target);
13171 bool one_vector_p = rtx_equal_p (op0, op1);
13173 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13174 gcc_checking_assert (GET_MODE (op0) == vmode);
13175 gcc_checking_assert (GET_MODE (op1) == vmode);
13176 gcc_checking_assert (GET_MODE (sel) == vmode);
13177 gcc_checking_assert (TARGET_SIMD);
13179 if (one_vector_p)
13181 if (vmode == V8QImode)
13183 /* Expand the argument to a V16QI mode by duplicating it. */
13184 rtx pair = gen_reg_rtx (V16QImode);
13185 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13186 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13188 else
13190 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13193 else
13195 rtx pair;
13197 if (vmode == V8QImode)
13199 pair = gen_reg_rtx (V16QImode);
13200 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13201 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13203 else
13205 pair = gen_reg_rtx (OImode);
13206 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13207 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13212 void
13213 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13215 machine_mode vmode = GET_MODE (target);
13216 unsigned int nelt = GET_MODE_NUNITS (vmode);
13217 bool one_vector_p = rtx_equal_p (op0, op1);
13218 rtx mask;
13220 /* The TBL instruction does not use a modulo index, so we must take care
13221 of that ourselves. */
13222 mask = aarch64_simd_gen_const_vector_dup (vmode,
13223 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13224 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13226 /* For big-endian, we also need to reverse the index within the vector
13227 (but not which vector). */
13228 if (BYTES_BIG_ENDIAN)
13230 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13231 if (!one_vector_p)
13232 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13233 sel = expand_simple_binop (vmode, XOR, sel, mask,
13234 NULL, 0, OPTAB_LIB_WIDEN);
13236 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
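/* Illustrative sketch, not GCC code: the index fix-ups applied above, shown
   on a single scalar index.  TBL does not reduce indices modulo the table
   size, so they are masked into range first; on big-endian the lane order
   within each input register is reversed, so the lane bits of the index
   are flipped while the bit choosing between the two inputs is left
   alone.  */
static unsigned int
fixup_tbl_index (unsigned int idx, unsigned int nelt,
                 int one_vector_p, int big_endian)
{
  idx &= one_vector_p ? nelt - 1 : 2 * nelt - 1;
  if (big_endian)
    idx ^= nelt - 1;
  return idx;
}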
13239 /* Recognize patterns suitable for the TRN instructions. */
13240 static bool
13241 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13243 unsigned int i, odd, mask, nelt = d->nelt;
13244 rtx out, in0, in1, x;
13245 rtx (*gen) (rtx, rtx, rtx);
13246 machine_mode vmode = d->vmode;
13248 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13249 return false;
13251 /* Note that these are little-endian tests.
13252 We correct for big-endian later. */
13253 if (d->perm[0] == 0)
13254 odd = 0;
13255 else if (d->perm[0] == 1)
13256 odd = 1;
13257 else
13258 return false;
13259 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13261 for (i = 0; i < nelt; i += 2)
13263 if (d->perm[i] != i + odd)
13264 return false;
13265 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13266 return false;
13269 /* Success! */
13270 if (d->testing_p)
13271 return true;
13273 in0 = d->op0;
13274 in1 = d->op1;
13275 if (BYTES_BIG_ENDIAN)
13277 x = in0, in0 = in1, in1 = x;
13278 odd = !odd;
13280 out = d->target;
13282 if (odd)
13284 switch (vmode)
13286 case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13287 case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13288 case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13289 case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13290 case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13291 case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13292 case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13293 case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13294 case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13295 case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13296 case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13297 case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13298 default:
13299 return false;
13302 else
13304 switch (vmode)
13306 case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13307 case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13308 case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13309 case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13310 case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13311 case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13312 case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13313 case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13314 case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13315 case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13316 case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13317 case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13318 default:
13319 return false;
13323 emit_insn (gen (out, in0, in1));
13324 return true;
13327 /* Recognize patterns suitable for the UZP instructions. */
13328 static bool
13329 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13331 unsigned int i, odd, mask, nelt = d->nelt;
13332 rtx out, in0, in1, x;
13333 rtx (*gen) (rtx, rtx, rtx);
13334 machine_mode vmode = d->vmode;
13336 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13337 return false;
13339 /* Note that these are little-endian tests.
13340 We correct for big-endian later. */
13341 if (d->perm[0] == 0)
13342 odd = 0;
13343 else if (d->perm[0] == 1)
13344 odd = 1;
13345 else
13346 return false;
13347 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13349 for (i = 0; i < nelt; i++)
13351 unsigned elt = (i * 2 + odd) & mask;
13352 if (d->perm[i] != elt)
13353 return false;
13356 /* Success! */
13357 if (d->testing_p)
13358 return true;
13360 in0 = d->op0;
13361 in1 = d->op1;
13362 if (BYTES_BIG_ENDIAN)
13364 x = in0, in0 = in1, in1 = x;
13365 odd = !odd;
13367 out = d->target;
13369 if (odd)
13371 switch (vmode)
13373 case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13374 case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13375 case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13376 case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13377 case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13378 case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13379 case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13380 case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13381 case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13382 case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13383 case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13384 case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13385 default:
13386 return false;
13389 else
13391 switch (vmode)
13393 case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13394 case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13395 case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13396 case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13397 case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13398 case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13399 case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13400 case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13401 case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13402 case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13403 case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13404 case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13405 default:
13406 return false;
13410 emit_insn (gen (out, in0, in1));
13411 return true;
13414 /* Recognize patterns suitable for the ZIP instructions. */
13415 static bool
13416 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13418 unsigned int i, high, mask, nelt = d->nelt;
13419 rtx out, in0, in1, x;
13420 rtx (*gen) (rtx, rtx, rtx);
13421 machine_mode vmode = d->vmode;
13423 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13424 return false;
13426 /* Note that these are little-endian tests.
13427 We correct for big-endian later. */
13428 high = nelt / 2;
13429 if (d->perm[0] == high)
13430 /* Do Nothing. */
13432 else if (d->perm[0] == 0)
13433 high = 0;
13434 else
13435 return false;
13436 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13438 for (i = 0; i < nelt / 2; i++)
13440 unsigned elt = (i + high) & mask;
13441 if (d->perm[i * 2] != elt)
13442 return false;
13443 elt = (elt + nelt) & mask;
13444 if (d->perm[i * 2 + 1] != elt)
13445 return false;
13448 /* Success! */
13449 if (d->testing_p)
13450 return true;
13452 in0 = d->op0;
13453 in1 = d->op1;
13454 if (BYTES_BIG_ENDIAN)
13456 x = in0, in0 = in1, in1 = x;
13457 high = !high;
13459 out = d->target;
13461 if (high)
13463 switch (vmode)
13465 case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13466 case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13467 case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13468 case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13469 case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13470 case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13471 case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13472 case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13473 case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13474 case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13475 case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13476 case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13477 default:
13478 return false;
13481 else
13483 switch (vmode)
13485 case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13486 case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13487 case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13488 case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13489 case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13490 case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13491 case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13492 case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13493 case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13494 case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13495 case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13496 case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13497 default:
13498 return false;
13502 emit_insn (gen (out, in0, in1));
13503 return true;
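/* Illustrative sketch, not GCC code: the little-endian index patterns the
   TRN, UZP and ZIP recognisers above accept, generated explicitly.  With
   two four-element operands (indices 4..7 naming the second operand) these
   produce trn1 {0,4,2,6}, trn2 {1,5,3,7}, uzp1 {0,2,4,6}, uzp2 {1,3,5,7},
   zip1 {0,4,1,5} and zip2 {2,6,3,7}.  */
static void
trn_pattern (unsigned int nelt, int odd, unsigned char *perm)
{
  for (unsigned int i = 0; i < nelt; i += 2)
    {
      perm[i] = i + odd;
      perm[i + 1] = i + nelt + odd;
    }
}

static void
uzp_pattern (unsigned int nelt, int odd, unsigned char *perm)
{
  for (unsigned int i = 0; i < nelt; i++)
    perm[i] = i * 2 + odd;
}

static void
zip_pattern (unsigned int nelt, int high, unsigned char *perm)
{
  unsigned int base = high ? nelt / 2 : 0;
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      perm[i * 2] = base + i;
      perm[i * 2 + 1] = base + i + nelt;
    }
}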
13506 /* Recognize patterns for the EXT insn. */
13508 static bool
13509 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13511 unsigned int i, nelt = d->nelt;
13512 rtx (*gen) (rtx, rtx, rtx, rtx);
13513 rtx offset;
13515 unsigned int location = d->perm[0]; /* Always < nelt. */
13517 /* Check if the extracted indices are increasing by one. */
13518 for (i = 1; i < nelt; i++)
13520 unsigned int required = location + i;
13521 if (d->one_vector_p)
13523 /* We'll pass the same vector in twice, so allow indices to wrap. */
13524 required &= (nelt - 1);
13526 if (d->perm[i] != required)
13527 return false;
13530 switch (d->vmode)
13532 case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13533 case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13534 case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13535 case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13536 case E_V2SImode: gen = gen_aarch64_extv2si; break;
13537 case E_V4SImode: gen = gen_aarch64_extv4si; break;
13538 case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13539 case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13540 case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13541 case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13542 case E_V2DImode: gen = gen_aarch64_extv2di; break;
13543 case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13544 default:
13545 return false;
13548 /* Success! */
13549 if (d->testing_p)
13550 return true;
13552 /* The case where (location == 0) is a no-op for both big- and little-endian,
13553 and is removed by the mid-end at optimization levels -O1 and higher. */
13555 if (BYTES_BIG_ENDIAN && (location != 0))
13557 /* After setup, we want the high elements of the first vector (stored
13558 at the LSB end of the register), and the low elements of the second
13559 vector (stored at the MSB end of the register). So swap. */
13560 std::swap (d->op0, d->op1);
13561 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13562 location = nelt - location;
13565 offset = GEN_INT (location);
13566 emit_insn (gen (d->target, d->op0, d->op1, offset));
13567 return true;
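/* Illustrative sketch, not GCC code: the index pattern the EXT recogniser
   above matches, namely consecutive elements starting at LOCATION, wrapping
   around when both operands are the same vector (which is then passed in
   twice).  */
static void
ext_pattern (unsigned int nelt, unsigned int location, int one_vector_p,
             unsigned char *perm)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int idx = location + i;
      if (one_vector_p)
        idx &= nelt - 1;
      perm[i] = idx;
    }
}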
13570 /* Recognize patterns for the REV insns. */
13572 static bool
13573 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13575 unsigned int i, j, diff, nelt = d->nelt;
13576 rtx (*gen) (rtx, rtx);
13578 if (!d->one_vector_p)
13579 return false;
13581 diff = d->perm[0];
13582 switch (diff)
13584 case 7:
13585 switch (d->vmode)
13587 case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13588 case E_V8QImode: gen = gen_aarch64_rev64v8qi; break;
13589 default:
13590 return false;
13592 break;
13593 case 3:
13594 switch (d->vmode)
13596 case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13597 case E_V8QImode: gen = gen_aarch64_rev32v8qi; break;
13598 case E_V8HImode: gen = gen_aarch64_rev64v8hi; break;
13599 case E_V4HImode: gen = gen_aarch64_rev64v4hi; break;
13600 default:
13601 return false;
13603 break;
13604 case 1:
13605 switch (d->vmode)
13607 case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13608 case E_V8QImode: gen = gen_aarch64_rev16v8qi; break;
13609 case E_V8HImode: gen = gen_aarch64_rev32v8hi; break;
13610 case E_V4HImode: gen = gen_aarch64_rev32v4hi; break;
13611 case E_V4SImode: gen = gen_aarch64_rev64v4si; break;
13612 case E_V2SImode: gen = gen_aarch64_rev64v2si; break;
13613 case E_V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13614 case E_V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13615 case E_V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13616 case E_V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13617 default:
13618 return false;
13620 break;
13621 default:
13622 return false;
13625 for (i = 0; i < nelt ; i += diff + 1)
13626 for (j = 0; j <= diff; j += 1)
13628 /* This is guaranteed to be true as the value of diff
13629 is 7, 3 or 1 and we should have enough elements in the
13630 queue to generate this. Getting a vector mask with a
13631 value of diff other than these values implies that
13632 something is wrong by the time we get here. */
13633 gcc_assert (i + j < nelt);
13634 if (d->perm[i + j] != i + diff - j)
13635 return false;
13638 /* Success! */
13639 if (d->testing_p)
13640 return true;
13642 emit_insn (gen (d->target, d->op0));
13643 return true;
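/* Illustrative sketch, not GCC code: the index pattern the REV recogniser
   above matches, namely elements reversed within groups of DIFF + 1, where
   DIFF is 7, 3 or 1.  For V8QI with DIFF == 7 this yields
   {7,6,5,4,3,2,1,0} (REV64); for V8HI with DIFF == 1 it yields
   {1,0,3,2,5,4,7,6} (REV32).  */
static void
rev_pattern (unsigned int nelt, unsigned int diff, unsigned char *perm)
{
  for (unsigned int i = 0; i < nelt; i += diff + 1)
    for (unsigned int j = 0; j <= diff; j++)
      perm[i + j] = i + diff - j;
}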
13646 static bool
13647 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13649 rtx (*gen) (rtx, rtx, rtx);
13650 rtx out = d->target;
13651 rtx in0;
13652 machine_mode vmode = d->vmode;
13653 unsigned int i, elt, nelt = d->nelt;
13654 rtx lane;
13656 elt = d->perm[0];
13657 for (i = 1; i < nelt; i++)
13659 if (elt != d->perm[i])
13660 return false;
13663 /* The generic preparation in aarch64_expand_vec_perm_const_1
13664 swaps the operand order and the permute indices if it finds
13665 d->perm[0] to be in the second operand. Thus, we can always
13666 use d->op0 and need not do any extra arithmetic to get the
13667 correct lane number. */
13668 in0 = d->op0;
13669 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13671 switch (vmode)
13673 case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13674 case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13675 case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13676 case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13677 case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13678 case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13679 case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13680 case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13681 case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13682 case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13683 case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13684 case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13685 default:
13686 return false;
13689 emit_insn (gen (out, in0, lane));
13690 return true;
13693 static bool
13694 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13696 rtx rperm[MAX_VECT_LEN], sel;
13697 machine_mode vmode = d->vmode;
13698 unsigned int i, nelt = d->nelt;
13700 if (d->testing_p)
13701 return true;
13703 /* Generic code will try constant permutation twice. Once with the
13704 original mode and again with the elements lowered to QImode.
13705 So wait and don't do the selector expansion ourselves. */
13706 if (vmode != V8QImode && vmode != V16QImode)
13707 return false;
13709 for (i = 0; i < nelt; ++i)
13711 int nunits = GET_MODE_NUNITS (vmode);
13713 /* If big-endian and two vectors we end up with a weird mixed-endian
13714 mode on NEON. Reverse the index within each word but not the word
13715 itself. */
13716 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13717 : d->perm[i]);
13719 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13720 sel = force_reg (vmode, sel);
13722 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13723 return true;
13726 static bool
13727 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13729 /* The pattern matching functions above are written to look for a small
13730 number to begin the sequence (0, 1, N/2). If we begin with an index
13731 from the second operand, we can swap the operands. */
13732 if (d->perm[0] >= d->nelt)
13734 unsigned i, nelt = d->nelt;
13736 gcc_assert (nelt == (nelt & -nelt));
13737 for (i = 0; i < nelt; ++i)
13738 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13740 std::swap (d->op0, d->op1);
13743 if (TARGET_SIMD)
13745 if (aarch64_evpc_rev (d))
13746 return true;
13747 else if (aarch64_evpc_ext (d))
13748 return true;
13749 else if (aarch64_evpc_dup (d))
13750 return true;
13751 else if (aarch64_evpc_zip (d))
13752 return true;
13753 else if (aarch64_evpc_uzp (d))
13754 return true;
13755 else if (aarch64_evpc_trn (d))
13756 return true;
13757 return aarch64_evpc_tbl (d);
13759 return false;
13762 /* Expand a vec_perm_const pattern. */
13764 bool
13765 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13767 struct expand_vec_perm_d d;
13768 int i, nelt, which;
13770 d.target = target;
13771 d.op0 = op0;
13772 d.op1 = op1;
13774 d.vmode = GET_MODE (target);
13775 gcc_assert (VECTOR_MODE_P (d.vmode));
13776 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13777 d.testing_p = false;
13779 for (i = which = 0; i < nelt; ++i)
13781 rtx e = XVECEXP (sel, 0, i);
13782 int ei = INTVAL (e) & (2 * nelt - 1);
13783 which |= (ei < nelt ? 1 : 2);
13784 d.perm[i] = ei;
13787 switch (which)
13789 default:
13790 gcc_unreachable ();
13792 case 3:
13793 d.one_vector_p = false;
13794 if (!rtx_equal_p (op0, op1))
13795 break;
13797 /* The elements of PERM do not suggest that only the first operand
13798 is used, but both operands are identical. Allow easier matching
13799 of the permutation by folding the permutation into the single
13800 input vector. */
13801 /* Fall Through. */
13802 case 2:
13803 for (i = 0; i < nelt; ++i)
13804 d.perm[i] &= nelt - 1;
13805 d.op0 = op1;
13806 d.one_vector_p = true;
13807 break;
13809 case 1:
13810 d.op1 = op0;
13811 d.one_vector_p = true;
13812 break;
13815 return aarch64_expand_vec_perm_const_1 (&d);
13818 static bool
13819 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13820 const unsigned char *sel)
13822 struct expand_vec_perm_d d;
13823 unsigned int i, nelt, which;
13824 bool ret;
13826 d.vmode = vmode;
13827 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13828 d.testing_p = true;
13829 memcpy (d.perm, sel, nelt);
13831 /* Calculate whether all elements are in one vector. */
13832 for (i = which = 0; i < nelt; ++i)
13834 unsigned char e = d.perm[i];
13835 gcc_assert (e < 2 * nelt);
13836 which |= (e < nelt ? 1 : 2);
13839 /* If all elements are from the second vector, reindex as if from the
13840 first vector. */
13841 if (which == 2)
13842 for (i = 0; i < nelt; ++i)
13843 d.perm[i] -= nelt;
13845 /* Check whether the mask can be applied to a single vector. */
13846 d.one_vector_p = (which != 3);
13848 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13849 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13850 if (!d.one_vector_p)
13851 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13853 start_sequence ();
13854 ret = aarch64_expand_vec_perm_const_1 (&d);
13855 end_sequence ();
13857 return ret;
13860 rtx
13861 aarch64_reverse_mask (machine_mode mode)
13863 /* We have to reverse each vector because we don't have
13864 a permuted load that can reverse-load according to ABI rules. */
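/* For example (illustrative), for V4SImode this builds the byte selector
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. each
   4-byte element is byte-reversed in place.  */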
13865 rtx mask;
13866 rtvec v = rtvec_alloc (16);
13867 int i, j;
13868 int nunits = GET_MODE_NUNITS (mode);
13869 int usize = GET_MODE_UNIT_SIZE (mode);
13871 gcc_assert (BYTES_BIG_ENDIAN);
13872 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13874 for (i = 0; i < nunits; i++)
13875 for (j = 0; j < usize; j++)
13876 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13877 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13878 return force_reg (V16QImode, mask);
13881 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
13882 true. However due to issues with register allocation it is preferable
13883 to avoid tying integer scalar and FP scalar modes. Executing integer
13884 operations in general registers is better than treating them as scalar
13885 vector operations. This reduces latency and avoids redundant int<->FP
13886 moves. So tie modes if they are either the same class, or vector modes
13887 with other vector modes, vector structs or any scalar mode. */
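/* Illustrative consequences of this rule: DImode and DFmode are not
   tieable (different scalar classes), whereas V2DImode is tieable both
   with other vector modes and with scalar modes such as DFmode.  */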
13889 static bool
13890 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13892 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13893 return true;
13895 /* We specifically want to allow elements of "structure" modes to
13896 be tieable to the structure. This more general condition allows
13897 other rarer situations too. */
13898 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13899 return true;
13901 /* Also allow any scalar modes with vectors. */
13902 if (aarch64_vector_mode_supported_p (mode1)
13903 || aarch64_vector_mode_supported_p (mode2))
13904 return true;
13906 return false;
13909 /* Return a new RTX holding the result of moving POINTER forward by
13910 AMOUNT bytes. */
13912 static rtx
13913 aarch64_move_pointer (rtx pointer, int amount)
13915 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13917 return adjust_automodify_address (pointer, GET_MODE (pointer),
13918 next, amount);
13921 /* Return a new RTX holding the result of moving POINTER forward by the
13922 size of the mode it points to. */
13924 static rtx
13925 aarch64_progress_pointer (rtx pointer)
13927 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13929 return aarch64_move_pointer (pointer, amount);
13932 /* Copy one MODE sized block from SRC to DST, then advance SRC and DST by
13933 the size of MODE. */
13935 static void
13936 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13937 machine_mode mode)
13939 rtx reg = gen_reg_rtx (mode);
13941 /* "Cast" the pointers to the correct mode. */
13942 *src = adjust_address (*src, mode, 0);
13943 *dst = adjust_address (*dst, mode, 0);
13944 /* Emit the memcpy. */
13945 emit_move_insn (reg, *src);
13946 emit_move_insn (*dst, reg);
13947 /* Move the pointers forward. */
13948 *src = aarch64_progress_pointer (*src);
13949 *dst = aarch64_progress_pointer (*dst);
13952 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13953 we succeed, otherwise return false. */
13955 bool
13956 aarch64_expand_movmem (rtx *operands)
13958 unsigned int n;
13959 rtx dst = operands[0];
13960 rtx src = operands[1];
13961 rtx base;
13962 bool speed_p = !optimize_function_for_size_p (cfun);
13964 /* When optimizing for size, give a better estimate of the length of a
13965 memcpy call, but use the default otherwise. */
13966 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13968 /* We can't do anything smart if the amount to copy is not constant. */
13969 if (!CONST_INT_P (operands[2]))
13970 return false;
13972 n = UINTVAL (operands[2]);
13974 /* Try to keep the number of instructions low. For cases below 16 bytes we
13975 need to make at most two moves. For cases above 16 bytes it will be one
13976 move for each 16 byte chunk, then at most two additional moves. */
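/* For instance (illustrative), a 10-byte copy becomes an 8-byte (DImode)
   chunk followed by a 2-byte (HImode) chunk, while a 35-byte copy becomes
   two 16-byte (TImode) chunks followed by a 4-byte (SImode) chunk that
   overlaps the previously copied data by one byte.  */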
13977 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13978 return false;
13980 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13981 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13983 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13984 src = adjust_automodify_address (src, VOIDmode, base, 0);
13986 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13987 1-byte chunk. */
13988 if (n < 4)
13990 if (n >= 2)
13992 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13993 n -= 2;
13996 if (n == 1)
13997 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13999 return true;
14002 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
14003 4-byte chunk, partially overlapping with the previously copied chunk. */
14004 if (n < 8)
14006 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14007 n -= 4;
14008 if (n > 0)
14010 int move = n - 4;
14012 src = aarch64_move_pointer (src, move);
14013 dst = aarch64_move_pointer (dst, move);
14014 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14016 return true;
14019 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
14020 them, then (if applicable) an 8-byte chunk. */
14021 while (n >= 8)
14023 if (n / 16)
14025 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14026 n -= 16;
14028 else
14030 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14031 n -= 8;
14035 /* Finish the final bytes of the copy. We can always do this in one
14036 instruction. We either copy the exact amount we need, or partially
14037 overlap with the previous chunk we copied and copy 8 bytes. */
14038 if (n == 0)
14039 return true;
14040 else if (n == 1)
14041 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14042 else if (n == 2)
14043 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14044 else if (n == 4)
14045 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14046 else
14048 if (n == 3)
14050 src = aarch64_move_pointer (src, -1);
14051 dst = aarch64_move_pointer (dst, -1);
14052 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14054 else
14056 int move = n - 8;
14058 src = aarch64_move_pointer (src, move);
14059 dst = aarch64_move_pointer (dst, move);
14060 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14064 return true;
14067 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14068 SImode stores. Handle the case when the constant has identical
14069 bottom and top halves. This is beneficial when the two stores can be
14070 merged into an STP and we avoid synthesising potentially expensive
14071 immediates twice. Return true if such a split is possible. */
14073 bool
14074 aarch64_split_dimode_const_store (rtx dst, rtx src)
14076 rtx lo = gen_lowpart (SImode, src);
14077 rtx hi = gen_highpart_mode (SImode, DImode, src);
14079 bool size_p = optimize_function_for_size_p (cfun);
14081 if (!rtx_equal_p (lo, hi))
14082 return false;
14084 unsigned int orig_cost
14085 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14086 unsigned int lo_cost
14087 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14089 /* We want to transform:
14090 MOV x1, 49370
14091 MOVK x1, 0x140, lsl 16
14092 MOVK x1, 0xc0da, lsl 32
14093 MOVK x1, 0x140, lsl 48
14094 STR x1, [x0]
14095 into:
14096 MOV w1, 49370
14097 MOVK w1, 0x140, lsl 16
14098 STP w1, w1, [x0]
14099 So we want to perform this only when we save two instructions
14100 or more. When optimizing for size, however, accept any code size
14101 savings we can. */
14102 if (size_p && orig_cost <= lo_cost)
14103 return false;
14105 if (!size_p
14106 && (orig_cost <= lo_cost + 1))
14107 return false;
14109 rtx mem_lo = adjust_address (dst, SImode, 0);
14110 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14111 return false;
14113 rtx tmp_reg = gen_reg_rtx (SImode);
14114 aarch64_expand_mov_immediate (tmp_reg, lo);
14115 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14116 /* Don't emit an explicit store pair as this may not always be profitable.
14117 Let the sched-fusion logic decide whether to merge them. */
14118 emit_move_insn (mem_lo, tmp_reg);
14119 emit_move_insn (mem_hi, tmp_reg);
14121 return true;
14124 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
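/* Combined with the generic ASAN_SHADOW_SHIFT of 3 (see asan.h), this
   places the shadow byte for ADDR at (ADDR >> 3) + (1 << 36)
   (descriptive note).  */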
14126 static unsigned HOST_WIDE_INT
14127 aarch64_asan_shadow_offset (void)
14129 return (HOST_WIDE_INT_1 << 36);
14132 static bool
14133 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14134 unsigned int align,
14135 enum by_pieces_operation op,
14136 bool speed_p)
14138 /* STORE_BY_PIECES can be used when copying a constant string, but
14139 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14140 For now we always fail this and let the move_by_pieces code copy
14141 the string from read-only memory. */
14142 if (op == STORE_BY_PIECES)
14143 return false;
14145 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14148 static rtx
14149 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14150 int code, tree treeop0, tree treeop1)
14152 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14153 rtx op0, op1;
14154 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14155 insn_code icode;
14156 struct expand_operand ops[4];
14158 start_sequence ();
14159 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14161 op_mode = GET_MODE (op0);
14162 if (op_mode == VOIDmode)
14163 op_mode = GET_MODE (op1);
14165 switch (op_mode)
14167 case E_QImode:
14168 case E_HImode:
14169 case E_SImode:
14170 cmp_mode = SImode;
14171 icode = CODE_FOR_cmpsi;
14172 break;
14174 case E_DImode:
14175 cmp_mode = DImode;
14176 icode = CODE_FOR_cmpdi;
14177 break;
14179 case E_SFmode:
14180 cmp_mode = SFmode;
14181 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14182 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14183 break;
14185 case E_DFmode:
14186 cmp_mode = DFmode;
14187 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14188 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14189 break;
14191 default:
14192 end_sequence ();
14193 return NULL_RTX;
14196 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14197 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14198 if (!op0 || !op1)
14200 end_sequence ();
14201 return NULL_RTX;
14203 *prep_seq = get_insns ();
14204 end_sequence ();
14206 create_fixed_operand (&ops[0], op0);
14207 create_fixed_operand (&ops[1], op1);
14209 start_sequence ();
14210 if (!maybe_expand_insn (icode, 2, ops))
14212 end_sequence ();
14213 return NULL_RTX;
14215 *gen_seq = get_insns ();
14216 end_sequence ();
14218 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14219 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14222 static rtx
14223 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14224 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14226 rtx op0, op1, target;
14227 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14228 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14229 insn_code icode;
14230 struct expand_operand ops[6];
14231 int aarch64_cond;
14233 push_to_sequence (*prep_seq);
14234 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14236 op_mode = GET_MODE (op0);
14237 if (op_mode == VOIDmode)
14238 op_mode = GET_MODE (op1);
14240 switch (op_mode)
14242 case E_QImode:
14243 case E_HImode:
14244 case E_SImode:
14245 cmp_mode = SImode;
14246 icode = CODE_FOR_ccmpsi;
14247 break;
14249 case E_DImode:
14250 cmp_mode = DImode;
14251 icode = CODE_FOR_ccmpdi;
14252 break;
14254 case E_SFmode:
14255 cmp_mode = SFmode;
14256 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14257 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14258 break;
14260 case E_DFmode:
14261 cmp_mode = DFmode;
14262 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14263 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14264 break;
14266 default:
14267 end_sequence ();
14268 return NULL_RTX;
14271 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14272 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14273 if (!op0 || !op1)
14275 end_sequence ();
14276 return NULL_RTX;
14278 *prep_seq = get_insns ();
14279 end_sequence ();
14281 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14282 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14284 if (bit_code != AND)
14286 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14287 GET_MODE (XEXP (prev, 0))),
14288 VOIDmode, XEXP (prev, 0), const0_rtx);
14289 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14292 create_fixed_operand (&ops[0], XEXP (prev, 0));
14293 create_fixed_operand (&ops[1], target);
14294 create_fixed_operand (&ops[2], op0);
14295 create_fixed_operand (&ops[3], op1);
14296 create_fixed_operand (&ops[4], prev);
14297 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14299 push_to_sequence (*gen_seq);
14300 if (!maybe_expand_insn (icode, 6, ops))
14302 end_sequence ();
14303 return NULL_RTX;
14306 *gen_seq = get_insns ();
14307 end_sequence ();
14309 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14312 #undef TARGET_GEN_CCMP_FIRST
14313 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14315 #undef TARGET_GEN_CCMP_NEXT
14316 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
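/* Taken together, these hooks let a chained condition such as
   "a < b && c > d" be expanded without a short-circuit branch, roughly
   (illustrative; register and nzcv choices are made by the expander):
     cmp   w0, w1
     ccmp  w2, w3, #<nzcv>, lt
     b.gt  <target>  */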
14318 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14319 instruction fusion of some sort. */
14321 static bool
14322 aarch64_macro_fusion_p (void)
14324 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14328 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14329 should be kept together during scheduling. */
14331 static bool
14332 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14334 rtx set_dest;
14335 rtx prev_set = single_set (prev);
14336 rtx curr_set = single_set (curr);
14337 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14338 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14340 if (!aarch64_macro_fusion_p ())
14341 return false;
14343 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14345 /* We are trying to match:
14346 prev (mov) == (set (reg r0) (const_int imm16))
14347 curr (movk) == (set (zero_extract (reg r0)
14348 (const_int 16)
14349 (const_int 16))
14350 (const_int imm16_1)) */
14352 set_dest = SET_DEST (curr_set);
14354 if (GET_CODE (set_dest) == ZERO_EXTRACT
14355 && CONST_INT_P (SET_SRC (curr_set))
14356 && CONST_INT_P (SET_SRC (prev_set))
14357 && CONST_INT_P (XEXP (set_dest, 2))
14358 && INTVAL (XEXP (set_dest, 2)) == 16
14359 && REG_P (XEXP (set_dest, 0))
14360 && REG_P (SET_DEST (prev_set))
14361 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14363 return true;
14367 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14370 /* We're trying to match:
14371 prev (adrp) == (set (reg r1)
14372 (high (symbol_ref ("SYM"))))
14373 curr (add) == (set (reg r0)
14374 (lo_sum (reg r1)
14375 (symbol_ref ("SYM"))))
14376 Note that r0 need not necessarily be the same as r1, especially
14377 during pre-regalloc scheduling. */
14379 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14380 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14382 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14383 && REG_P (XEXP (SET_SRC (curr_set), 0))
14384 && REGNO (XEXP (SET_SRC (curr_set), 0))
14385 == REGNO (SET_DEST (prev_set))
14386 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14387 XEXP (SET_SRC (curr_set), 1)))
14388 return true;
14392 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14395 /* We're trying to match:
14396 prev (movk) == (set (zero_extract (reg r0)
14397 (const_int 16)
14398 (const_int 32))
14399 (const_int imm16_1))
14400 curr (movk) == (set (zero_extract (reg r0)
14401 (const_int 16)
14402 (const_int 48))
14403 (const_int imm16_2)) */
14405 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14406 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14407 && REG_P (XEXP (SET_DEST (prev_set), 0))
14408 && REG_P (XEXP (SET_DEST (curr_set), 0))
14409 && REGNO (XEXP (SET_DEST (prev_set), 0))
14410 == REGNO (XEXP (SET_DEST (curr_set), 0))
14411 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14412 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14413 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14414 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14415 && CONST_INT_P (SET_SRC (prev_set))
14416 && CONST_INT_P (SET_SRC (curr_set)))
14417 return true;
14420 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14422 /* We're trying to match:
14423 prev (adrp) == (set (reg r0)
14424 (high (symbol_ref ("SYM"))))
14425 curr (ldr) == (set (reg r1)
14426 (mem (lo_sum (reg r0)
14427 (symbol_ref ("SYM")))))
14429 curr (ldr) == (set (reg r1)
14430 (zero_extend (mem
14431 (lo_sum (reg r0)
14432 (symbol_ref ("SYM")))))) */
14433 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14434 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14436 rtx curr_src = SET_SRC (curr_set);
14438 if (GET_CODE (curr_src) == ZERO_EXTEND)
14439 curr_src = XEXP (curr_src, 0);
14441 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14442 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14443 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14444 == REGNO (SET_DEST (prev_set))
14445 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14446 XEXP (SET_SRC (prev_set), 0)))
14447 return true;
14451 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14452 && aarch_crypto_can_dual_issue (prev, curr))
14453 return true;
14455 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14456 && any_condjump_p (curr))
14458 enum attr_type prev_type = get_attr_type (prev);
14460 unsigned int condreg1, condreg2;
14461 rtx cc_reg_1;
14462 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14463 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14465 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14466 && prev
14467 && modified_in_p (cc_reg_1, prev))
14469 /* FIXME: this misses some instructions which are considered simple
14470 arithmetic instructions for ThunderX. Simple shifts are missed here. */
14471 if (prev_type == TYPE_ALUS_SREG
14472 || prev_type == TYPE_ALUS_IMM
14473 || prev_type == TYPE_LOGICS_REG
14474 || prev_type == TYPE_LOGICS_IMM)
14475 return true;
14479 if (prev_set
14480 && curr_set
14481 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14482 && any_condjump_p (curr))
14484 /* We're trying to match:
14485 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14486 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14487 (const_int 0))
14488 (label_ref ("SYM"))
14489 (pc)) */
14490 if (SET_DEST (curr_set) == (pc_rtx)
14491 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14492 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14493 && REG_P (SET_DEST (prev_set))
14494 && REGNO (SET_DEST (prev_set))
14495 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14497 /* Fuse ALU operations followed by a conditional branch instruction. */
14498 switch (get_attr_type (prev))
14500 case TYPE_ALU_IMM:
14501 case TYPE_ALU_SREG:
14502 case TYPE_ADC_REG:
14503 case TYPE_ADC_IMM:
14504 case TYPE_ADCS_REG:
14505 case TYPE_ADCS_IMM:
14506 case TYPE_LOGIC_REG:
14507 case TYPE_LOGIC_IMM:
14508 case TYPE_CSEL:
14509 case TYPE_ADR:
14510 case TYPE_MOV_IMM:
14511 case TYPE_SHIFT_REG:
14512 case TYPE_SHIFT_IMM:
14513 case TYPE_BFM:
14514 case TYPE_RBIT:
14515 case TYPE_REV:
14516 case TYPE_EXTEND:
14517 return true;
14519 default:;
14524 return false;
14527 /* Return true iff the instruction fusion described by OP is enabled. */
14529 bool
14530 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14532 return (aarch64_tune_params.fusible_ops & op) != 0;
14535 /* If MEM is in the form of [base+offset], extract the two parts
14536 of the address into BASE and OFFSET, otherwise return false
14537 after clearing BASE and OFFSET. */
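/* For example (illustrative), (mem (plus (reg x1) (const_int 16))) yields
   BASE == x1 and OFFSET == 16, while (mem (reg x1)) yields BASE == x1 and
   OFFSET == 0.  */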
14539 bool
14540 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14542 rtx addr;
14544 gcc_assert (MEM_P (mem));
14546 addr = XEXP (mem, 0);
14548 if (REG_P (addr))
14550 *base = addr;
14551 *offset = const0_rtx;
14552 return true;
14555 if (GET_CODE (addr) == PLUS
14556 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14558 *base = XEXP (addr, 0);
14559 *offset = XEXP (addr, 1);
14560 return true;
14563 *base = NULL_RTX;
14564 *offset = NULL_RTX;
14566 return false;
14569 /* Types for scheduling fusion. */
14570 enum sched_fusion_type
14572 SCHED_FUSION_NONE = 0,
14573 SCHED_FUSION_LD_SIGN_EXTEND,
14574 SCHED_FUSION_LD_ZERO_EXTEND,
14575 SCHED_FUSION_LD,
14576 SCHED_FUSION_ST,
14577 SCHED_FUSION_NUM
14580 /* If INSN is a load or store whose address is in the form of [base+offset],
14581 extract the two parts into BASE and OFFSET. Return the scheduling
14582 fusion type of this INSN. */
14584 static enum sched_fusion_type
14585 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14587 rtx x, dest, src;
14588 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14590 gcc_assert (INSN_P (insn));
14591 x = PATTERN (insn);
14592 if (GET_CODE (x) != SET)
14593 return SCHED_FUSION_NONE;
14595 src = SET_SRC (x);
14596 dest = SET_DEST (x);
14598 machine_mode dest_mode = GET_MODE (dest);
14600 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14601 return SCHED_FUSION_NONE;
14603 if (GET_CODE (src) == SIGN_EXTEND)
14605 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14606 src = XEXP (src, 0);
14607 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14608 return SCHED_FUSION_NONE;
14610 else if (GET_CODE (src) == ZERO_EXTEND)
14612 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14613 src = XEXP (src, 0);
14614 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14615 return SCHED_FUSION_NONE;
14618 if (GET_CODE (src) == MEM && REG_P (dest))
14619 extract_base_offset_in_addr (src, base, offset);
14620 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14622 fusion = SCHED_FUSION_ST;
14623 extract_base_offset_in_addr (dest, base, offset);
14625 else
14626 return SCHED_FUSION_NONE;
14628 if (*base == NULL_RTX || *offset == NULL_RTX)
14629 fusion = SCHED_FUSION_NONE;
14631 return fusion;
14634 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14636 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14637 and PRI are only calculated for these instructions. For other instructions,
14638 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14639 types of instruction fusion can be added by returning different priorities.
14641 It's important that irrelevant instructions get the largest FUSION_PRI. */
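/* For example (illustrative), the stores "str w1, [x2, 4]" and
   "str w1, [x2, 8]" receive the same FUSION_PRI (same fusion type and base
   register), while PRI favours the smaller offset, so the scheduler tends
   to keep them adjacent and in offset order, ready for stp formation.  */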
14643 static void
14644 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14645 int *fusion_pri, int *pri)
14647 int tmp, off_val;
14648 rtx base, offset;
14649 enum sched_fusion_type fusion;
14651 gcc_assert (INSN_P (insn));
14653 tmp = max_pri - 1;
14654 fusion = fusion_load_store (insn, &base, &offset);
14655 if (fusion == SCHED_FUSION_NONE)
14657 *pri = tmp;
14658 *fusion_pri = tmp;
14659 return;
14662 /* Set FUSION_PRI according to fusion type and base register. */
14663 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14665 /* Calculate PRI. */
14666 tmp /= 2;
14668 /* INSN with smaller offset goes first. */
14669 off_val = (int)(INTVAL (offset));
14670 if (off_val >= 0)
14671 tmp -= (off_val & 0xfffff);
14672 else
14673 tmp += ((- off_val) & 0xfffff);
14675 *pri = tmp;
14676 return;
14679 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14680 Adjust priority of sha1h instructions so they are scheduled before
14681 other SHA1 instructions. */
14683 static int
14684 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14686 rtx x = PATTERN (insn);
14688 if (GET_CODE (x) == SET)
14690 x = SET_SRC (x);
14692 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14693 return priority + 10;
14696 return priority;
14699 /* Given OPERANDS of consecutive load/store, check if we can merge
14700 them into ldp/stp. LOAD is true if they are load instructions.
14701 MODE is the mode of memory operands. */
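/* For example (illustrative), the pair "ldr w0, [x2]" / "ldr w1, [x2, 4]"
   passes these checks and can be rewritten as "ldp w0, w1, [x2]".  */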
14703 bool
14704 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14705 machine_mode mode)
14707 HOST_WIDE_INT offval_1, offval_2, msize;
14708 enum reg_class rclass_1, rclass_2;
14709 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14711 if (load)
14713 mem_1 = operands[1];
14714 mem_2 = operands[3];
14715 reg_1 = operands[0];
14716 reg_2 = operands[2];
14717 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14718 if (REGNO (reg_1) == REGNO (reg_2))
14719 return false;
14721 else
14723 mem_1 = operands[0];
14724 mem_2 = operands[2];
14725 reg_1 = operands[1];
14726 reg_2 = operands[3];
14729 /* The mems cannot be volatile. */
14730 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14731 return false;
14733 /* If we have SImode and slow unaligned ldp,
14734 check that the alignment is at least 8 bytes. */
14735 if (mode == SImode
14736 && (aarch64_tune_params.extra_tuning_flags
14737 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14738 && !optimize_size
14739 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14740 return false;
14742 /* Check if the addresses are in the form of [base+offset]. */
14743 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14744 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14745 return false;
14746 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14747 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14748 return false;
14750 /* Check if the bases are the same. */
14751 if (!rtx_equal_p (base_1, base_2))
14752 return false;
14754 offval_1 = INTVAL (offset_1);
14755 offval_2 = INTVAL (offset_2);
14756 msize = GET_MODE_SIZE (mode);
14757 /* Check if the offsets are consecutive. */
14758 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14759 return false;
14761 /* Check if the addresses are clobbered by load. */
14762 if (load)
14764 if (reg_mentioned_p (reg_1, mem_1))
14765 return false;
14767 /* In increasing order, the last load can clobber the address. */
14768 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14769 return false;
14772 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14773 rclass_1 = FP_REGS;
14774 else
14775 rclass_1 = GENERAL_REGS;
14777 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14778 rclass_2 = FP_REGS;
14779 else
14780 rclass_2 = GENERAL_REGS;
14782 /* Check if the registers are of the same class. */
14783 if (rclass_1 != rclass_2)
14784 return false;
14786 return true;
14789 /* Given OPERANDS of consecutive load/store, check if we can merge
14790 them into ldp/stp by adjusting the offset. LOAD is true if they
14791 are load instructions. MODE is the mode of memory operands.
14793 Given the following consecutive stores:
14795 str w1, [xb, 0x100]
14796 str w1, [xb, 0x104]
14797 str w1, [xb, 0x108]
14798 str w1, [xb, 0x10c]
14800 Though the offsets are out of the range supported by stp, we can
14801 still pair them after adjusting the offset, like:
14803 add scratch, xb, 0x100
14804 stp w1, w1, [scratch]
14805 stp w1, w1, [scratch, 0x8]
14807 The peephole patterns detecting this opportunity should guarantee
14808 the scratch register is available. */
14810 bool
14811 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14812 scalar_mode mode)
14814 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14815 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14816 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14817 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14819 if (load)
14821 reg_1 = operands[0];
14822 mem_1 = operands[1];
14823 reg_2 = operands[2];
14824 mem_2 = operands[3];
14825 reg_3 = operands[4];
14826 mem_3 = operands[5];
14827 reg_4 = operands[6];
14828 mem_4 = operands[7];
14829 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14830 && REG_P (reg_3) && REG_P (reg_4));
14831 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14832 return false;
14834 else
14836 mem_1 = operands[0];
14837 reg_1 = operands[1];
14838 mem_2 = operands[2];
14839 reg_2 = operands[3];
14840 mem_3 = operands[4];
14841 reg_3 = operands[5];
14842 mem_4 = operands[6];
14843 reg_4 = operands[7];
14845 /* Skip if the memory operand is by itself valid for ldp/stp. */
14846 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14847 return false;
14849 /* The mems cannot be volatile. */
14850 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14851 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14852 return false;
14854 /* Check if the addresses are in the form of [base+offset]. */
14855 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14856 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14857 return false;
14858 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14859 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14860 return false;
14861 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14862 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14863 return false;
14864 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14865 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14866 return false;
14868 /* Check if the bases are the same. */
14869 if (!rtx_equal_p (base_1, base_2)
14870 || !rtx_equal_p (base_2, base_3)
14871 || !rtx_equal_p (base_3, base_4))
14872 return false;
14874 offval_1 = INTVAL (offset_1);
14875 offval_2 = INTVAL (offset_2);
14876 offval_3 = INTVAL (offset_3);
14877 offval_4 = INTVAL (offset_4);
14878 msize = GET_MODE_SIZE (mode);
14879 /* Check if the offsets are consecutive. */
14880 if ((offval_1 != (offval_2 + msize)
14881 || offval_1 != (offval_3 + msize * 2)
14882 || offval_1 != (offval_4 + msize * 3))
14883 && (offval_4 != (offval_3 + msize)
14884 || offval_4 != (offval_2 + msize * 2)
14885 || offval_4 != (offval_1 + msize * 3)))
14886 return false;
14888 /* Check if the addresses are clobbered by load. */
14889 if (load)
14891 if (reg_mentioned_p (reg_1, mem_1)
14892 || reg_mentioned_p (reg_2, mem_2)
14893 || reg_mentioned_p (reg_3, mem_3))
14894 return false;
14896 /* In increasing order, the last load can clobber the address. */
14897 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14898 return false;
14901 /* If we have SImode and slow unaligned ldp,
14902 check that the alignment is at least 8 bytes. */
14903 if (mode == SImode
14904 && (aarch64_tune_params.extra_tuning_flags
14905 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14906 && !optimize_size
14907 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14908 return false;
14910 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14911 rclass_1 = FP_REGS;
14912 else
14913 rclass_1 = GENERAL_REGS;
14915 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14916 rclass_2 = FP_REGS;
14917 else
14918 rclass_2 = GENERAL_REGS;
14920 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14921 rclass_3 = FP_REGS;
14922 else
14923 rclass_3 = GENERAL_REGS;
14925 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14926 rclass_4 = FP_REGS;
14927 else
14928 rclass_4 = GENERAL_REGS;
14930 /* Check if the registers are of the same class. */
14931 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14932 return false;
14934 return true;
14937 /* Given OPERANDS of consecutive load/store, this function pairs them
14938 into ldp/stp after adjusting the offset. It depends on the fact
14939 that addresses of load/store instructions are in increasing order.
14940 MODE is the mode of memory operands. CODE is the rtl operator
14941 which should be applied to all memory operands; it is SIGN_EXTEND,
14942 ZERO_EXTEND or UNKNOWN. */
14944 bool
14945 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14946 scalar_mode mode, RTX_CODE code)
14948 rtx base, offset, t1, t2;
14949 rtx mem_1, mem_2, mem_3, mem_4;
14950 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14952 if (load)
14954 mem_1 = operands[1];
14955 mem_2 = operands[3];
14956 mem_3 = operands[5];
14957 mem_4 = operands[7];
14959 else
14961 mem_1 = operands[0];
14962 mem_2 = operands[2];
14963 mem_3 = operands[4];
14964 mem_4 = operands[6];
14965 gcc_assert (code == UNKNOWN);
14968 extract_base_offset_in_addr (mem_1, &base, &offset);
14969 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14971 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
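/* Worked example (illustrative): for SImode, msize is 4 and stp_off_limit
   is 0x100, so an original offset of 0x104 is split into adj_off = 0x100
   (folded into the scratch base register below) and new_off = 0x4 (kept in
   the ldp/stp addresses).  */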
14972 msize = GET_MODE_SIZE (mode);
14973 stp_off_limit = msize * 0x40;
14974 off_val = INTVAL (offset);
14975 abs_off = (off_val < 0) ? -off_val : off_val;
14976 new_off = abs_off % stp_off_limit;
14977 adj_off = abs_off - new_off;
14979 /* Further adjust to make sure all offsets are OK. */
14980 if ((new_off + msize * 2) >= stp_off_limit)
14982 adj_off += stp_off_limit;
14983 new_off -= stp_off_limit;
14986 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14987 if (adj_off >= 0x1000)
14988 return false;
14990 if (off_val < 0)
14992 adj_off = -adj_off;
14993 new_off = -new_off;
14996 /* Create new memory references. */
14997 mem_1 = change_address (mem_1, VOIDmode,
14998 plus_constant (DImode, operands[8], new_off));
15000 /* Check if the adjusted address is OK for ldp/stp. */
15001 if (!aarch64_mem_pair_operand (mem_1, mode))
15002 return false;
15004 msize = GET_MODE_SIZE (mode);
15005 mem_2 = change_address (mem_2, VOIDmode,
15006 plus_constant (DImode,
15007 operands[8],
15008 new_off + msize));
15009 mem_3 = change_address (mem_3, VOIDmode,
15010 plus_constant (DImode,
15011 operands[8],
15012 new_off + msize * 2));
15013 mem_4 = change_address (mem_4, VOIDmode,
15014 plus_constant (DImode,
15015 operands[8],
15016 new_off + msize * 3));
15018 if (code == ZERO_EXTEND)
15020 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15021 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15022 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15023 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15025 else if (code == SIGN_EXTEND)
15027 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15028 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15029 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15030 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15033 if (load)
15035 operands[1] = mem_1;
15036 operands[3] = mem_2;
15037 operands[5] = mem_3;
15038 operands[7] = mem_4;
15040 else
15042 operands[0] = mem_1;
15043 operands[2] = mem_2;
15044 operands[4] = mem_3;
15045 operands[6] = mem_4;
15048 /* Emit adjusting instruction. */
15049 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15050 /* Emit ldp/stp instructions. */
15051 t1 = gen_rtx_SET (operands[0], operands[1]);
15052 t2 = gen_rtx_SET (operands[2], operands[3]);
15053 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15054 t1 = gen_rtx_SET (operands[4], operands[5]);
15055 t2 = gen_rtx_SET (operands[6], operands[7]);
15056 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15057 return true;
15060 /* Return true if a pseudo register should be created and used to hold
15061 the GOT address for PIC code. */
15063 bool
15064 aarch64_use_pseudo_pic_reg (void)
15066 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15069 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15071 static int
15072 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15074 switch (XINT (x, 1))
15076 case UNSPEC_GOTSMALLPIC:
15077 case UNSPEC_GOTSMALLPIC28K:
15078 case UNSPEC_GOTTINYPIC:
15079 return 0;
15080 default:
15081 break;
15084 return default_unspec_may_trap_p (x, flags);
15088 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15089 return the log2 of that value. Otherwise return -1. */
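/* For instance, 4.0 yields 2 and 1.0 yields 0, while 3.0, 0.5 and -2.0
   all yield -1.  */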
15091 int
15092 aarch64_fpconst_pow_of_2 (rtx x)
15094 const REAL_VALUE_TYPE *r;
15096 if (!CONST_DOUBLE_P (x))
15097 return -1;
15099 r = CONST_DOUBLE_REAL_VALUE (x);
15101 if (REAL_VALUE_NEGATIVE (*r)
15102 || REAL_VALUE_ISNAN (*r)
15103 || REAL_VALUE_ISINF (*r)
15104 || !real_isinteger (r, DFmode))
15105 return -1;
15107 return exact_log2 (real_to_integer (r));
15110 /* If X is a vector of equal CONST_DOUBLE values and that value is
15111 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15113 int
15114 aarch64_vec_fpconst_pow_of_2 (rtx x)
15116 if (GET_CODE (x) != CONST_VECTOR)
15117 return -1;
15119 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15120 return -1;
15122 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15123 if (firstval <= 0)
15124 return -1;
15126 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15127 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15128 return -1;
15130 return firstval;
15133 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15134 to float.
15136 __fp16 always promotes through this hook.
15137 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15138 through the generic excess precision logic rather than here. */
15140 static tree
15141 aarch64_promoted_type (const_tree t)
15143 if (SCALAR_FLOAT_TYPE_P (t)
15144 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15145 return float_type_node;
15147 return NULL_TREE;
15150 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15152 static bool
15153 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15154 optimization_type opt_type)
15156 switch (op)
15158 case rsqrt_optab:
15159 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15161 default:
15162 return true;
15166 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15167 if MODE is HFmode, and punt to the generic implementation otherwise. */
15169 static bool
15170 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15172 return (mode == HFmode
15173 ? true
15174 : default_libgcc_floating_mode_supported_p (mode));
15177 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15178 if MODE is HFmode, and punt to the generic implementation otherwise. */
15180 static bool
15181 aarch64_scalar_mode_supported_p (scalar_mode mode)
15183 return (mode == HFmode
15184 ? true
15185 : default_scalar_mode_supported_p (mode));
15188 /* Set the value of FLT_EVAL_METHOD.
15189 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15191 0: evaluate all operations and constants, whose semantic type has at
15192 most the range and precision of type float, to the range and
15193 precision of float; evaluate all other operations and constants to
15194 the range and precision of the semantic type;
15196 N, where _FloatN is a supported interchange floating type
15197 evaluate all operations and constants, whose semantic type has at
15198 most the range and precision of _FloatN type, to the range and
15199 precision of the _FloatN type; evaluate all other operations and
15200 constants to the range and precision of the semantic type;
15202 If we have the ARMv8.2-A extensions then we support _Float16 in native
15203 precision, so we should set this to 16. Otherwise, we support the type,
15204 but want to evaluate expressions in float precision, so set this to
15205 0. */
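/* For example (illustrative), with TARGET_FP_F16INST the sum of two
   _Float16 values can be evaluated directly in HFmode with a single
   half-precision FADD, whereas without it both operands are widened to
   float, added, and narrowed back.  */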
15207 static enum flt_eval_method
15208 aarch64_excess_precision (enum excess_precision_type type)
15210 switch (type)
15212 case EXCESS_PRECISION_TYPE_FAST:
15213 case EXCESS_PRECISION_TYPE_STANDARD:
15214 /* We can calculate either in 16-bit range and precision or
15215 32-bit range and precision. Make that decision based on whether
15216 we have native support for the ARMv8.2-A 16-bit floating-point
15217 instructions or not. */
15218 return (TARGET_FP_F16INST
15219 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15220 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15221 case EXCESS_PRECISION_TYPE_IMPLICIT:
15222 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15223 default:
15224 gcc_unreachable ();
15226 return FLT_EVAL_METHOD_UNPREDICTABLE;
15229 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15230 scheduled for speculative execution. Reject the long-running division
15231 and square-root instructions. */
15233 static bool
15234 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15236 switch (get_attr_type (insn))
15238 case TYPE_SDIV:
15239 case TYPE_UDIV:
15240 case TYPE_FDIVS:
15241 case TYPE_FDIVD:
15242 case TYPE_FSQRTS:
15243 case TYPE_FSQRTD:
15244 case TYPE_NEON_FP_SQRT_S:
15245 case TYPE_NEON_FP_SQRT_D:
15246 case TYPE_NEON_FP_SQRT_S_Q:
15247 case TYPE_NEON_FP_SQRT_D_Q:
15248 case TYPE_NEON_FP_DIV_S:
15249 case TYPE_NEON_FP_DIV_D:
15250 case TYPE_NEON_FP_DIV_S_Q:
15251 case TYPE_NEON_FP_DIV_D_Q:
15252 return false;
15253 default:
15254 return true;
15258 /* Target-specific selftests. */
15260 #if CHECKING_P
15262 namespace selftest {
15264 /* Selftest for the RTL loader.
15265 Verify that the RTL loader copes with a dump from
15266 print_rtx_function. This is essentially just a test that class
15267 function_reader can handle a real dump, but it also verifies
15268 that lookup_reg_by_dump_name correctly handles hard regs.
15269 The presence of hard reg names in the dump means that the test is
15270 target-specific, hence it is in this file. */
15272 static void
15273 aarch64_test_loading_full_dump ()
15275 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15277 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15279 rtx_insn *insn_1 = get_insn_by_uid (1);
15280 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15282 rtx_insn *insn_15 = get_insn_by_uid (15);
15283 ASSERT_EQ (INSN, GET_CODE (insn_15));
15284 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15286 /* Verify crtl->return_rtx. */
15287 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15288 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15289 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15292 /* Run all target-specific selftests. */
15294 static void
15295 aarch64_run_selftests (void)
15297 aarch64_test_loading_full_dump ();
15300 } // namespace selftest
15302 #endif /* #if CHECKING_P */
15304 #undef TARGET_ADDRESS_COST
15305 #define TARGET_ADDRESS_COST aarch64_address_cost
15307 /* This hook determines whether unnamed bitfields affect the alignment
15308 of the containing structure. The hook returns true if the structure
15309 should inherit the alignment requirements of an unnamed bitfield's
15310 type. */
15311 #undef TARGET_ALIGN_ANON_BITFIELD
15312 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15314 #undef TARGET_ASM_ALIGNED_DI_OP
15315 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15317 #undef TARGET_ASM_ALIGNED_HI_OP
15318 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15320 #undef TARGET_ASM_ALIGNED_SI_OP
15321 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15323 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15324 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15325 hook_bool_const_tree_hwi_hwi_const_tree_true
15327 #undef TARGET_ASM_FILE_START
15328 #define TARGET_ASM_FILE_START aarch64_start_file
15330 #undef TARGET_ASM_OUTPUT_MI_THUNK
15331 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15333 #undef TARGET_ASM_SELECT_RTX_SECTION
15334 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15336 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15337 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15339 #undef TARGET_BUILD_BUILTIN_VA_LIST
15340 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15342 #undef TARGET_CALLEE_COPIES
15343 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15345 #undef TARGET_CAN_ELIMINATE
15346 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15348 #undef TARGET_CAN_INLINE_P
15349 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15351 #undef TARGET_CANNOT_FORCE_CONST_MEM
15352 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15354 #undef TARGET_CASE_VALUES_THRESHOLD
15355 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15357 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15358 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15360 /* Only the least significant bit is used for initialization guard
15361 variables. */
15362 #undef TARGET_CXX_GUARD_MASK_BIT
15363 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15365 #undef TARGET_C_MODE_FOR_SUFFIX
15366 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15368 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15369 #undef TARGET_DEFAULT_TARGET_FLAGS
15370 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15371 #endif
15373 #undef TARGET_CLASS_MAX_NREGS
15374 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15376 #undef TARGET_BUILTIN_DECL
15377 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15379 #undef TARGET_BUILTIN_RECIPROCAL
15380 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15382 #undef TARGET_C_EXCESS_PRECISION
15383 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15385 #undef TARGET_EXPAND_BUILTIN
15386 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15388 #undef TARGET_EXPAND_BUILTIN_VA_START
15389 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15391 #undef TARGET_FOLD_BUILTIN
15392 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15394 #undef TARGET_FUNCTION_ARG
15395 #define TARGET_FUNCTION_ARG aarch64_function_arg
15397 #undef TARGET_FUNCTION_ARG_ADVANCE
15398 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15400 #undef TARGET_FUNCTION_ARG_BOUNDARY
15401 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15403 #undef TARGET_FUNCTION_ARG_PADDING
15404 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15406 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15407 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15409 #undef TARGET_FUNCTION_VALUE
15410 #define TARGET_FUNCTION_VALUE aarch64_function_value
15412 #undef TARGET_FUNCTION_VALUE_REGNO_P
15413 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15415 #undef TARGET_FRAME_POINTER_REQUIRED
15416 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15418 #undef TARGET_GIMPLE_FOLD_BUILTIN
15419 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15421 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15422 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15424 #undef TARGET_INIT_BUILTINS
15425 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15427 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15428 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15429 aarch64_ira_change_pseudo_allocno_class
15431 #undef TARGET_LEGITIMATE_ADDRESS_P
15432 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15434 #undef TARGET_LEGITIMATE_CONSTANT_P
15435 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15437 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15438 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15439 aarch64_legitimize_address_displacement
15441 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15442 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15444 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15445 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15446 aarch64_libgcc_floating_mode_supported_p
15448 #undef TARGET_MANGLE_TYPE
15449 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15451 #undef TARGET_MEMORY_MOVE_COST
15452 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15454 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15455 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15457 #undef TARGET_MUST_PASS_IN_STACK
15458 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15460 /* This target hook should return true if accesses to volatile bitfields
15461 should use the narrowest mode possible. It should return false if these
15462 accesses should use the bitfield container type. */
15463 #undef TARGET_NARROW_VOLATILE_BITFIELD
15464 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15466 #undef TARGET_OPTION_OVERRIDE
15467 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15469 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15470 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15471 aarch64_override_options_after_change
15473 #undef TARGET_OPTION_SAVE
15474 #define TARGET_OPTION_SAVE aarch64_option_save
15476 #undef TARGET_OPTION_RESTORE
15477 #define TARGET_OPTION_RESTORE aarch64_option_restore
15479 #undef TARGET_OPTION_PRINT
15480 #define TARGET_OPTION_PRINT aarch64_option_print
15482 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15483 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15485 #undef TARGET_SET_CURRENT_FUNCTION
15486 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15488 #undef TARGET_PASS_BY_REFERENCE
15489 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15491 #undef TARGET_PREFERRED_RELOAD_CLASS
15492 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15494 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15495 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15497 #undef TARGET_PROMOTED_TYPE
15498 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15500 #undef TARGET_SECONDARY_RELOAD
15501 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15503 #undef TARGET_SHIFT_TRUNCATION_MASK
15504 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15506 #undef TARGET_SETUP_INCOMING_VARARGS
15507 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15509 #undef TARGET_STRUCT_VALUE_RTX
15510 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15512 #undef TARGET_REGISTER_MOVE_COST
15513 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15515 #undef TARGET_RETURN_IN_MEMORY
15516 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15518 #undef TARGET_RETURN_IN_MSB
15519 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15521 #undef TARGET_RTX_COSTS
15522 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15524 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15525 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15527 #undef TARGET_SCHED_ISSUE_RATE
15528 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15530 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15531 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15532 aarch64_sched_first_cycle_multipass_dfa_lookahead
15534 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15535 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15536 aarch64_first_cycle_multipass_dfa_lookahead_guard
15538 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15539 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15540 aarch64_get_separate_components
15542 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15543 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15544 aarch64_components_for_bb
15546 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15547 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15548 aarch64_disqualify_components
15550 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15551 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15552 aarch64_emit_prologue_components
15554 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15555 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15556 aarch64_emit_epilogue_components
15558 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15559 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15560 aarch64_set_handled_components
15562 #undef TARGET_TRAMPOLINE_INIT
15563 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15565 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15566 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15568 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15569 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15571 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15572 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15573 aarch64_builtin_support_vector_misalignment
15575 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15576 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15578 #undef TARGET_VECTORIZE_ADD_STMT_COST
15579 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15581 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15582 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15583 aarch64_builtin_vectorization_cost
15585 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15586 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15588 #undef TARGET_VECTORIZE_BUILTINS
15589 #define TARGET_VECTORIZE_BUILTINS
15591 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15592 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15593 aarch64_builtin_vectorized_function
15595 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15596 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15597 aarch64_autovectorize_vector_sizes
15599 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15600 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15601 aarch64_atomic_assign_expand_fenv
15603 /* Section anchor support. */
15605 #undef TARGET_MIN_ANCHOR_OFFSET
15606 #define TARGET_MIN_ANCHOR_OFFSET -256
15608 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15609 byte offset; we can do much more for larger data types, but have no way
15610 to determine the size of the access. We assume accesses are aligned. */
15611 #undef TARGET_MAX_ANCHOR_OFFSET
15612 #define TARGET_MAX_ANCHOR_OFFSET 4095
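/* With these settings an anchor can therefore serve byte accesses in the
   range [anchor - 256, anchor + 4095] (descriptive note).  */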
15614 #undef TARGET_VECTOR_ALIGNMENT
15615 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15617 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15618 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15619 aarch64_simd_vector_alignment_reachable
15621 /* vec_perm support. */
15623 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15624 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15625 aarch64_vectorize_vec_perm_const_ok
15627 #undef TARGET_INIT_LIBFUNCS
15628 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15630 #undef TARGET_FIXED_CONDITION_CODE_REGS
15631 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15633 #undef TARGET_FLAGS_REGNUM
15634 #define TARGET_FLAGS_REGNUM CC_REGNUM
15636 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15637 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15639 #undef TARGET_ASAN_SHADOW_OFFSET
15640 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15642 #undef TARGET_LEGITIMIZE_ADDRESS
15643 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15645 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15646 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15647 aarch64_use_by_pieces_infrastructure_p
15649 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15650 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15652 #undef TARGET_CAN_USE_DOLOOP_P
15653 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15655 #undef TARGET_SCHED_ADJUST_PRIORITY
15656 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15658 #undef TARGET_SCHED_MACRO_FUSION_P
15659 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15661 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15662 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15664 #undef TARGET_SCHED_FUSION_PRIORITY
15665 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15667 #undef TARGET_UNSPEC_MAY_TRAP_P
15668 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15670 #undef TARGET_USE_PSEUDO_PIC_REG
15671 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15673 #undef TARGET_PRINT_OPERAND
15674 #define TARGET_PRINT_OPERAND aarch64_print_operand
15676 #undef TARGET_PRINT_OPERAND_ADDRESS
15677 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15679 #undef TARGET_OPTAB_SUPPORTED_P
15680 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15682 #undef TARGET_OMIT_STRUCT_RETURN_REG
15683 #define TARGET_OMIT_STRUCT_RETURN_REG true
15685 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15686 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15687 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15689 #undef TARGET_HARD_REGNO_MODE_OK
15690 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15692 #undef TARGET_MODES_TIEABLE_P
15693 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15695 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15696 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15697 aarch64_hard_regno_call_part_clobbered
15699 #if CHECKING_P
15700 #undef TARGET_RUN_TARGET_SELFTESTS
15701 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15702 #endif /* #if CHECKING_P */
15704 struct gcc_target targetm = TARGET_INITIALIZER;
15706 #include "gt-aarch64.h"