[AArch64] Remove aarch64_frame_pointer_required
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob f58f192273e3ad313e154f28df0a88188cde36db
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 ADDRESS_SYMBOLIC:
98 A constant symbolic address, in pc-relative literal pool. */
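/* Illustrative assembly forms for these classifications (register and
   symbol names are arbitrary):

     ADDRESS_REG_IMM    [x0, #16]
     ADDRESS_REG_WB     [x0, #16]!  or  [x0], #16
     ADDRESS_REG_REG    [x0, x1, lsl #3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw #2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw #2]
     ADDRESS_LO_SUM     [x0, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x0, .Lpool_entry  */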
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode,
145 vec_perm_indices);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
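/* As an illustration, an entry such as AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD)
   in aarch64-fusion-pairs.def expands to { "adrp+add", AARCH64_FUSE_ADRP_ADD }
   in this table, making the name usable in -moverride=fuse=...; see the .def
   file for the authoritative list of entries.  */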
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (actual, 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
376 /* Generic costs for vector insn classes. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 /* Generic costs for vector insn classes. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
435 /* Costs for vector insn classes for Vulcan. */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
826 for now. */
827 static const struct tune_params saphira_tunings =
829 &generic_extra_costs,
830 &generic_addrcost_table,
831 &generic_regmove_cost,
832 &generic_vector_cost,
833 &generic_branch_cost,
834 &generic_approx_modes,
835 4, /* memmov_cost */
836 4, /* issue_rate */
837 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
838   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
839 16, /* function_align. */
840 8, /* jump_align. */
841 16, /* loop_align. */
842 2, /* int_reassoc_width. */
843 4, /* fp_reassoc_width. */
844 1, /* vec_reassoc_width. */
845 2, /* min_div_recip_mul_sf. */
846 2, /* min_div_recip_mul_df. */
847 0, /* max_case_values. */
848 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
849 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
850 &generic_prefetch_tune
853 static const struct tune_params thunderx2t99_tunings =
855 &thunderx2t99_extra_costs,
856 &thunderx2t99_addrcost_table,
857 &thunderx2t99_regmove_cost,
858 &thunderx2t99_vector_cost,
859 &generic_branch_cost,
860 &generic_approx_modes,
861 4, /* memmov_cost. */
862 4, /* issue_rate. */
863 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
864 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
865 16, /* function_align. */
866 8, /* jump_align. */
867 16, /* loop_align. */
868 3, /* int_reassoc_width. */
869 2, /* fp_reassoc_width. */
870 2, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &thunderx2t99_prefetch_tune
879 /* Support for fine-grained override of the tuning structures. */
880 struct aarch64_tuning_override_function
882 const char* name;
883 void (*parse_override)(const char*, struct tune_params*);
886 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
887 static void aarch64_parse_tune_string (const char*, struct tune_params*);
889 static const struct aarch64_tuning_override_function
890 aarch64_tuning_override_functions[] =
892 { "fuse", aarch64_parse_fuse_string },
893 { "tune", aarch64_parse_tune_string },
894 { NULL, NULL }
897 /* A processor implementing AArch64. */
898 struct processor
900 const char *const name;
901 enum aarch64_processor ident;
902 enum aarch64_processor sched_core;
903 enum aarch64_arch arch;
904 unsigned architecture_version;
905 const unsigned long flags;
906 const struct tune_params *const tune;
909 /* Architectures implementing AArch64. */
910 static const struct processor all_architectures[] =
912 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
913 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
914 #include "aarch64-arches.def"
915 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
918 /* Processor cores implementing AArch64. */
919 static const struct processor all_cores[] =
921 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
922 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
923 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
924 FLAGS, &COSTS##_tunings},
925 #include "aarch64-cores.def"
926 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
927 AARCH64_FL_FOR_ARCH8, &generic_tunings},
928 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
932 /* Target specification. These are populated by the -march, -mtune, -mcpu
933 handling code or by target attributes. */
934 static const struct processor *selected_arch;
935 static const struct processor *selected_cpu;
936 static const struct processor *selected_tune;
938 /* The current tuning set. */
939 struct tune_params aarch64_tune_params = generic_tunings;
941 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
943 /* An ISA extension in the co-processor and main instruction set space. */
944 struct aarch64_option_extension
946 const char *const name;
947 const unsigned long flags_on;
948 const unsigned long flags_off;
951 typedef enum aarch64_cond_code
953 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
954 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
955 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
957 aarch64_cc;
959 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
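/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) yields AARCH64_LT:
   flipping the low bit of the 4-bit condition encoding inverts the condition,
   mirroring the architectural NZCV condition-code encoding (AL/NV excepted).  */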
961 /* The condition codes of the processor, and the inverse function. */
962 static const char * const aarch64_condition_codes[] =
964 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
965 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
968 /* Generate code to enable conditional branches in functions over 1 MiB. */
969 const char *
970 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
971 const char * branch_format)
973 rtx_code_label * tmp_label = gen_label_rtx ();
974 char label_buf[256];
975 char buffer[128];
976 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
977 CODE_LABEL_NUMBER (tmp_label));
978 const char *label_ptr = targetm.strip_name_encoding (label_buf);
979 rtx dest_label = operands[pos_label];
980 operands[pos_label] = tmp_label;
982 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
983 output_asm_insn (buffer, operands);
985 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
986 operands[pos_label] = dest_label;
987 output_asm_insn (buffer, operands);
988 return "";
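/* Illustrative sketch (label and register names are arbitrary): when the
   caller passes an inverted-condition BRANCH_FORMAT, the emitted sequence
   has the form

       b.ne    .Ltmp           // short-range conditional branch (+/- 1 MiB)
       b       .Lreal_target   // unconditional branch (+/- 128 MiB)
     .Ltmp:

   so the conditional branch only needs to reach the local fall-through label.  */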
991 void
992 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
994 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
995 if (TARGET_GENERAL_REGS_ONLY)
996 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
997 else
998 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1001 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1002 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1003 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1004 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1005 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1006 irrespectively of its cost results in bad allocations with many redundant
1007 int<->FP moves which are expensive on various cores.
1008 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1009 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1010 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1011 Otherwise set the allocno class depending on the mode.
1012 The result of this is that it is no longer inefficient to have a higher
1013 memory move cost than the register move cost.
1016 static reg_class_t
1017 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1018 reg_class_t best_class)
1020 machine_mode mode;
1022 if (allocno_class != ALL_REGS)
1023 return allocno_class;
1025 if (best_class != ALL_REGS)
1026 return best_class;
1028 mode = PSEUDO_REGNO_MODE (regno);
1029 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1032 static unsigned int
1033 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1035 if (GET_MODE_UNIT_SIZE (mode) == 4)
1036 return aarch64_tune_params.min_div_recip_mul_sf;
1037 return aarch64_tune_params.min_div_recip_mul_df;
1040 static int
1041 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1042 machine_mode mode)
1044 if (VECTOR_MODE_P (mode))
1045 return aarch64_tune_params.vec_reassoc_width;
1046 if (INTEGRAL_MODE_P (mode))
1047 return aarch64_tune_params.int_reassoc_width;
1048 if (FLOAT_MODE_P (mode))
1049 return aarch64_tune_params.fp_reassoc_width;
1050 return 1;
1053 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1054 unsigned
1055 aarch64_dbx_register_number (unsigned regno)
1057 if (GP_REGNUM_P (regno))
1058 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1059 else if (regno == SP_REGNUM)
1060 return AARCH64_DWARF_SP;
1061 else if (FP_REGNUM_P (regno))
1062 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1064 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1065 equivalent DWARF register. */
1066 return DWARF_FRAME_REGISTERS;
1069 /* Return TRUE if MODE is any of the large INT modes. */
1070 static bool
1071 aarch64_vect_struct_mode_p (machine_mode mode)
1073 return mode == OImode || mode == CImode || mode == XImode;
1076 /* Return TRUE if MODE is any of the vector modes. */
1077 static bool
1078 aarch64_vector_mode_p (machine_mode mode)
1080 return aarch64_vector_mode_supported_p (mode)
1081 || aarch64_vect_struct_mode_p (mode);
1084 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1085 static bool
1086 aarch64_array_mode_supported_p (machine_mode mode,
1087 unsigned HOST_WIDE_INT nelems)
1089 if (TARGET_SIMD
1090 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1091 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1092 && (nelems >= 2 && nelems <= 4))
1093 return true;
1095 return false;
1098 /* Implement TARGET_HARD_REGNO_NREGS. */
1100 static unsigned int
1101 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1103 switch (aarch64_regno_regclass (regno))
1105 case FP_REGS:
1106 case FP_LO_REGS:
1107 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1108 default:
1109 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1111 gcc_unreachable ();
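/* For example, with 128-bit vector registers (UNITS_PER_VREG == 16) and
   64-bit general registers (UNITS_PER_WORD == 8), a 16-byte TImode value
   needs two general registers, a 16-byte V4SImode value needs a single
   FP/SIMD register, and a 32-byte OImode structure needs two FP/SIMD
   registers.  */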
1114 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1116 static bool
1117 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1119 if (GET_MODE_CLASS (mode) == MODE_CC)
1120 return regno == CC_REGNUM;
1122 if (regno == SP_REGNUM)
1123 /* The purpose of comparing with ptr_mode is to support the
1124 global register variable associated with the stack pointer
1125 register via the syntax of asm ("wsp") in ILP32. */
1126 return mode == Pmode || mode == ptr_mode;
1128 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1129 return mode == Pmode;
1131 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1132 return true;
1134 if (FP_REGNUM_P (regno))
1136 if (aarch64_vect_struct_mode_p (mode))
1137 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1138 else
1139 return true;
1142 return false;
1145 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1146 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1147 clobbers the top 64 bits when restoring the bottom 64 bits. */
1149 static bool
1150 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1152 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
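/* For example, although v8-v15 are call-saved under the AAPCS64, only their
   low 64 bits are preserved, so a 128-bit V2DFmode value live in v8 across a
   call must still be spilled, whereas a 64-bit DFmode value in v8 need not be
   (GET_MODE_SIZE (DFmode) == 8).  */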
1155 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1156 machine_mode
1157 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1158 machine_mode mode)
1160 /* Handle modes that fit within single registers. */
1161 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1163 if (GET_MODE_SIZE (mode) >= 4)
1164 return mode;
1165 else
1166 return SImode;
1168 /* Fall back to generic for multi-reg and very large modes. */
1169 else
1170 return choose_hard_reg_mode (regno, nregs, false);
1173 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1174 that strcpy from constants will be faster. */
1176 static HOST_WIDE_INT
1177 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1179 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1180 return MAX (align, BITS_PER_WORD);
1181 return align;
1184 /* Return true if calls to DECL should be treated as
1185 long-calls (ie called via a register). */
1186 static bool
1187 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1189 return false;
1192 /* Return true if calls to symbol-ref SYM should be treated as
1193 long-calls (ie called via a register). */
1194 bool
1195 aarch64_is_long_call_p (rtx sym)
1197 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1200 /* Return true if calls to symbol-ref SYM should not go through
1201 plt stubs. */
1203 bool
1204 aarch64_is_noplt_call_p (rtx sym)
1206 const_tree decl = SYMBOL_REF_DECL (sym);
1208 if (flag_pic
1209 && decl
1210 && (!flag_plt
1211 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1212 && !targetm.binds_local_p (decl))
1213 return true;
1215 return false;
1218 /* Return true if the offsets to a zero/sign-extract operation
1219 represent an expression that matches an extend operation. The
1220    operands represent the parameters from
1222 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1223 bool
1224 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1225 rtx extract_imm)
1227 HOST_WIDE_INT mult_val, extract_val;
1229 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1230 return false;
1232 mult_val = INTVAL (mult_imm);
1233 extract_val = INTVAL (extract_imm);
1235 if (extract_val > 8
1236 && extract_val < GET_MODE_BITSIZE (mode)
1237 && exact_log2 (extract_val & ~7) > 0
1238 && (extract_val & 7) <= 4
1239 && mult_val == (1 << (extract_val & 7)))
1240 return true;
1242 return false;
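/* Worked example (illustrative): for
     (zero_extract:DI (mult:DI (reg:DI x) (const_int 4))
                      (const_int 34) (const_int 0))
   we have extract_val == 34 and mult_val == 4; 34 & ~7 == 32 is a power of
   two, 34 & 7 == 2 <= 4, and 4 == 1 << 2, so this matches the extend form:
   a 32-bit value zero-extended and shifted left by 2, i.e. the operand
   written "uxtw #2" in assembly.  */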
1245 /* Emit an insn that's a simple single-set. Both the operands must be
1246 known to be valid. */
1247 inline static rtx_insn *
1248 emit_set_insn (rtx x, rtx y)
1250 return emit_insn (gen_rtx_SET (x, y));
1253 /* X and Y are two things to compare using CODE. Emit the compare insn and
1254    return the rtx for the CC register in the proper mode.  */
1256 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1258 machine_mode mode = SELECT_CC_MODE (code, x, y);
1259 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1261 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1262 return cc_reg;
1265 /* Build the SYMBOL_REF for __tls_get_addr. */
1267 static GTY(()) rtx tls_get_addr_libfunc;
1270 aarch64_tls_get_addr (void)
1272 if (!tls_get_addr_libfunc)
1273 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1274 return tls_get_addr_libfunc;
1277 /* Return the TLS model to use for ADDR. */
1279 static enum tls_model
1280 tls_symbolic_operand_type (rtx addr)
1282 enum tls_model tls_kind = TLS_MODEL_NONE;
1283 rtx sym, addend;
1285 if (GET_CODE (addr) == CONST)
1287 split_const (addr, &sym, &addend);
1288 if (GET_CODE (sym) == SYMBOL_REF)
1289 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1291 else if (GET_CODE (addr) == SYMBOL_REF)
1292 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1294 return tls_kind;
1297 /* We accept lo_sum expressions in our legitimate addresses
1298    so that combine can take care of combining addresses where
1299    necessary, but for generation purposes we generate the address
1300    as:
1301 RTL Absolute
1302 tmp = hi (symbol_ref); adrp x1, foo
1303 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1306 PIC TLS
1307 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1308 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1309 bl __tls_get_addr
1312 Load TLS symbol, depending on TLS mechanism and TLS access model.
1314 Global Dynamic - Traditional TLS:
1315 adrp tmp, :tlsgd:imm
1316 add dest, tmp, #:tlsgd_lo12:imm
1317 bl __tls_get_addr
1319 Global Dynamic - TLS Descriptors:
1320 adrp dest, :tlsdesc:imm
1321 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1322 add dest, dest, #:tlsdesc_lo12:imm
1323 blr tmp
1324 mrs tp, tpidr_el0
1325 add dest, dest, tp
1327 Initial Exec:
1328 mrs tp, tpidr_el0
1329 adrp tmp, :gottprel:imm
1330 ldr dest, [tmp, #:gottprel_lo12:imm]
1331 add dest, dest, tp
1333 Local Exec:
1334 mrs tp, tpidr_el0
1335 add t0, tp, #:tprel_hi12:imm, lsl #12
1336 add t0, t0, #:tprel_lo12_nc:imm
1339 static void
1340 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1341 enum aarch64_symbol_type type)
1343 switch (type)
1345 case SYMBOL_SMALL_ABSOLUTE:
1347 /* In ILP32, the mode of dest can be either SImode or DImode. */
1348 rtx tmp_reg = dest;
1349 machine_mode mode = GET_MODE (dest);
1351 gcc_assert (mode == Pmode || mode == ptr_mode);
1353 if (can_create_pseudo_p ())
1354 tmp_reg = gen_reg_rtx (mode);
1356 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1357 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1358 return;
1361 case SYMBOL_TINY_ABSOLUTE:
1362 emit_insn (gen_rtx_SET (dest, imm));
1363 return;
1365 case SYMBOL_SMALL_GOT_28K:
1367 machine_mode mode = GET_MODE (dest);
1368 rtx gp_rtx = pic_offset_table_rtx;
1369 rtx insn;
1370 rtx mem;
1372  /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1373     here before RTL expansion.  Tree IVOPTS will generate RTL patterns to
1374     decide rtx costs, in which case pic_offset_table_rtx is not
1375     initialized.  In that case there is no need to generate the first adrp
1376     instruction, as the final cost for global variable access is
1377     one instruction.
1378 if (gp_rtx != NULL)
1380  /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1381     use the page base as the GOT base, the first page may be wasted;
1382     in the worst case there is only 28K of space for the GOT).
1384     The generated instruction sequence for accessing a global variable is:
1387 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1389     Only one instruction is needed.  But we must initialize
1390     pic_offset_table_rtx properly.  We generate an initialization insn for
1391     every global access, and allow CSE to remove all redundant ones.
1393     The final instruction sequence will look like the following
1394     for multiple global variable accesses.
1396 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1398 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1399 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1400 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1401 ... */
1403 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1404 crtl->uses_pic_offset_table = 1;
1405 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1407 if (mode != GET_MODE (gp_rtx))
1408 gp_rtx = gen_lowpart (mode, gp_rtx);
1412 if (mode == ptr_mode)
1414 if (mode == DImode)
1415 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1416 else
1417 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1419 mem = XVECEXP (SET_SRC (insn), 0, 0);
1421 else
1423 gcc_assert (mode == Pmode);
1425 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1426 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1429  /* The operand is expected to be a MEM.  Whenever the related insn
1430     pattern changes, the code above which calculates mem should be
1431     updated.  */
1432 gcc_assert (GET_CODE (mem) == MEM);
1433 MEM_READONLY_P (mem) = 1;
1434 MEM_NOTRAP_P (mem) = 1;
1435 emit_insn (insn);
1436 return;
1439 case SYMBOL_SMALL_GOT_4G:
1441 /* In ILP32, the mode of dest can be either SImode or DImode,
1442 while the got entry is always of SImode size. The mode of
1443 dest depends on how dest is used: if dest is assigned to a
1444 pointer (e.g. in the memory), it has SImode; it may have
1445     DImode if dest is dereferenced to access the memory.
1446 This is why we have to handle three different ldr_got_small
1447 patterns here (two patterns for ILP32). */
1449 rtx insn;
1450 rtx mem;
1451 rtx tmp_reg = dest;
1452 machine_mode mode = GET_MODE (dest);
1454 if (can_create_pseudo_p ())
1455 tmp_reg = gen_reg_rtx (mode);
1457 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1458 if (mode == ptr_mode)
1460 if (mode == DImode)
1461 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1462 else
1463 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1465 mem = XVECEXP (SET_SRC (insn), 0, 0);
1467 else
1469 gcc_assert (mode == Pmode);
1471 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1472 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1475 gcc_assert (GET_CODE (mem) == MEM);
1476 MEM_READONLY_P (mem) = 1;
1477 MEM_NOTRAP_P (mem) = 1;
1478 emit_insn (insn);
1479 return;
1482 case SYMBOL_SMALL_TLSGD:
1484 rtx_insn *insns;
1485 machine_mode mode = GET_MODE (dest);
1486 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1488 start_sequence ();
1489 if (TARGET_ILP32)
1490 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1491 else
1492 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1493 insns = get_insns ();
1494 end_sequence ();
1496 RTL_CONST_CALL_P (insns) = 1;
1497 emit_libcall_block (insns, dest, result, imm);
1498 return;
1501 case SYMBOL_SMALL_TLSDESC:
1503 machine_mode mode = GET_MODE (dest);
1504 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1505 rtx tp;
1507 gcc_assert (mode == Pmode || mode == ptr_mode);
1509 /* In ILP32, the got entry is always of SImode size. Unlike
1510 small GOT, the dest is fixed at reg 0. */
1511 if (TARGET_ILP32)
1512 emit_insn (gen_tlsdesc_small_si (imm));
1513 else
1514 emit_insn (gen_tlsdesc_small_di (imm));
1515 tp = aarch64_load_tp (NULL);
1517 if (mode != Pmode)
1518 tp = gen_lowpart (mode, tp);
1520 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1521 if (REG_P (dest))
1522 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1523 return;
1526 case SYMBOL_SMALL_TLSIE:
1528 /* In ILP32, the mode of dest can be either SImode or DImode,
1529 while the got entry is always of SImode size. The mode of
1530 dest depends on how dest is used: if dest is assigned to a
1531 pointer (e.g. in the memory), it has SImode; it may have
1532     DImode if dest is dereferenced to access the memory.
1533 This is why we have to handle three different tlsie_small
1534 patterns here (two patterns for ILP32). */
1535 machine_mode mode = GET_MODE (dest);
1536 rtx tmp_reg = gen_reg_rtx (mode);
1537 rtx tp = aarch64_load_tp (NULL);
1539 if (mode == ptr_mode)
1541 if (mode == DImode)
1542 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1543 else
1545 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1546 tp = gen_lowpart (mode, tp);
1549 else
1551 gcc_assert (mode == Pmode);
1552 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1555 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1556 if (REG_P (dest))
1557 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1558 return;
1561 case SYMBOL_TLSLE12:
1562 case SYMBOL_TLSLE24:
1563 case SYMBOL_TLSLE32:
1564 case SYMBOL_TLSLE48:
1566 machine_mode mode = GET_MODE (dest);
1567 rtx tp = aarch64_load_tp (NULL);
1569 if (mode != Pmode)
1570 tp = gen_lowpart (mode, tp);
1572 switch (type)
1574 case SYMBOL_TLSLE12:
1575 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1576 (dest, tp, imm));
1577 break;
1578 case SYMBOL_TLSLE24:
1579 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1580 (dest, tp, imm));
1581 break;
1582 case SYMBOL_TLSLE32:
1583 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1584 (dest, imm));
1585 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1586 (dest, dest, tp));
1587 break;
1588 case SYMBOL_TLSLE48:
1589 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1590 (dest, imm));
1591 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1592 (dest, dest, tp));
1593 break;
1594 default:
1595 gcc_unreachable ();
1598 if (REG_P (dest))
1599 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1600 return;
1603 case SYMBOL_TINY_GOT:
1604 emit_insn (gen_ldr_got_tiny (dest, imm));
1605 return;
1607 case SYMBOL_TINY_TLSIE:
1609 machine_mode mode = GET_MODE (dest);
1610 rtx tp = aarch64_load_tp (NULL);
1612 if (mode == ptr_mode)
1614 if (mode == DImode)
1615 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1616 else
1618 tp = gen_lowpart (mode, tp);
1619 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1622 else
1624 gcc_assert (mode == Pmode);
1625 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1628 if (REG_P (dest))
1629 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1630 return;
1633 default:
1634 gcc_unreachable ();
1638 /* Emit a move from SRC to DEST. Assume that the move expanders can
1639 handle all moves if !can_create_pseudo_p (). The distinction is
1640 important because, unlike emit_move_insn, the move expanders know
1641 how to force Pmode objects into the constant pool even when the
1642 constant pool address is not itself legitimate. */
1643 static rtx
1644 aarch64_emit_move (rtx dest, rtx src)
1646 return (can_create_pseudo_p ()
1647 ? emit_move_insn (dest, src)
1648 : emit_move_insn_1 (dest, src));
1651 /* Split a 128-bit move operation into two 64-bit move operations,
1652 taking care to handle partial overlap of register to register
1653 copies. Special cases are needed when moving between GP regs and
1654 FP regs. SRC can be a register, constant or memory; DST a register
1655 or memory. If either operand is memory it must not have any side
1656 effects. */
1657 void
1658 aarch64_split_128bit_move (rtx dst, rtx src)
1660 rtx dst_lo, dst_hi;
1661 rtx src_lo, src_hi;
1663 machine_mode mode = GET_MODE (dst);
1665 gcc_assert (mode == TImode || mode == TFmode);
1666 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1667 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1669 if (REG_P (dst) && REG_P (src))
1671 int src_regno = REGNO (src);
1672 int dst_regno = REGNO (dst);
1674 /* Handle FP <-> GP regs. */
1675 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1677 src_lo = gen_lowpart (word_mode, src);
1678 src_hi = gen_highpart (word_mode, src);
1680 if (mode == TImode)
1682 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1683 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1685 else
1687 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1688 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1690 return;
1692 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1694 dst_lo = gen_lowpart (word_mode, dst);
1695 dst_hi = gen_highpart (word_mode, dst);
1697 if (mode == TImode)
1699 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1700 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1702 else
1704 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1705 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1707 return;
1711 dst_lo = gen_lowpart (word_mode, dst);
1712 dst_hi = gen_highpart (word_mode, dst);
1713 src_lo = gen_lowpart (word_mode, src);
1714 src_hi = gen_highpart_mode (word_mode, mode, src);
1716 /* At most one pairing may overlap. */
1717 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1719 aarch64_emit_move (dst_hi, src_hi);
1720 aarch64_emit_move (dst_lo, src_lo);
1722 else
1724 aarch64_emit_move (dst_lo, src_lo);
1725 aarch64_emit_move (dst_hi, src_hi);
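/* Worked example (illustrative, little-endian register pairing): copying a
   TImode value from the pair {x0, x1} to the pair {x1, x2} gives dst_lo == x1,
   which overlaps src_hi == x1, so the high halves are moved first
   (x2 <- x1, then x1 <- x0); in the opposite direction there is no such
   overlap and the low halves are moved first.  */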
1729 bool
1730 aarch64_split_128bit_move_p (rtx dst, rtx src)
1732 return (! REG_P (src)
1733 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1736 /* Split a complex SIMD combine. */
1738 void
1739 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1741 machine_mode src_mode = GET_MODE (src1);
1742 machine_mode dst_mode = GET_MODE (dst);
1744 gcc_assert (VECTOR_MODE_P (dst_mode));
1745 gcc_assert (register_operand (dst, dst_mode)
1746 && register_operand (src1, src_mode)
1747 && register_operand (src2, src_mode));
1749 rtx (*gen) (rtx, rtx, rtx);
1751 switch (src_mode)
1753 case E_V8QImode:
1754 gen = gen_aarch64_simd_combinev8qi;
1755 break;
1756 case E_V4HImode:
1757 gen = gen_aarch64_simd_combinev4hi;
1758 break;
1759 case E_V2SImode:
1760 gen = gen_aarch64_simd_combinev2si;
1761 break;
1762 case E_V4HFmode:
1763 gen = gen_aarch64_simd_combinev4hf;
1764 break;
1765 case E_V2SFmode:
1766 gen = gen_aarch64_simd_combinev2sf;
1767 break;
1768 case E_DImode:
1769 gen = gen_aarch64_simd_combinedi;
1770 break;
1771 case E_DFmode:
1772 gen = gen_aarch64_simd_combinedf;
1773 break;
1774 default:
1775 gcc_unreachable ();
1778 emit_insn (gen (dst, src1, src2));
1779 return;
1782 /* Split a complex SIMD move. */
1784 void
1785 aarch64_split_simd_move (rtx dst, rtx src)
1787 machine_mode src_mode = GET_MODE (src);
1788 machine_mode dst_mode = GET_MODE (dst);
1790 gcc_assert (VECTOR_MODE_P (dst_mode));
1792 if (REG_P (dst) && REG_P (src))
1794 rtx (*gen) (rtx, rtx);
1796 gcc_assert (VECTOR_MODE_P (src_mode));
1798 switch (src_mode)
1800 case E_V16QImode:
1801 gen = gen_aarch64_split_simd_movv16qi;
1802 break;
1803 case E_V8HImode:
1804 gen = gen_aarch64_split_simd_movv8hi;
1805 break;
1806 case E_V4SImode:
1807 gen = gen_aarch64_split_simd_movv4si;
1808 break;
1809 case E_V2DImode:
1810 gen = gen_aarch64_split_simd_movv2di;
1811 break;
1812 case E_V8HFmode:
1813 gen = gen_aarch64_split_simd_movv8hf;
1814 break;
1815 case E_V4SFmode:
1816 gen = gen_aarch64_split_simd_movv4sf;
1817 break;
1818 case E_V2DFmode:
1819 gen = gen_aarch64_split_simd_movv2df;
1820 break;
1821 default:
1822 gcc_unreachable ();
1825 emit_insn (gen (dst, src));
1826 return;
1830 bool
1831 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1832 machine_mode ymode, rtx y)
1834 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1835 gcc_assert (r != NULL);
1836 return rtx_equal_p (x, r);
1840 static rtx
1841 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1843 if (can_create_pseudo_p ())
1844 return force_reg (mode, value);
1845 else
1847 x = aarch64_emit_move (x, value);
1848 return x;
1853 static rtx
1854 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1855 HOST_WIDE_INT offset)
1857 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1859 rtx high;
1860 /* Load the full offset into a register. This
1861 might be improvable in the future. */
1862 high = GEN_INT (offset);
1863 offset = 0;
1864 high = aarch64_force_temporary (mode, temp, high);
1865 reg = aarch64_force_temporary (mode, temp,
1866 gen_rtx_PLUS (mode, high, reg));
1868 return plus_constant (mode, reg, offset);
1871 static int
1872 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1873 scalar_int_mode mode)
1875 int i;
1876 unsigned HOST_WIDE_INT val, val2, mask;
1877 int one_match, zero_match;
1878 int num_insns;
1880 val = INTVAL (imm);
1882 if (aarch64_move_imm (val, mode))
1884 if (generate)
1885 emit_insn (gen_rtx_SET (dest, imm));
1886 return 1;
1889 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1890 (with XXXX non-zero). In that case check to see if the move can be done in
1891 a smaller mode. */
1892 val2 = val & 0xffffffff;
1893 if (mode == DImode
1894 && aarch64_move_imm (val2, SImode)
1895 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1897 if (generate)
1898 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1900       /* Check whether we have to emit a second instruction: see whether any
1901          of the upper 32 bits of the original DImode value are set.  */
1902 if (val == val2)
1903 return 1;
1905 i = (val >> 48) ? 48 : 32;
1907 if (generate)
1908 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1909 GEN_INT ((val >> i) & 0xffff)));
1911 return 2;
1914 if ((val >> 32) == 0 || mode == SImode)
1916 if (generate)
1918 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1919 if (mode == SImode)
1920 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1921 GEN_INT ((val >> 16) & 0xffff)));
1922 else
1923 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1924 GEN_INT ((val >> 16) & 0xffff)));
1926 return 2;
1929 /* Remaining cases are all for DImode. */
1931 mask = 0xffff;
1932 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1933 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1934 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1935 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1937 if (zero_match != 2 && one_match != 2)
1939 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1940 For a 64-bit bitmask try whether changing 16 bits to all ones or
1941 zeroes creates a valid bitmask. To check any repeated bitmask,
1942 try using 16 bits from the other 32-bit half of val. */
1944 for (i = 0; i < 64; i += 16, mask <<= 16)
1946 val2 = val & ~mask;
1947 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1948 break;
1949 val2 = val | mask;
1950 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1951 break;
1952 val2 = val2 & ~mask;
1953 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1954 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1955 break;
1957 if (i != 64)
1959 if (generate)
1961 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1962 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1963 GEN_INT ((val >> i) & 0xffff)));
1965 return 2;
1969   /* Generate 2-4 instructions, skipping any 16-bit chunks of all zeroes or all
1970      ones that are already covered by the initial mov.  If one_match > zero_match,
1971      skip set bits, otherwise skip zero bits.  */
1973 num_insns = 1;
1974 mask = 0xffff;
1975 val2 = one_match > zero_match ? ~val : val;
1976 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1978 if (generate)
1979 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1980 ? (val | ~(mask << i))
1981 : (val & (mask << i)))));
1982 for (i += 16; i < 64; i += 16)
1984 if ((val2 & (mask << i)) == 0)
1985 continue;
1986 if (generate)
1987 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1988 GEN_INT ((val >> i) & 0xffff)));
1989 num_insns ++;
1992 return num_insns;
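/* Worked example (illustrative): for val == 0x0000123400005678 in DImode,
   the low 32 bits (0x5678) are a valid SImode move immediate and bits 48-63
   are clear, so the early exit above emits

       mov   x0, #0x5678
       movk  x0, #0x1234, lsl #32

   and returns 2 (the register name is arbitrary).  */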
1995 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1996 temporary value if necessary. FRAME_RELATED_P should be true if
1997 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1998 to the generated instructions. If SCRATCHREG is known to hold
1999 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2000 immediate again.
2002 Since this function may be used to adjust the stack pointer, we must
2003 ensure that it cannot cause transient stack deallocation (for example
2004 by first incrementing SP and then decrementing when adjusting by a
2005 large immediate). */
2007 static void
2008 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2009 int scratchreg, HOST_WIDE_INT delta,
2010 bool frame_related_p, bool emit_move_imm)
2012 HOST_WIDE_INT mdelta = abs_hwi (delta);
2013 rtx this_rtx = gen_rtx_REG (mode, regnum);
2014 rtx_insn *insn;
2016 if (!mdelta)
2017 return;
2019 /* Single instruction adjustment. */
2020 if (aarch64_uimm12_shift (mdelta))
2022 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2023 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2024 return;
2027 /* Emit 2 additions/subtractions if the adjustment fits in 24 bits.
2028 Only do this if mdelta cannot be loaded with a single move immediate,
2029 as adjusting using a move is better in that case. */
2030 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2032 HOST_WIDE_INT low_off = mdelta & 0xfff;
2034 low_off = delta < 0 ? -low_off : low_off;
2035 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2036 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2037 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2038 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2039 return;
2042 /* Emit a move immediate if required and an addition/subtraction. */
2043 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2044 if (emit_move_imm)
2045 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2046 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2047 : gen_add2_insn (this_rtx, scratch_rtx));
2048 if (frame_related_p)
2050 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2051 rtx adj = plus_constant (mode, this_rtx, delta);
2052 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
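/* Editor's note: illustrative examples of the three paths above, as used by
   aarch64_sub_sp with SP and x16 (IP0) as the scratch register; these are
   sketches, not verbatim compiler output.

       sub  sp, sp, 16               // delta fits a 12-bit immediate

       sub  sp, sp, 0x345            // delta 0x12345: two-instruction path,
       sub  sp, sp, 0x12000          // low 12 bits then the shifted rest

       mov  x16, 0x2000000           // delta >= 1 << 24: move immediate to
       sub  sp, sp, x16              // the scratch register, then subtract  */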
2056 static inline void
2057 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2058 HOST_WIDE_INT delta)
2060 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2063 static inline void
2064 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2066 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2067 true, emit_move_imm);
2070 static inline void
2071 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2073 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2074 frame_related_p, true);
2077 void
2078 aarch64_expand_mov_immediate (rtx dest, rtx imm)
2080 machine_mode mode = GET_MODE (dest);
2082 gcc_assert (mode == SImode || mode == DImode);
2084 /* Check on what type of symbol it is. */
2085 scalar_int_mode int_mode;
2086 if ((GET_CODE (imm) == SYMBOL_REF
2087 || GET_CODE (imm) == LABEL_REF
2088 || GET_CODE (imm) == CONST)
2089 && is_a <scalar_int_mode> (mode, &int_mode))
2091 rtx mem, base, offset;
2092 enum aarch64_symbol_type sty;
2094 /* If we have (const (plus symbol offset)), separate out the offset
2095 before we start classifying the symbol. */
2096 split_const (imm, &base, &offset);
2098 sty = aarch64_classify_symbol (base, offset);
2099 switch (sty)
2101 case SYMBOL_FORCE_TO_MEM:
2102 if (offset != const0_rtx
2103 && targetm.cannot_force_const_mem (int_mode, imm))
2105 gcc_assert (can_create_pseudo_p ());
2106 base = aarch64_force_temporary (int_mode, dest, base);
2107 base = aarch64_add_offset (int_mode, NULL, base,
2108 INTVAL (offset));
2109 aarch64_emit_move (dest, base);
2110 return;
2113 mem = force_const_mem (ptr_mode, imm);
2114 gcc_assert (mem);
2116 /* If we aren't generating PC relative literals, then
2117 we need to expand the literal pool access carefully.
2118 This is something that needs to be done in a number
2119 of places, so could well live as a separate function. */
2120 if (!aarch64_pcrelative_literal_loads)
2122 gcc_assert (can_create_pseudo_p ());
2123 base = gen_reg_rtx (ptr_mode);
2124 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2125 if (ptr_mode != Pmode)
2126 base = convert_memory_address (Pmode, base);
2127 mem = gen_rtx_MEM (ptr_mode, base);
2130 if (int_mode != ptr_mode)
2131 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2133 emit_insn (gen_rtx_SET (dest, mem));
2135 return;
2137 case SYMBOL_SMALL_TLSGD:
2138 case SYMBOL_SMALL_TLSDESC:
2139 case SYMBOL_SMALL_TLSIE:
2140 case SYMBOL_SMALL_GOT_28K:
2141 case SYMBOL_SMALL_GOT_4G:
2142 case SYMBOL_TINY_GOT:
2143 case SYMBOL_TINY_TLSIE:
2144 if (offset != const0_rtx)
2146 gcc_assert (can_create_pseudo_p ());
2147 base = aarch64_force_temporary (int_mode, dest, base);
2148 base = aarch64_add_offset (int_mode, NULL, base,
2149 INTVAL (offset));
2150 aarch64_emit_move (dest, base);
2151 return;
2153 /* FALLTHRU */
2155 case SYMBOL_SMALL_ABSOLUTE:
2156 case SYMBOL_TINY_ABSOLUTE:
2157 case SYMBOL_TLSLE12:
2158 case SYMBOL_TLSLE24:
2159 case SYMBOL_TLSLE32:
2160 case SYMBOL_TLSLE48:
2161 aarch64_load_symref_appropriately (dest, imm, sty);
2162 return;
2164 default:
2165 gcc_unreachable ();
2169 if (!CONST_INT_P (imm))
2171 if (GET_CODE (imm) == HIGH)
2172 emit_insn (gen_rtx_SET (dest, imm));
2173 else
2175 rtx mem = force_const_mem (mode, imm);
2176 gcc_assert (mem);
2177 emit_insn (gen_rtx_SET (dest, mem));
2180 return;
2183 aarch64_internal_mov_immediate (dest, imm, true,
2184 as_a <scalar_int_mode> (mode));
2187 static bool
2188 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2189 tree exp ATTRIBUTE_UNUSED)
2191 /* Currently, always true. */
2192 return true;
2195 /* Implement TARGET_PASS_BY_REFERENCE. */
2197 static bool
2198 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2199 machine_mode mode,
2200 const_tree type,
2201 bool named ATTRIBUTE_UNUSED)
2203 HOST_WIDE_INT size;
2204 machine_mode dummymode;
2205 int nregs;
2207 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2208 size = (mode == BLKmode && type)
2209 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2211 /* Aggregates are passed by reference based on their size. */
2212 if (type && AGGREGATE_TYPE_P (type))
2214 size = int_size_in_bytes (type);
2217 /* Variable sized arguments are always passed by reference. */
2218 if (size < 0)
2219 return true;
2221 /* Can this be a candidate to be passed in fp/simd register(s)? */
2222 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2223 &dummymode, &nregs,
2224 NULL))
2225 return false;
2227 /* Arguments which are variable sized or larger than 2 registers are
2228 passed by reference unless they are a homogeneous floating-point
2229 aggregate. */
2230 return size > 2 * UNITS_PER_WORD;
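/* Editor's note: illustrative AAPCS64 examples (not from the original
   source).  A 12-byte struct of three ints is passed by value, since it
   fits in two GPRs; a 24-byte plain struct is passed by reference, i.e.
   the caller makes a copy and passes its address; a 32-byte struct of four
   doubles is an HFA and is still passed by value in d0-d3.  */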
2233 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2234 static bool
2235 aarch64_return_in_msb (const_tree valtype)
2237 machine_mode dummy_mode;
2238 int dummy_int;
2240 /* Never happens in little-endian mode. */
2241 if (!BYTES_BIG_ENDIAN)
2242 return false;
2244 /* Only composite types of 16 bytes or less can potentially be
2245 returned in registers. */
2246 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2247 || int_size_in_bytes (valtype) <= 0
2248 || int_size_in_bytes (valtype) > 16)
2249 return false;
2251 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2252 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2253 is always passed/returned in the least significant bits of fp/simd
2254 register(s). */
2255 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2256 &dummy_mode, &dummy_int, NULL))
2257 return false;
2259 return true;
2262 /* Implement TARGET_FUNCTION_VALUE.
2263 Define how to find the value returned by a function. */
2265 static rtx
2266 aarch64_function_value (const_tree type, const_tree func,
2267 bool outgoing ATTRIBUTE_UNUSED)
2269 machine_mode mode;
2270 int unsignedp;
2271 int count;
2272 machine_mode ag_mode;
2274 mode = TYPE_MODE (type);
2275 if (INTEGRAL_TYPE_P (type))
2276 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2278 if (aarch64_return_in_msb (type))
2280 HOST_WIDE_INT size = int_size_in_bytes (type);
2282 if (size % UNITS_PER_WORD != 0)
2284 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2285 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2289 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2290 &ag_mode, &count, NULL))
2292 if (!aarch64_composite_type_p (type, mode))
2294 gcc_assert (count == 1 && mode == ag_mode);
2295 return gen_rtx_REG (mode, V0_REGNUM);
2297 else
2299 int i;
2300 rtx par;
2302 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2303 for (i = 0; i < count; i++)
2305 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2306 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2307 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2308 XVECEXP (par, 0, i) = tmp;
2310 return par;
2313 else
2314 return gen_rtx_REG (mode, R0_REGNUM);
2317 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2318 Return true if REGNO is the number of a hard register in which the values
2319 of called function may come back. */
2321 static bool
2322 aarch64_function_value_regno_p (const unsigned int regno)
2324 /* Maximum of 16 bytes can be returned in the general registers. Examples
2325 of 16-byte return values are: 128-bit integers and 16-byte small
2326 structures (excluding homogeneous floating-point aggregates). */
2327 if (regno == R0_REGNUM || regno == R1_REGNUM)
2328 return true;
2330 /* Up to four fp/simd registers can return a function value, e.g. a
2331 homogeneous floating-point aggregate having four members. */
2332 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2333 return TARGET_FLOAT;
2335 return false;
2338 /* Implement TARGET_RETURN_IN_MEMORY.
2340 If the type T of the result of a function is such that
2341 void func (T arg)
2342 would require that arg be passed as a value in a register (or set of
2343 registers) according to the parameter passing rules, then the result
2344 is returned in the same registers as would be used for such an
2345 argument. */
2347 static bool
2348 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2350 HOST_WIDE_INT size;
2351 machine_mode ag_mode;
2352 int count;
2354 if (!AGGREGATE_TYPE_P (type)
2355 && TREE_CODE (type) != COMPLEX_TYPE
2356 && TREE_CODE (type) != VECTOR_TYPE)
2357 /* Simple scalar types are always returned in registers. */
2358 return false;
2360 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2361 type,
2362 &ag_mode,
2363 &count,
2364 NULL))
2365 return false;
2367 /* Types larger than 2 registers are returned in memory. */
2368 size = int_size_in_bytes (type);
2369 return (size < 0 || size > 2 * UNITS_PER_WORD);
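/* Editor's note: illustrative examples (not from the original source).
   __int128 and a 16-byte plain struct are returned in x0/x1; a 24-byte
   struct is returned in memory, with the caller passing the result address
   in x8 as required by AAPCS64; an HFA of four doubles is returned in
   d0-d3.  */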
2372 static bool
2373 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2374 const_tree type, int *nregs)
2376 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2377 return aarch64_vfp_is_call_or_return_candidate (mode,
2378 type,
2379 &pcum->aapcs_vfp_rmode,
2380 nregs,
2381 NULL);
2384 /* Given MODE and TYPE of a function argument, return the alignment in
2385 bits. The idea is to suppress any stronger alignment requested by
2386 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2387 This is a helper function for local use only. */
2389 static unsigned int
2390 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2392 if (!type)
2393 return GET_MODE_ALIGNMENT (mode);
2395 if (integer_zerop (TYPE_SIZE (type)))
2396 return 0;
2398 gcc_assert (TYPE_MODE (type) == mode);
2400 if (!AGGREGATE_TYPE_P (type))
2401 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2403 if (TREE_CODE (type) == ARRAY_TYPE)
2404 return TYPE_ALIGN (TREE_TYPE (type));
2406 unsigned int alignment = 0;
2407 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2408 if (TREE_CODE (field) == FIELD_DECL)
2409 alignment = std::max (alignment, DECL_ALIGN (field));
2411 return alignment;
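/* Editor's note: illustrative examples (not from the original source).
   For __int128 this returns 128 bits; for struct { int a; int b; } it
   returns 32 bits (the largest field alignment); for
   struct { double d; int i; } it returns 64 bits.  */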
2414 /* Layout a function argument according to the AAPCS64 rules. The rule
2415 numbers refer to the rule numbers in the AAPCS64. */
2417 static void
2418 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2419 const_tree type,
2420 bool named ATTRIBUTE_UNUSED)
2422 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2423 int ncrn, nvrn, nregs;
2424 bool allocate_ncrn, allocate_nvrn;
2425 HOST_WIDE_INT size;
2427 /* We need to do this once per argument. */
2428 if (pcum->aapcs_arg_processed)
2429 return;
2431 pcum->aapcs_arg_processed = true;
2433 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2434 size
2435 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2436 UNITS_PER_WORD);
2438 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2439 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2440 mode,
2441 type,
2442 &nregs);
2444 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2445 The following code thus handles passing by SIMD/FP registers first. */
2447 nvrn = pcum->aapcs_nvrn;
2449 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2450 and homogeneous short-vector aggregates (HVA). */
2451 if (allocate_nvrn)
2453 if (!TARGET_FLOAT)
2454 aarch64_err_no_fpadvsimd (mode, "argument");
2456 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2458 pcum->aapcs_nextnvrn = nvrn + nregs;
2459 if (!aarch64_composite_type_p (type, mode))
2461 gcc_assert (nregs == 1);
2462 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2464 else
2466 rtx par;
2467 int i;
2468 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2469 for (i = 0; i < nregs; i++)
2471 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2472 V0_REGNUM + nvrn + i);
2473 tmp = gen_rtx_EXPR_LIST
2474 (VOIDmode, tmp,
2475 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2476 XVECEXP (par, 0, i) = tmp;
2478 pcum->aapcs_reg = par;
2480 return;
2482 else
2484 /* C.3 NSRN is set to 8. */
2485 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2486 goto on_stack;
2490 ncrn = pcum->aapcs_ncrn;
2491 nregs = size / UNITS_PER_WORD;
2493 /* C6 - C9, though the sign and zero extension semantics are
2494 handled elsewhere. This is the case where the argument fits
2495 entirely in general registers. */
2496 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2499 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2501 /* C.8 if the argument has an alignment of 16 then the NGRN is
2502 rounded up to the next even number. */
2503 if (nregs == 2
2504 && ncrn % 2
2505 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2506 comparison is there because for > 16 * BITS_PER_UNIT
2507 alignment nregs should be > 2 and therefore it should be
2508 passed by reference rather than value. */
2509 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2511 ++ncrn;
2512 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2515 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2516 A reg is still generated for it, but the caller should be smart
2517 enough not to use it. */
2518 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2519 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2520 else
2522 rtx par;
2523 int i;
2525 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2526 for (i = 0; i < nregs; i++)
2528 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2529 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2530 GEN_INT (i * UNITS_PER_WORD));
2531 XVECEXP (par, 0, i) = tmp;
2533 pcum->aapcs_reg = par;
2536 pcum->aapcs_nextncrn = ncrn + nregs;
2537 return;
2540 /* C.11 */
2541 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2543 /* The argument is passed on stack; record the needed number of words for
2544 this argument and align the total size if necessary. */
2545 on_stack:
2546 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2548 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2549 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2550 16 / UNITS_PER_WORD);
2551 return;
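/* Editor's note: an illustrative application of the rules above (not from
   the original source).  For void f (int a, double b, __int128 c, int d):
   a goes in w0, b in d0; c needs two GPRs and has 16-byte alignment, so
   rule C.8 rounds the NGRN from 1 up to 2 and c is passed in x2/x3; d then
   goes in w4.  */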
2554 /* Implement TARGET_FUNCTION_ARG. */
2556 static rtx
2557 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2558 const_tree type, bool named)
2560 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2561 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2563 if (mode == VOIDmode)
2564 return NULL_RTX;
2566 aarch64_layout_arg (pcum_v, mode, type, named);
2567 return pcum->aapcs_reg;
2570 void
2571 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2572 const_tree fntype ATTRIBUTE_UNUSED,
2573 rtx libname ATTRIBUTE_UNUSED,
2574 const_tree fndecl ATTRIBUTE_UNUSED,
2575 unsigned n_named ATTRIBUTE_UNUSED)
2577 pcum->aapcs_ncrn = 0;
2578 pcum->aapcs_nvrn = 0;
2579 pcum->aapcs_nextncrn = 0;
2580 pcum->aapcs_nextnvrn = 0;
2581 pcum->pcs_variant = ARM_PCS_AAPCS64;
2582 pcum->aapcs_reg = NULL_RTX;
2583 pcum->aapcs_arg_processed = false;
2584 pcum->aapcs_stack_words = 0;
2585 pcum->aapcs_stack_size = 0;
2587 if (!TARGET_FLOAT
2588 && fndecl && TREE_PUBLIC (fndecl)
2589 && fntype && fntype != error_mark_node)
2591 const_tree type = TREE_TYPE (fntype);
2592 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2593 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2594 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2595 &mode, &nregs, NULL))
2596 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2598 return;
2601 static void
2602 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2603 machine_mode mode,
2604 const_tree type,
2605 bool named)
2607 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2608 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2610 aarch64_layout_arg (pcum_v, mode, type, named);
2611 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2612 != (pcum->aapcs_stack_words != 0));
2613 pcum->aapcs_arg_processed = false;
2614 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2615 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2616 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2617 pcum->aapcs_stack_words = 0;
2618 pcum->aapcs_reg = NULL_RTX;
2622 bool
2623 aarch64_function_arg_regno_p (unsigned regno)
2625 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2626 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2629 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2630 PARM_BOUNDARY bits of alignment, but will be given anything up
2631 to STACK_BOUNDARY bits if the type requires it. This makes sure
2632 that both before and after the layout of each argument, the Next
2633 Stacked Argument Address (NSAA) will have a minimum alignment of
2634 8 bytes. */
2636 static unsigned int
2637 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2639 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2640 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2643 /* Implement TARGET_FUNCTION_ARG_PADDING.
2645 Small aggregate types are placed in the lowest memory address.
2647 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2649 static pad_direction
2650 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2652 /* On little-endian targets, the least significant byte of every stack
2653 argument is passed at the lowest byte address of the stack slot. */
2654 if (!BYTES_BIG_ENDIAN)
2655 return PAD_UPWARD;
2657 /* Otherwise, integral, floating-point and pointer types are padded downward:
2658 the least significant byte of a stack argument is passed at the highest
2659 byte address of the stack slot. */
2660 if (type
2661 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2662 || POINTER_TYPE_P (type))
2663 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2664 return PAD_DOWNWARD;
2666 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2667 return PAD_UPWARD;
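/* Editor's note: illustrative examples (not from the original source).
   On big-endian (aarch64_be) a short passed on the stack is padded
   downward, so its two data bytes occupy the highest-addressed bytes of
   the 8-byte slot, while a two-byte struct is padded upward and sits at
   the lowest address of its slot.  On little-endian everything is
   PAD_UPWARD.  */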
2670 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2672 It specifies padding for the last (possibly the only)
2673 element of a block move between registers and memory.
2674 Assuming the block is in memory, padding upward means that
2675 the last element is padded after its most significant byte,
2676 while with downward padding the last element is padded on
2677 its least significant byte side.
2679 Small aggregates and small complex types are always padded
2680 upwards.
2682 We don't need to worry about homogeneous floating-point or
2683 short-vector aggregates; their move is not affected by the
2684 padding direction determined here. Regardless of endianness,
2685 each element of such an aggregate is put in the least
2686 significant bits of a fp/simd register.
2688 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2689 register has useful data, and return the opposite if the most
2690 significant byte does. */
2692 bool
2693 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2694 bool first ATTRIBUTE_UNUSED)
2697 /* Small composite types are always padded upward. */
2698 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2700 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2701 : GET_MODE_SIZE (mode));
2702 if (size < 2 * UNITS_PER_WORD)
2703 return true;
2706 /* Otherwise, use the default padding. */
2707 return !BYTES_BIG_ENDIAN;
2710 static scalar_int_mode
2711 aarch64_libgcc_cmp_return_mode (void)
2713 return SImode;
2716 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2718 /* We use the 12-bit shifted immediate arithmetic instructions so values
2719 must be multiple of (1 << 12), i.e. 4096. */
2720 #define ARITH_FACTOR 4096
2722 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2723 #error Cannot use simple address calculation for stack probing
2724 #endif
2726 /* The pair of scratch registers used for stack probing. */
2727 #define PROBE_STACK_FIRST_REG 9
2728 #define PROBE_STACK_SECOND_REG 10
2730 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2731 inclusive. These are offsets from the current stack pointer. */
2733 static void
2734 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2736 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2738 /* See the same assertion on PROBE_INTERVAL above. */
2739 gcc_assert ((first % ARITH_FACTOR) == 0);
2741 /* See if we have a constant small number of probes to generate. If so,
2742 that's the easy case. */
2743 if (size <= PROBE_INTERVAL)
2745 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2747 emit_set_insn (reg1,
2748 plus_constant (Pmode,
2749 stack_pointer_rtx, -(first + base)));
2750 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2753 /* The run-time loop is made up of 8 insns in the generic case while the
2754 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2755 else if (size <= 4 * PROBE_INTERVAL)
2757 HOST_WIDE_INT i, rem;
2759 emit_set_insn (reg1,
2760 plus_constant (Pmode,
2761 stack_pointer_rtx,
2762 -(first + PROBE_INTERVAL)));
2763 emit_stack_probe (reg1);
2765 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2766 it exceeds SIZE. If only two probes are needed, this will not
2767 generate any code. Then probe at FIRST + SIZE. */
2768 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2770 emit_set_insn (reg1,
2771 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2772 emit_stack_probe (reg1);
2775 rem = size - (i - PROBE_INTERVAL);
2776 if (rem > 256)
2778 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2780 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2781 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2783 else
2784 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2787 /* Otherwise, do the same as above, but in a loop. Note that we must be
2788 extra careful with variables wrapping around because we might be at
2789 the very top (or the very bottom) of the address space and we have
2790 to be able to handle this case properly; in particular, we use an
2791 equality test for the loop condition. */
2792 else
2794 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2796 /* Step 1: round SIZE to the previous multiple of the interval. */
2798 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2801 /* Step 2: compute initial and final value of the loop counter. */
2803 /* TEST_ADDR = SP + FIRST. */
2804 emit_set_insn (reg1,
2805 plus_constant (Pmode, stack_pointer_rtx, -first));
2807 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2808 HOST_WIDE_INT adjustment = - (first + rounded_size);
2809 if (! aarch64_uimm12_shift (adjustment))
2811 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2812 true, Pmode);
2813 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2815 else
2817 emit_set_insn (reg2,
2818 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2821 /* Step 3: the loop
2825 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2826 probe at TEST_ADDR
2828 while (TEST_ADDR != LAST_ADDR)
2830 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2831 until it is equal to ROUNDED_SIZE. */
2833 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2836 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2837 that SIZE is equal to ROUNDED_SIZE. */
2839 if (size != rounded_size)
2841 HOST_WIDE_INT rem = size - rounded_size;
2843 if (rem > 256)
2845 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2847 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2848 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2850 else
2851 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2855 /* Make sure nothing is scheduled before we are done. */
2856 emit_insn (gen_blockage ());
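/* Editor's note: an illustrative example (not from the original source),
   assuming the default PROBE_INTERVAL of 4096.  A call with FIRST == 16384
   and SIZE == 8192 takes the second branch above and emits two probes,
   stores of xzr at SP - 20480 and SP - 24576, using x9 as the scratch
   register.  */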
2859 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2860 absolute addresses. */
2862 const char *
2863 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2865 static int labelno = 0;
2866 char loop_lab[32];
2867 rtx xops[2];
2869 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2871 /* Loop. */
2872 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2874 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2875 xops[0] = reg1;
2876 xops[1] = GEN_INT (PROBE_INTERVAL);
2877 output_asm_insn ("sub\t%0, %0, %1", xops);
2879 /* Probe at TEST_ADDR. */
2880 output_asm_insn ("str\txzr, [%0]", xops);
2882 /* Test if TEST_ADDR == LAST_ADDR. */
2883 xops[1] = reg2;
2884 output_asm_insn ("cmp\t%0, %1", xops);
2886 /* Branch. */
2887 fputs ("\tb.ne\t", asm_out_file);
2888 assemble_name_raw (asm_out_file, loop_lab);
2889 fputc ('\n', asm_out_file);
2891 return "";
2894 /* Mark the registers that need to be saved by the callee and calculate
2895 the size of the callee-saved registers area and frame record (both FP
2896 and LR may be omitted). */
2897 static void
2898 aarch64_layout_frame (void)
2900 HOST_WIDE_INT offset = 0;
2901 int regno, last_fp_reg = INVALID_REGNUM;
2903 if (reload_completed && cfun->machine->frame.laid_out)
2904 return;
2906 /* Force a frame chain for EH returns so the return address is at FP+8. */
2907 cfun->machine->frame.emit_frame_chain
2908 = frame_pointer_needed || crtl->calls_eh_return;
2910 /* Emit a frame chain if the frame pointer is enabled.
2911 If -momit-leaf-frame-pointer is used, do not use a frame chain
2912 in leaf functions which do not use LR. */
2913 if (flag_omit_frame_pointer == 2
2914 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
2915 && !df_regs_ever_live_p (LR_REGNUM)))
2916 cfun->machine->frame.emit_frame_chain = true;
2918 #define SLOT_NOT_REQUIRED (-2)
2919 #define SLOT_REQUIRED (-1)
2921 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2922 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2924 /* First mark all the registers that really need to be saved... */
2925 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2926 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2928 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2929 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2931 /* ... that includes the eh data registers (if needed)... */
2932 if (crtl->calls_eh_return)
2933 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2934 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2935 = SLOT_REQUIRED;
2937 /* ... and any callee saved register that dataflow says is live. */
2938 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2939 if (df_regs_ever_live_p (regno)
2940 && (regno == R30_REGNUM
2941 || !call_used_regs[regno]))
2942 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2944 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2945 if (df_regs_ever_live_p (regno)
2946 && !call_used_regs[regno])
2948 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2949 last_fp_reg = regno;
2952 if (cfun->machine->frame.emit_frame_chain)
2954 /* FP and LR are placed in the linkage record. */
2955 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2956 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2957 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2958 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2959 offset = 2 * UNITS_PER_WORD;
2962 /* Now assign stack slots for them. */
2963 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2964 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2966 cfun->machine->frame.reg_offset[regno] = offset;
2967 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2968 cfun->machine->frame.wb_candidate1 = regno;
2969 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2970 cfun->machine->frame.wb_candidate2 = regno;
2971 offset += UNITS_PER_WORD;
2974 HOST_WIDE_INT max_int_offset = offset;
2975 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2976 bool has_align_gap = offset != max_int_offset;
2978 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2979 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2981 /* If there is an alignment gap between integer and fp callee-saves,
2982 allocate the last fp register to it if possible. */
2983 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2985 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2986 break;
2989 cfun->machine->frame.reg_offset[regno] = offset;
2990 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2991 cfun->machine->frame.wb_candidate1 = regno;
2992 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2993 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2994 cfun->machine->frame.wb_candidate2 = regno;
2995 offset += UNITS_PER_WORD;
2998 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3000 cfun->machine->frame.saved_regs_size = offset;
3002 HOST_WIDE_INT varargs_and_saved_regs_size
3003 = offset + cfun->machine->frame.saved_varargs_size;
3005 cfun->machine->frame.hard_fp_offset
3006 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
3007 STACK_BOUNDARY / BITS_PER_UNIT);
3009 cfun->machine->frame.frame_size
3010 = ROUND_UP (cfun->machine->frame.hard_fp_offset
3011 + crtl->outgoing_args_size,
3012 STACK_BOUNDARY / BITS_PER_UNIT);
3014 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
3016 cfun->machine->frame.initial_adjust = 0;
3017 cfun->machine->frame.final_adjust = 0;
3018 cfun->machine->frame.callee_adjust = 0;
3019 cfun->machine->frame.callee_offset = 0;
3021 HOST_WIDE_INT max_push_offset = 0;
3022 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
3023 max_push_offset = 512;
3024 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3025 max_push_offset = 256;
3027 if (cfun->machine->frame.frame_size < max_push_offset
3028 && crtl->outgoing_args_size == 0)
3030 /* Simple, small frame with no outgoing arguments:
3031 stp reg1, reg2, [sp, -frame_size]!
3032 stp reg3, reg4, [sp, 16] */
3033 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
3035 else if ((crtl->outgoing_args_size
3036 + cfun->machine->frame.saved_regs_size < 512)
3037 && !(cfun->calls_alloca
3038 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3040 /* Frame with small outgoing arguments:
3041 sub sp, sp, frame_size
3042 stp reg1, reg2, [sp, outgoing_args_size]
3043 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3044 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3045 cfun->machine->frame.callee_offset
3046 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3048 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3050 /* Frame with large outgoing arguments but a small local area:
3051 stp reg1, reg2, [sp, -hard_fp_offset]!
3052 stp reg3, reg4, [sp, 16]
3053 sub sp, sp, outgoing_args_size */
3054 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3055 cfun->machine->frame.final_adjust
3056 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3058 else
3060 /* Frame with large local area and outgoing arguments using frame pointer:
3061 sub sp, sp, hard_fp_offset
3062 stp x29, x30, [sp, 0]
3063 add x29, sp, 0
3064 stp reg3, reg4, [sp, 16]
3065 sub sp, sp, outgoing_args_size */
3066 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3067 cfun->machine->frame.final_adjust
3068 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3071 cfun->machine->frame.laid_out = true;
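/* Editor's note: an illustrative example (not from the original source).
   A function that needs a frame chain, saves only x29, x30 and x19, and
   has 16 bytes of locals and no outgoing arguments gets
   saved_regs_size == 32 and hard_fp_offset == frame_size == 48.  Since 48
   is below max_push_offset (512) and there are no outgoing arguments, the
   first case applies: callee_adjust == 48 and the prologue can allocate
   the whole frame with a single "stp x29, x30, [sp, -48]!".  */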
3074 /* Return true if the register REGNO is saved on entry to
3075 the current function. */
3077 static bool
3078 aarch64_register_saved_on_entry (int regno)
3080 return cfun->machine->frame.reg_offset[regno] >= 0;
3083 /* Return the next register, from REGNO up to LIMIT, that the callee
3084 needs to save. */
3086 static unsigned
3087 aarch64_next_callee_save (unsigned regno, unsigned limit)
3089 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3090 regno ++;
3091 return regno;
3094 /* Push the register number REGNO of mode MODE to the stack with write-back
3095 adjusting the stack by ADJUSTMENT. */
3097 static void
3098 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3099 HOST_WIDE_INT adjustment)
3101 rtx base_rtx = stack_pointer_rtx;
3102 rtx insn, reg, mem;
3104 reg = gen_rtx_REG (mode, regno);
3105 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3106 plus_constant (Pmode, base_rtx, -adjustment));
3107 mem = gen_frame_mem (mode, mem);
3109 insn = emit_move_insn (mem, reg);
3110 RTX_FRAME_RELATED_P (insn) = 1;
3113 /* Generate and return an instruction to store the pair of registers
3114 REG and REG2 of mode MODE to location BASE with write-back adjusting
3115 the stack location BASE by ADJUSTMENT. */
3117 static rtx
3118 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3119 HOST_WIDE_INT adjustment)
3121 switch (mode)
3123 case E_DImode:
3124 return gen_storewb_pairdi_di (base, base, reg, reg2,
3125 GEN_INT (-adjustment),
3126 GEN_INT (UNITS_PER_WORD - adjustment));
3127 case E_DFmode:
3128 return gen_storewb_pairdf_di (base, base, reg, reg2,
3129 GEN_INT (-adjustment),
3130 GEN_INT (UNITS_PER_WORD - adjustment));
3131 default:
3132 gcc_unreachable ();
3136 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3137 stack pointer by ADJUSTMENT. */
3139 static void
3140 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3142 rtx_insn *insn;
3143 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3145 if (regno2 == INVALID_REGNUM)
3146 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3148 rtx reg1 = gen_rtx_REG (mode, regno1);
3149 rtx reg2 = gen_rtx_REG (mode, regno2);
3151 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3152 reg2, adjustment));
3153 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3154 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3155 RTX_FRAME_RELATED_P (insn) = 1;
3158 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
3159 adjusting it by ADJUSTMENT afterwards. */
3161 static rtx
3162 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3163 HOST_WIDE_INT adjustment)
3165 switch (mode)
3167 case E_DImode:
3168 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3169 GEN_INT (UNITS_PER_WORD));
3170 case E_DFmode:
3171 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3172 GEN_INT (UNITS_PER_WORD));
3173 default:
3174 gcc_unreachable ();
3178 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3179 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3180 into CFI_OPS. */
3182 static void
3183 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3184 rtx *cfi_ops)
3186 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3187 rtx reg1 = gen_rtx_REG (mode, regno1);
3189 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3191 if (regno2 == INVALID_REGNUM)
3193 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3194 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3195 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3197 else
3199 rtx reg2 = gen_rtx_REG (mode, regno2);
3200 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3201 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3202 reg2, adjustment));
3206 /* Generate and return a store pair instruction of mode MODE to store
3207 register REG1 to MEM1 and register REG2 to MEM2. */
3209 static rtx
3210 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3211 rtx reg2)
3213 switch (mode)
3215 case E_DImode:
3216 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3218 case E_DFmode:
3219 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3221 default:
3222 gcc_unreachable ();
3226 /* Generate and return a load pair instruction of mode MODE to load register
3227 REG1 from MEM1 and register REG2 from MEM2. */
3229 static rtx
3230 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3231 rtx mem2)
3233 switch (mode)
3235 case E_DImode:
3236 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3238 case E_DFmode:
3239 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3241 default:
3242 gcc_unreachable ();
3246 /* Return TRUE if return address signing should be enabled for the current
3247 function, otherwise return FALSE. */
3249 bool
3250 aarch64_return_address_signing_enabled (void)
3252 /* This function should only be called after the frame is laid out. */
3253 gcc_assert (cfun->machine->frame.laid_out);
3255 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3256 if its LR is pushed onto the stack. */
3257 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3258 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3259 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3262 /* Emit code to save the callee-saved registers from register number START
3263 to LIMIT to the stack at the location starting at offset START_OFFSET,
3264 skipping any write-back candidates if SKIP_WB is true. */
3266 static void
3267 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3268 unsigned start, unsigned limit, bool skip_wb)
3270 rtx_insn *insn;
3271 unsigned regno;
3272 unsigned regno2;
3274 for (regno = aarch64_next_callee_save (start, limit);
3275 regno <= limit;
3276 regno = aarch64_next_callee_save (regno + 1, limit))
3278 rtx reg, mem;
3279 HOST_WIDE_INT offset;
3281 if (skip_wb
3282 && (regno == cfun->machine->frame.wb_candidate1
3283 || regno == cfun->machine->frame.wb_candidate2))
3284 continue;
3286 if (cfun->machine->reg_is_wrapped_separately[regno])
3287 continue;
3289 reg = gen_rtx_REG (mode, regno);
3290 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3291 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3292 offset));
3294 regno2 = aarch64_next_callee_save (regno + 1, limit);
3296 if (regno2 <= limit
3297 && !cfun->machine->reg_is_wrapped_separately[regno2]
3298 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3299 == cfun->machine->frame.reg_offset[regno2]))
3302 rtx reg2 = gen_rtx_REG (mode, regno2);
3303 rtx mem2;
3305 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3306 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3307 offset));
3308 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3309 reg2));
3311 /* The first part of a frame-related parallel insn is
3312 always assumed to be relevant to the frame
3313 calculations; subsequent parts are only
3314 frame-related if explicitly marked. */
3315 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3316 regno = regno2;
3318 else
3319 insn = emit_move_insn (mem, reg);
3321 RTX_FRAME_RELATED_P (insn) = 1;
3325 /* Emit code to restore the callee registers of mode MODE from register
3326 number START up to and including LIMIT. Restore from the stack offset
3327 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3328 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3330 static void
3331 aarch64_restore_callee_saves (machine_mode mode,
3332 HOST_WIDE_INT start_offset, unsigned start,
3333 unsigned limit, bool skip_wb, rtx *cfi_ops)
3335 rtx base_rtx = stack_pointer_rtx;
3336 unsigned regno;
3337 unsigned regno2;
3338 HOST_WIDE_INT offset;
3340 for (regno = aarch64_next_callee_save (start, limit);
3341 regno <= limit;
3342 regno = aarch64_next_callee_save (regno + 1, limit))
3344 if (cfun->machine->reg_is_wrapped_separately[regno])
3345 continue;
3347 rtx reg, mem;
3349 if (skip_wb
3350 && (regno == cfun->machine->frame.wb_candidate1
3351 || regno == cfun->machine->frame.wb_candidate2))
3352 continue;
3354 reg = gen_rtx_REG (mode, regno);
3355 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3356 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3358 regno2 = aarch64_next_callee_save (regno + 1, limit);
3360 if (regno2 <= limit
3361 && !cfun->machine->reg_is_wrapped_separately[regno2]
3362 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3363 == cfun->machine->frame.reg_offset[regno2]))
3365 rtx reg2 = gen_rtx_REG (mode, regno2);
3366 rtx mem2;
3368 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3369 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3370 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3372 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3373 regno = regno2;
3375 else
3376 emit_move_insn (reg, mem);
3377 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3381 static inline bool
3382 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3383 HOST_WIDE_INT offset)
3385 return offset >= -256 && offset < 256;
3388 static inline bool
3389 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3391 return (offset >= 0
3392 && offset < 4096 * GET_MODE_SIZE (mode)
3393 && offset % GET_MODE_SIZE (mode) == 0);
3396 bool
3397 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3399 return (offset >= -64 * GET_MODE_SIZE (mode)
3400 && offset < 64 * GET_MODE_SIZE (mode)
3401 && offset % GET_MODE_SIZE (mode) == 0);
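/* Editor's note (not from the original source): for DImode the three
   predicates above accept, respectively, offsets in [-256, 255] (the
   LDUR/STUR range), [0, 32760] in steps of 8 (unsigned scaled LDR/STR)
   and [-512, 504] in steps of 8 (the LDP/STP immediate range).  */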
3404 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3406 static sbitmap
3407 aarch64_get_separate_components (void)
3409 aarch64_layout_frame ();
3411 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3412 bitmap_clear (components);
3414 /* The registers we need saved to the frame. */
3415 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3416 if (aarch64_register_saved_on_entry (regno))
3418 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3419 if (!frame_pointer_needed)
3420 offset += cfun->machine->frame.frame_size
3421 - cfun->machine->frame.hard_fp_offset;
3422 /* Check that we can access the stack slot of the register with one
3423 direct load with no adjustments needed. */
3424 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3425 bitmap_set_bit (components, regno);
3428 /* Don't mess with the hard frame pointer. */
3429 if (frame_pointer_needed)
3430 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3432 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3433 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3434 /* If aarch64_layout_frame has chosen registers to store/restore with
3435 writeback don't interfere with them to avoid having to output explicit
3436 stack adjustment instructions. */
3437 if (reg2 != INVALID_REGNUM)
3438 bitmap_clear_bit (components, reg2);
3439 if (reg1 != INVALID_REGNUM)
3440 bitmap_clear_bit (components, reg1);
3442 bitmap_clear_bit (components, LR_REGNUM);
3443 bitmap_clear_bit (components, SP_REGNUM);
3445 return components;
3448 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3450 static sbitmap
3451 aarch64_components_for_bb (basic_block bb)
3453 bitmap in = DF_LIVE_IN (bb);
3454 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3455 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3457 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3458 bitmap_clear (components);
3460 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3461 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3462 if ((!call_used_regs[regno])
3463 && (bitmap_bit_p (in, regno)
3464 || bitmap_bit_p (gen, regno)
3465 || bitmap_bit_p (kill, regno)))
3466 bitmap_set_bit (components, regno);
3468 return components;
3471 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3472 Nothing to do for aarch64. */
3474 static void
3475 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3479 /* Return the next set bit in BMP from START onwards. Return the total number
3480 of bits in BMP if no set bit is found at or after START. */
3482 static unsigned int
3483 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3485 unsigned int nbits = SBITMAP_SIZE (bmp);
3486 if (start == nbits)
3487 return start;
3489 gcc_assert (start < nbits);
3490 for (unsigned int i = start; i < nbits; i++)
3491 if (bitmap_bit_p (bmp, i))
3492 return i;
3494 return nbits;
3497 /* Do the work for aarch64_emit_prologue_components and
3498 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3499 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3500 for these components or the epilogue sequence. That is, it determines
3501 whether we should emit stores or loads and what kind of CFA notes to attach
3502 to the insns. Otherwise the logic for the two sequences is very
3503 similar. */
3505 static void
3506 aarch64_process_components (sbitmap components, bool prologue_p)
3508 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3509 ? HARD_FRAME_POINTER_REGNUM
3510 : STACK_POINTER_REGNUM);
3512 unsigned last_regno = SBITMAP_SIZE (components);
3513 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3514 rtx_insn *insn = NULL;
3516 while (regno != last_regno)
3518 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3519 so DFmode for the vector registers is enough. */
3520 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3521 rtx reg = gen_rtx_REG (mode, regno);
3522 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3523 if (!frame_pointer_needed)
3524 offset += cfun->machine->frame.frame_size
3525 - cfun->machine->frame.hard_fp_offset;
3526 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3527 rtx mem = gen_frame_mem (mode, addr);
3529 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3530 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3531 /* No more registers to handle after REGNO.
3532 Emit a single save/restore and exit. */
3533 if (regno2 == last_regno)
3535 insn = emit_insn (set);
3536 RTX_FRAME_RELATED_P (insn) = 1;
3537 if (prologue_p)
3538 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3539 else
3540 add_reg_note (insn, REG_CFA_RESTORE, reg);
3541 break;
3544 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3545 /* The next register is not of the same class or its offset is not
3546 mergeable with the current one into a pair. */
3547 if (!satisfies_constraint_Ump (mem)
3548 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3549 || (offset2 - cfun->machine->frame.reg_offset[regno])
3550 != GET_MODE_SIZE (mode))
3552 insn = emit_insn (set);
3553 RTX_FRAME_RELATED_P (insn) = 1;
3554 if (prologue_p)
3555 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3556 else
3557 add_reg_note (insn, REG_CFA_RESTORE, reg);
3559 regno = regno2;
3560 continue;
3563 /* REGNO2 can be saved/restored in a pair with REGNO. */
3564 rtx reg2 = gen_rtx_REG (mode, regno2);
3565 if (!frame_pointer_needed)
3566 offset2 += cfun->machine->frame.frame_size
3567 - cfun->machine->frame.hard_fp_offset;
3568 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3569 rtx mem2 = gen_frame_mem (mode, addr2);
3570 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3571 : gen_rtx_SET (reg2, mem2);
3573 if (prologue_p)
3574 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3575 else
3576 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3578 RTX_FRAME_RELATED_P (insn) = 1;
3579 if (prologue_p)
3581 add_reg_note (insn, REG_CFA_OFFSET, set);
3582 add_reg_note (insn, REG_CFA_OFFSET, set2);
3584 else
3586 add_reg_note (insn, REG_CFA_RESTORE, reg);
3587 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3590 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3594 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3596 static void
3597 aarch64_emit_prologue_components (sbitmap components)
3599 aarch64_process_components (components, true);
3602 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3604 static void
3605 aarch64_emit_epilogue_components (sbitmap components)
3607 aarch64_process_components (components, false);
3610 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3612 static void
3613 aarch64_set_handled_components (sbitmap components)
3615 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3616 if (bitmap_bit_p (components, regno))
3617 cfun->machine->reg_is_wrapped_separately[regno] = true;
3620 /* AArch64 stack frames generated by this compiler look like:
3622 +-------------------------------+
3624 | incoming stack arguments |
3626 +-------------------------------+
3627 | | <-- incoming stack pointer (aligned)
3628 | callee-allocated save area |
3629 | for register varargs |
3631 +-------------------------------+
3632 | local variables | <-- frame_pointer_rtx
3634 +-------------------------------+
3635 | padding0 | \
3636 +-------------------------------+ |
3637 | callee-saved registers | | frame.saved_regs_size
3638 +-------------------------------+ |
3639 | LR' | |
3640 +-------------------------------+ |
3641 | FP' | / <- hard_frame_pointer_rtx (aligned)
3642 +-------------------------------+
3643 | dynamic allocation |
3644 +-------------------------------+
3645 | padding |
3646 +-------------------------------+
3647 | outgoing stack arguments | <-- arg_pointer
3649 +-------------------------------+
3650 | | <-- stack_pointer_rtx (aligned)
3652 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3653 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3654 unchanged. */
3656 /* Generate the prologue instructions for entry into a function.
3657 Establish the stack frame by decreasing the stack pointer with a
3658 properly calculated size and, if necessary, create a frame record
3659 filled with the values of LR and previous frame pointer. The
3660 current FP is also set up if it is in use. */
3662 void
3663 aarch64_expand_prologue (void)
3665 aarch64_layout_frame ();
3667 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3668 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3669 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3670 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3671 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3672 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3673 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3674 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
3675 rtx_insn *insn;
3677 /* Sign return address for functions. */
3678 if (aarch64_return_address_signing_enabled ())
3680 insn = emit_insn (gen_pacisp ());
3681 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3682 RTX_FRAME_RELATED_P (insn) = 1;
3685 if (flag_stack_usage_info)
3686 current_function_static_stack_size = frame_size;
3688 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3690 if (crtl->is_leaf && !cfun->calls_alloca)
3692 if (frame_size > PROBE_INTERVAL
3693 && frame_size > get_stack_check_protect ())
3694 aarch64_emit_probe_stack_range (get_stack_check_protect (),
3695 (frame_size
3696 - get_stack_check_protect ()));
3698 else if (frame_size > 0)
3699 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
3702 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3704 if (callee_adjust != 0)
3705 aarch64_push_regs (reg1, reg2, callee_adjust);
3707 if (emit_frame_chain)
3709 if (callee_adjust == 0)
3710 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3711 R30_REGNUM, false);
3712 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3713 stack_pointer_rtx,
3714 GEN_INT (callee_offset)));
3715 RTX_FRAME_RELATED_P (insn) = frame_pointer_needed;
3716 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3719 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3720 callee_adjust != 0 || emit_frame_chain);
3721 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3722 callee_adjust != 0 || emit_frame_chain);
3723 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
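/* Editor's note: an illustrative prologue for the small-frame example given
   after aarch64_layout_frame above (a sketch, not verbatim compiler output):

       stp  x29, x30, [sp, -48]!     // callee_adjust push of the wb pair
       add  x29, sp, 0               // establish the frame chain
       str  x19, [sp, 16]            // remaining callee save

   A pacisp would precede this when return-address signing is enabled, and
   a final "sub sp, sp, N" would follow for outgoing arguments.  */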
3726 /* Return TRUE if we can use a simple_return insn.
3728 This function checks whether the callee-saved stack area is empty, which
3729 means no restore actions are needed. The pro_and_epilogue pass uses
3730 this to check whether the shrink-wrapping optimization is feasible. */
3732 bool
3733 aarch64_use_return_insn_p (void)
3735 if (!reload_completed)
3736 return false;
3738 if (crtl->profile)
3739 return false;
3741 aarch64_layout_frame ();
3743 return cfun->machine->frame.frame_size == 0;
3746 /* Generate the epilogue instructions for returning from a function.
3747 This is almost exactly the reverse of the prolog sequence, except
3748 that we need to insert barriers to avoid scheduling loads that read
3749 from a deallocated stack, and we optimize the unwind records by
3750 emitting them all together if possible. */
3751 void
3752 aarch64_expand_epilogue (bool for_sibcall)
3754 aarch64_layout_frame ();
3756 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3757 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3758 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3759 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3760 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3761 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3762 rtx cfi_ops = NULL;
3763 rtx_insn *insn;
3765 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3766 bool need_barrier_p = (get_frame_size ()
3767 + cfun->machine->frame.saved_varargs_size) != 0;
3769 /* Emit a barrier to prevent loads from a deallocated stack. */
3770 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3771 || crtl->calls_eh_return)
3773 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3774 need_barrier_p = false;
3777 /* Restore the stack pointer from the frame pointer if it may not
3778 be the same as the stack pointer. */
3779 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3781 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3782 hard_frame_pointer_rtx,
3783 GEN_INT (-callee_offset)));
3784 /* If writeback is used when restoring callee-saves, the CFA
3785 is restored on the instruction doing the writeback. */
3786 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3788 else
3789 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3791 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3792 callee_adjust != 0, &cfi_ops);
3793 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3794 callee_adjust != 0, &cfi_ops);
3796 if (need_barrier_p)
3797 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3799 if (callee_adjust != 0)
3800 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3802 if (callee_adjust != 0 || initial_adjust > 65536)
3804 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3805 insn = get_last_insn ();
3806 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3807 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3808 RTX_FRAME_RELATED_P (insn) = 1;
3809 cfi_ops = NULL;
3812 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3814 if (cfi_ops)
3816 /* Emit delayed restores and reset the CFA to be SP. */
3817 insn = get_last_insn ();
3818 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3819 REG_NOTES (insn) = cfi_ops;
3820 RTX_FRAME_RELATED_P (insn) = 1;
3823 /* We prefer to emit the combined return/authenticate instruction RETAA;
3824 however, there are three cases in which we must instead emit an explicit
3825 authentication instruction.
3827 1) Sibcalls don't return in a normal way, so if we're about to call one
3828 we must authenticate.
3830 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3831 generating code for !TARGET_ARMV8_3 we can't use it and must
3832 explicitly authenticate.
3834 3) On an eh_return path we make extra stack adjustments to update the
3835 canonical frame address to be the exception handler's CFA. We want
3836 to authenticate using the CFA of the function which calls eh_return.
3838 if (aarch64_return_address_signing_enabled ()
3839 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3841 insn = emit_insn (gen_autisp ());
3842 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3843 RTX_FRAME_RELATED_P (insn) = 1;
3846 /* Stack adjustment for exception handler. */
3847 if (crtl->calls_eh_return)
3849 /* We need to unwind the stack by the offset computed by
3850 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3851 to be SP; letting the CFA move during this adjustment
3852 is just as correct as retaining the CFA from the body
3853 of the function. Therefore, do nothing special. */
3854 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3857 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3858 if (!for_sibcall)
3859 emit_jump_insn (ret_rtx);
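/* Illustrative sketch (not taken from compiler output): for a small frame
   whose prologue was "stp x29, x30, [sp, -32]!; mov x29, sp", the code
   above typically emits the mirror sequence

       ldp x29, x30, [sp], 32
       ret

   with the delayed CFA note re-anchoring the CFA on the stack pointer.
   The exact registers, offsets and unwind notes depend on the layout
   computed by aarch64_layout_frame.  */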
3862 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3863 normally or return to a previous frame after unwinding.
3865 An EH return uses a single shared return sequence. The epilogue is
3866 exactly like a normal epilogue except that it has an extra input
3867 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3868 that must be applied after the frame has been destroyed. An extra label
3869 is inserted before the epilogue which initializes this register to zero,
3870 and this is the entry point for a normal return.
3872 An actual EH return updates the return address, initializes the stack
3873 adjustment and jumps directly into the epilogue (bypassing the zeroing
3874 of the adjustment). Since the return address is typically saved on the
3875 stack when a function makes a call, the saved LR must be updated outside
3876 the epilogue.
3878 This poses problems as the store is generated well before the epilogue,
3879 so the offset of LR is not known yet. Also optimizations will remove the
3880 store as it appears dead, even after the epilogue is generated (as the
3881 base or offset for loading LR is different in many cases).
3883 To avoid these problems this implementation forces the frame pointer
3884 in eh_return functions so that the location of LR is fixed and known early.
3885 It also marks the store volatile, so no optimization is permitted to
3886 remove the store. */
3888 aarch64_eh_return_handler_rtx (void)
3890 rtx tmp = gen_frame_mem (Pmode,
3891 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3893 /* Mark the store volatile, so no optimization is permitted to remove it. */
3894 MEM_VOLATILE_P (tmp) = true;
3895 return tmp;
3898 /* Output code to add DELTA to the first argument, and then jump
3899 to FUNCTION. Used for C++ multiple inheritance. */
3900 static void
3901 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3902 HOST_WIDE_INT delta,
3903 HOST_WIDE_INT vcall_offset,
3904 tree function)
3906 /* The this pointer is always in x0. Note that this differs from
3907 Arm where the this pointer may be bumped to r1 if r0 is required
3908 to return a pointer to an aggregate. On AArch64 a result value
3909 pointer will be in x8. */
3910 int this_regno = R0_REGNUM;
3911 rtx this_rtx, temp0, temp1, addr, funexp;
3912 rtx_insn *insn;
3914 reload_completed = 1;
3915 emit_note (NOTE_INSN_PROLOGUE_END);
3917 if (vcall_offset == 0)
3918 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3919 else
3921 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3923 this_rtx = gen_rtx_REG (Pmode, this_regno);
3924 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3925 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3927 addr = this_rtx;
3928 if (delta != 0)
3930 if (delta >= -256 && delta < 256)
3931 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3932 plus_constant (Pmode, this_rtx, delta));
3933 else
3934 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3937 if (Pmode == ptr_mode)
3938 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3939 else
3940 aarch64_emit_move (temp0,
3941 gen_rtx_ZERO_EXTEND (Pmode,
3942 gen_rtx_MEM (ptr_mode, addr)));
3944 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3945 addr = plus_constant (Pmode, temp0, vcall_offset);
3946 else
3948 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3949 Pmode);
3950 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3953 if (Pmode == ptr_mode)
3954 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3955 else
3956 aarch64_emit_move (temp1,
3957 gen_rtx_SIGN_EXTEND (Pmode,
3958 gen_rtx_MEM (ptr_mode, addr)));
3960 emit_insn (gen_add2_insn (this_rtx, temp1));
3963 /* Generate a tail call to the target function. */
3964 if (!TREE_USED (function))
3966 assemble_external (function);
3967 TREE_USED (function) = 1;
3969 funexp = XEXP (DECL_RTL (function), 0);
3970 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3971 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3972 SIBLING_CALL_P (insn) = 1;
3974 insn = get_insns ();
3975 shorten_branches (insn);
3976 final_start_function (insn, file, 1);
3977 final (insn, file, 1);
3978 final_end_function ();
3980 /* Stop pretending to be a post-reload pass. */
3981 reload_completed = 0;
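/* Illustrative sketch (hypothetical output): a thunk with DELTA == 16 and
   VCALL_OFFSET == 0 only has to bump the this pointer in x0 and tail-call
   FUNCTION, so the code above typically assembles to

       add x0, x0, #16
       b   <function>

   Out-of-range deltas and non-zero vcall offsets additionally use the
   IP0/IP1 scratch registers (x16/x17), as in the code above.  */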
3984 static bool
3985 aarch64_tls_referenced_p (rtx x)
3987 if (!TARGET_HAVE_TLS)
3988 return false;
3989 subrtx_iterator::array_type array;
3990 FOR_EACH_SUBRTX (iter, array, x, ALL)
3992 const_rtx x = *iter;
3993 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3994 return true;
3995 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3996 TLS offsets, not real symbol references. */
3997 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3998 iter.skip_subrtxes ();
4000 return false;
4004 /* Return true if val can be encoded as a 12-bit unsigned immediate with
4005 a left shift of 0 or 12 bits. */
4006 bool
4007 aarch64_uimm12_shift (HOST_WIDE_INT val)
4009 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
4010 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
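/* Illustrative sketch, not part of the build: a self-contained equivalent
   of the check above, usable for experimentation outside GCC.  The helper
   name is made up.  */
#if 0
static bool
uimm12_shift_example (unsigned long long val)
{
  /* All significant bits must sit entirely in bits [0,11] or [12,23],
     matching the ADD/SUB immediate field with LSL #0 or LSL #12.  */
  return (val & 0xfffULL) == val || (val & (0xfffULL << 12)) == val;
}

/* uimm12_shift_example (0xabc)    -> true  (add x0, x1, #0xabc)
   uimm12_shift_example (0xabc000) -> true  (add x0, x1, #0xabc, lsl #12)
   uimm12_shift_example (0xabc001) -> false (not a single ADD/SUB immediate).  */
#endif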
4015 /* Return true if val is an immediate that can be loaded into a
4016 register by a MOVZ instruction. */
4017 static bool
4018 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
4020 if (GET_MODE_SIZE (mode) > 4)
4022 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
4023 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
4024 return 1;
4026 else
4028 /* Ignore sign extension. */
4029 val &= (HOST_WIDE_INT) 0xffffffff;
4031 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4032 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
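/* Illustrative examples for the check above (64-bit case): a value is a
   MOVZ immediate when its non-zero bits all fall inside one 16-bit chunk,
   i.e. what "movz xN, #imm16, lsl #0/16/32/48" can materialize.
     0x0000000000001234  -> movz x0, #0x1234
     0x0000123400000000  -> movz x0, #0x1234, lsl #32
     0x0000000012340001  -> rejected here (two chunks set); the caller
                            aarch64_move_imm then tries MOVN and the
                            bitmask-immediate encoding instead.  */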
4035 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4037 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4039 0x0000000100000001ull,
4040 0x0001000100010001ull,
4041 0x0101010101010101ull,
4042 0x1111111111111111ull,
4043 0x5555555555555555ull,
4047 /* Return true if val is a valid bitmask immediate. */
4049 bool
4050 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4052 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4053 int bits;
4055 /* Check for a single sequence of one bits and return quickly if so.
4056 The special cases of all ones and all zeroes return false. */
4057 val = (unsigned HOST_WIDE_INT) val_in;
4058 tmp = val + (val & -val);
4060 if (tmp == (tmp & -tmp))
4061 return (val + 1) > 1;
4063 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4064 if (mode == SImode)
4065 val = (val << 32) | (val & 0xffffffff);
4067 /* Invert if the immediate doesn't start with a zero bit - this means we
4068 only need to search for sequences of one bits. */
4069 if (val & 1)
4070 val = ~val;
4072 /* Find the first set bit and set tmp to val with the first sequence of one
4073 bits removed. Return success if there is a single sequence of ones. */
4074 first_one = val & -val;
4075 tmp = val & (val + first_one);
4077 if (tmp == 0)
4078 return true;
4080 /* Find the next set bit and compute the difference in bit position. */
4081 next_one = tmp & -tmp;
4082 bits = clz_hwi (first_one) - clz_hwi (next_one);
4083 mask = val ^ tmp;
4085 /* Check the bit position difference is a power of 2, and that the first
4086 sequence of one bits fits within 'bits' bits. */
4087 if ((mask >> bits) != 0 || bits != (bits & -bits))
4088 return false;
4090 /* Check the sequence of one bits is repeated 64/bits times. */
4091 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
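/* Illustrative reference implementation, not part of the build: an AArch64
   logical (bitmask) immediate is a power-of-two sized element of 2..64 bits
   containing a single contiguous run of ones (never all ones), rotated by
   any amount and replicated to fill 64 bits.  The brute-force check below
   enumerates that definition directly and should agree with
   aarch64_bitmask_imm for DImode values; the helper name is made up.  */
#if 0
static bool
bitmask_imm_reference (unsigned long long val)
{
  for (int size = 2; size <= 64; size *= 2)
    for (int len = 1; len < size; len++)
      for (int rot = 0; rot < size; rot++)
	{
	  unsigned long long size_mask
	    = size == 64 ? ~0ULL : (1ULL << size) - 1;
	  /* A run of LEN ones, rotated left by ROT within the element.  */
	  unsigned long long elt = (1ULL << len) - 1;
	  if (rot)
	    elt = ((elt << rot) | (elt >> (size - rot))) & size_mask;
	  /* Replicate the element across all 64 bits.  */
	  unsigned long long rep = 0;
	  for (int i = 0; i < 64; i += size)
	    rep |= elt << i;
	  if (rep == val)
	    return true;
	}
  return false;
}

/* Examples: 0x5555555555555555 and 0x00ff00ff00ff00ff are valid (2-bit and
   16-bit elements); 0 and ~0ULL are not encodable.  */
#endif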
4094 /* Create a mask of ones covering the range from the lowest to the highest bit set in VAL_IN.
4095 Assumed precondition: VAL_IN is not zero. */
4097 unsigned HOST_WIDE_INT
4098 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4100 int lowest_bit_set = ctz_hwi (val_in);
4101 int highest_bit_set = floor_log2 (val_in);
4102 gcc_assert (val_in != 0);
4104 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4105 (HOST_WIDE_INT_1U << lowest_bit_set));
4108 /* Create a constant where the bits outside the range from the lowest to the
4109 highest bit set in VAL_IN are set to 1. */
4111 unsigned HOST_WIDE_INT
4112 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4114 return val_in | ~aarch64_and_split_imm1 (val_in);
4117 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4119 bool
4120 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4122 scalar_int_mode int_mode;
4123 if (!is_a <scalar_int_mode> (mode, &int_mode))
4124 return false;
4126 if (aarch64_bitmask_imm (val_in, int_mode))
4127 return false;
4129 if (aarch64_move_imm (val_in, int_mode))
4130 return false;
4132 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4134 return aarch64_bitmask_imm (imm2, int_mode);
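/* Worked example (illustrative): val_in = 0x00ffff00ffff0000 is neither a
   bitmask immediate nor a MOV immediate, but it can be split as
     imm1 = aarch64_and_split_imm1 (val_in) = 0x00ffffffffff0000
     imm2 = aarch64_and_split_imm2 (val_in) = 0xffffff00ffffffff
   where imm1 & imm2 == val_in and both are valid bitmask immediates, so
     and x0, x1, #0x00ffff00ffff0000
   can be emitted as
     and x0, x1, #0x00ffffffffff0000
     and x0, x0, #0xffffff00ffffffff  */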
4137 /* Return true if val is an immediate that can be loaded into a
4138 register in a single instruction. */
4139 bool
4140 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4142 scalar_int_mode int_mode;
4143 if (!is_a <scalar_int_mode> (mode, &int_mode))
4144 return false;
4146 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4147 return 1;
4148 return aarch64_bitmask_imm (val, int_mode);
4151 static bool
4152 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4154 rtx base, offset;
4156 if (GET_CODE (x) == HIGH)
4157 return true;
4159 split_const (x, &base, &offset);
4160 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4162 if (aarch64_classify_symbol (base, offset)
4163 != SYMBOL_FORCE_TO_MEM)
4164 return true;
4165 else
4166 /* Avoid generating a 64-bit relocation in ILP32; leave
4167 to aarch64_expand_mov_immediate to handle it properly. */
4168 return mode != ptr_mode;
4171 return aarch64_tls_referenced_p (x);
4174 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4175 The expansion of a table switch is quite expensive due to the number
4176 of instructions, the table lookup and the hard-to-predict indirect jump.
4177 When optimizing for speed with -O3 enabled, use the per-core tuning if
4178 it is set, otherwise use tables for more than 16 cases as a tradeoff
4179 between size and performance. When optimizing for size, use the default setting. */
4181 static unsigned int
4182 aarch64_case_values_threshold (void)
4184 /* Use the specified limit for the number of cases before using jump
4185 tables at higher optimization levels. */
4186 if (optimize > 2
4187 && selected_cpu->tune->max_case_values != 0)
4188 return selected_cpu->tune->max_case_values;
4189 else
4190 return optimize_size ? default_case_values_threshold () : 17;
4193 /* Return true if register REGNO is a valid index register.
4194 STRICT_P is true if REG_OK_STRICT is in effect. */
4196 bool
4197 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4199 if (!HARD_REGISTER_NUM_P (regno))
4201 if (!strict_p)
4202 return true;
4204 if (!reg_renumber)
4205 return false;
4207 regno = reg_renumber[regno];
4209 return GP_REGNUM_P (regno);
4212 /* Return true if register REGNO is a valid base register.
4213 STRICT_P is true if REG_OK_STRICT is in effect. */
4215 bool
4216 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4218 if (!HARD_REGISTER_NUM_P (regno))
4220 if (!strict_p)
4221 return true;
4223 if (!reg_renumber)
4224 return false;
4226 regno = reg_renumber[regno];
4229 /* The fake registers will be eliminated to either the stack or
4230 hard frame pointer, both of which are usually valid base registers.
4231 Reload deals with the cases where the eliminated form isn't valid. */
4232 return (GP_REGNUM_P (regno)
4233 || regno == SP_REGNUM
4234 || regno == FRAME_POINTER_REGNUM
4235 || regno == ARG_POINTER_REGNUM);
4238 /* Return true if X is a valid base register.
4239 STRICT_P is true if REG_OK_STRICT is in effect. */
4241 static bool
4242 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4244 if (!strict_p
4245 && GET_CODE (x) == SUBREG
4246 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4247 x = SUBREG_REG (x);
4249 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4252 /* Return true if the address offset X is a valid index. If it is, fill in INFO
4253 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4255 static bool
4256 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4257 machine_mode mode, bool strict_p)
4259 enum aarch64_address_type type;
4260 rtx index;
4261 int shift;
4263 /* (reg:P) */
4264 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4265 && GET_MODE (x) == Pmode)
4267 type = ADDRESS_REG_REG;
4268 index = x;
4269 shift = 0;
4271 /* (sign_extend:DI (reg:SI)) */
4272 else if ((GET_CODE (x) == SIGN_EXTEND
4273 || GET_CODE (x) == ZERO_EXTEND)
4274 && GET_MODE (x) == DImode
4275 && GET_MODE (XEXP (x, 0)) == SImode)
4277 type = (GET_CODE (x) == SIGN_EXTEND)
4278 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4279 index = XEXP (x, 0);
4280 shift = 0;
4282 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4283 else if (GET_CODE (x) == MULT
4284 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4285 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4286 && GET_MODE (XEXP (x, 0)) == DImode
4287 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4288 && CONST_INT_P (XEXP (x, 1)))
4290 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4291 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4292 index = XEXP (XEXP (x, 0), 0);
4293 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4295 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4296 else if (GET_CODE (x) == ASHIFT
4297 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4298 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4299 && GET_MODE (XEXP (x, 0)) == DImode
4300 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4301 && CONST_INT_P (XEXP (x, 1)))
4303 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4304 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4305 index = XEXP (XEXP (x, 0), 0);
4306 shift = INTVAL (XEXP (x, 1));
4308 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4309 else if ((GET_CODE (x) == SIGN_EXTRACT
4310 || GET_CODE (x) == ZERO_EXTRACT)
4311 && GET_MODE (x) == DImode
4312 && GET_CODE (XEXP (x, 0)) == MULT
4313 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4314 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4316 type = (GET_CODE (x) == SIGN_EXTRACT)
4317 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4318 index = XEXP (XEXP (x, 0), 0);
4319 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4320 if (INTVAL (XEXP (x, 1)) != 32 + shift
4321 || INTVAL (XEXP (x, 2)) != 0)
4322 shift = -1;
4324 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4325 (const_int 0xffffffff<<shift)) */
4326 else if (GET_CODE (x) == AND
4327 && GET_MODE (x) == DImode
4328 && GET_CODE (XEXP (x, 0)) == MULT
4329 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4330 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4331 && CONST_INT_P (XEXP (x, 1)))
4333 type = ADDRESS_REG_UXTW;
4334 index = XEXP (XEXP (x, 0), 0);
4335 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4336 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4337 shift = -1;
4339 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4340 else if ((GET_CODE (x) == SIGN_EXTRACT
4341 || GET_CODE (x) == ZERO_EXTRACT)
4342 && GET_MODE (x) == DImode
4343 && GET_CODE (XEXP (x, 0)) == ASHIFT
4344 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4345 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4347 type = (GET_CODE (x) == SIGN_EXTRACT)
4348 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4349 index = XEXP (XEXP (x, 0), 0);
4350 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4351 if (INTVAL (XEXP (x, 1)) != 32 + shift
4352 || INTVAL (XEXP (x, 2)) != 0)
4353 shift = -1;
4355 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4356 (const_int 0xffffffff<<shift)) */
4357 else if (GET_CODE (x) == AND
4358 && GET_MODE (x) == DImode
4359 && GET_CODE (XEXP (x, 0)) == ASHIFT
4360 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4361 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4362 && CONST_INT_P (XEXP (x, 1)))
4364 type = ADDRESS_REG_UXTW;
4365 index = XEXP (XEXP (x, 0), 0);
4366 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4367 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4368 shift = -1;
4370 /* (mult:P (reg:P) (const_int scale)) */
4371 else if (GET_CODE (x) == MULT
4372 && GET_MODE (x) == Pmode
4373 && GET_MODE (XEXP (x, 0)) == Pmode
4374 && CONST_INT_P (XEXP (x, 1)))
4376 type = ADDRESS_REG_REG;
4377 index = XEXP (x, 0);
4378 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4380 /* (ashift:P (reg:P) (const_int shift)) */
4381 else if (GET_CODE (x) == ASHIFT
4382 && GET_MODE (x) == Pmode
4383 && GET_MODE (XEXP (x, 0)) == Pmode
4384 && CONST_INT_P (XEXP (x, 1)))
4386 type = ADDRESS_REG_REG;
4387 index = XEXP (x, 0);
4388 shift = INTVAL (XEXP (x, 1));
4390 else
4391 return false;
4393 if (!strict_p
4394 && GET_CODE (index) == SUBREG
4395 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4396 index = SUBREG_REG (index);
4398 if ((shift == 0 ||
4399 (shift > 0 && shift <= 3
4400 && (1 << shift) == GET_MODE_SIZE (mode)))
4401 && REG_P (index)
4402 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4404 info->type = type;
4405 info->offset = index;
4406 info->shift = shift;
4407 return true;
4410 return false;
4413 /* Return true if MODE is one of the modes for which we
4414 support LDP/STP operations. */
4416 static bool
4417 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4419 return mode == SImode || mode == DImode
4420 || mode == SFmode || mode == DFmode
4421 || (aarch64_vector_mode_supported_p (mode)
4422 && GET_MODE_SIZE (mode) == 8);
4425 /* Return true if REGNO is a virtual pointer register, or an eliminable
4426 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4427 include stack_pointer or hard_frame_pointer. */
4428 static bool
4429 virt_or_elim_regno_p (unsigned regno)
4431 return ((regno >= FIRST_VIRTUAL_REGISTER
4432 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4433 || regno == FRAME_POINTER_REGNUM
4434 || regno == ARG_POINTER_REGNUM);
4437 /* Return true if X is a valid address for machine mode MODE. If it is,
4438 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4439 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4441 static bool
4442 aarch64_classify_address (struct aarch64_address_info *info,
4443 rtx x, machine_mode mode,
4444 RTX_CODE outer_code, bool strict_p)
4446 enum rtx_code code = GET_CODE (x);
4447 rtx op0, op1;
4449 /* On BE, we use load/store pair for all large int mode load/stores.
4450 TI/TFmode may also use a load/store pair. */
4451 bool load_store_pair_p = (outer_code == PARALLEL
4452 || mode == TImode
4453 || mode == TFmode
4454 || (BYTES_BIG_ENDIAN
4455 && aarch64_vect_struct_mode_p (mode)));
4457 bool allow_reg_index_p =
4458 !load_store_pair_p
4459 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4460 && !aarch64_vect_struct_mode_p (mode);
4462 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4463 REG addressing. */
4464 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4465 && (code != POST_INC && code != REG))
4466 return false;
4468 switch (code)
4470 case REG:
4471 case SUBREG:
4472 info->type = ADDRESS_REG_IMM;
4473 info->base = x;
4474 info->offset = const0_rtx;
4475 return aarch64_base_register_rtx_p (x, strict_p);
4477 case PLUS:
4478 op0 = XEXP (x, 0);
4479 op1 = XEXP (x, 1);
4481 if (! strict_p
4482 && REG_P (op0)
4483 && virt_or_elim_regno_p (REGNO (op0))
4484 && CONST_INT_P (op1))
4486 info->type = ADDRESS_REG_IMM;
4487 info->base = op0;
4488 info->offset = op1;
4490 return true;
4493 if (GET_MODE_SIZE (mode) != 0
4494 && CONST_INT_P (op1)
4495 && aarch64_base_register_rtx_p (op0, strict_p))
4497 HOST_WIDE_INT offset = INTVAL (op1);
4499 info->type = ADDRESS_REG_IMM;
4500 info->base = op0;
4501 info->offset = op1;
4503 /* TImode and TFmode values are allowed in both pairs of X
4504 registers and individual Q registers. The available
4505 address modes are:
4506 X,X: 7-bit signed scaled offset
4507 Q: 9-bit signed offset
4508 We conservatively require an offset representable in either mode.
4509 When performing the check for pairs of X registers i.e. LDP/STP
4510 pass down DImode since that is the natural size of the LDP/STP
4511 instruction memory accesses. */
4512 if (mode == TImode || mode == TFmode)
4513 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4514 && (offset_9bit_signed_unscaled_p (mode, offset)
4515 || offset_12bit_unsigned_scaled_p (mode, offset)));
4517 /* A 7-bit offset check because OImode will emit an ldp/stp
4518 instruction (only big endian will get here).
4519 For ldp/stp instructions, the offset is scaled for the size of a
4520 single element of the pair. */
4521 if (mode == OImode)
4522 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4524 /* Three 9/12-bit offset checks because CImode will emit three
4525 ldr/str instructions (only big endian will get here). */
4526 if (mode == CImode)
4527 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4528 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4529 || offset_12bit_unsigned_scaled_p (V16QImode,
4530 offset + 32)));
4532 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4533 instructions (only big endian will get here). */
4534 if (mode == XImode)
4535 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4536 && aarch64_offset_7bit_signed_scaled_p (TImode,
4537 offset + 32));
4539 if (load_store_pair_p)
4540 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4541 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4542 else
4543 return (offset_9bit_signed_unscaled_p (mode, offset)
4544 || offset_12bit_unsigned_scaled_p (mode, offset));
4547 if (allow_reg_index_p)
4549 /* Look for base + (scaled/extended) index register. */
4550 if (aarch64_base_register_rtx_p (op0, strict_p)
4551 && aarch64_classify_index (info, op1, mode, strict_p))
4553 info->base = op0;
4554 return true;
4556 if (aarch64_base_register_rtx_p (op1, strict_p)
4557 && aarch64_classify_index (info, op0, mode, strict_p))
4559 info->base = op1;
4560 return true;
4564 return false;
4566 case POST_INC:
4567 case POST_DEC:
4568 case PRE_INC:
4569 case PRE_DEC:
4570 info->type = ADDRESS_REG_WB;
4571 info->base = XEXP (x, 0);
4572 info->offset = NULL_RTX;
4573 return aarch64_base_register_rtx_p (info->base, strict_p);
4575 case POST_MODIFY:
4576 case PRE_MODIFY:
4577 info->type = ADDRESS_REG_WB;
4578 info->base = XEXP (x, 0);
4579 if (GET_CODE (XEXP (x, 1)) == PLUS
4580 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4581 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4582 && aarch64_base_register_rtx_p (info->base, strict_p))
4584 HOST_WIDE_INT offset;
4585 info->offset = XEXP (XEXP (x, 1), 1);
4586 offset = INTVAL (info->offset);
4588 /* TImode and TFmode values are allowed in both pairs of X
4589 registers and individual Q registers. The available
4590 address modes are:
4591 X,X: 7-bit signed scaled offset
4592 Q: 9-bit signed offset
4593 We conservatively require an offset representable in either mode.
4595 if (mode == TImode || mode == TFmode)
4596 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4597 && offset_9bit_signed_unscaled_p (mode, offset));
4599 if (load_store_pair_p)
4600 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4601 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4602 else
4603 return offset_9bit_signed_unscaled_p (mode, offset);
4605 return false;
4607 case CONST:
4608 case SYMBOL_REF:
4609 case LABEL_REF:
4610 /* load literal: pc-relative constant pool entry. Only supported
4611 for SI mode or larger. */
4612 info->type = ADDRESS_SYMBOLIC;
4614 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4616 rtx sym, addend;
4618 split_const (x, &sym, &addend);
4619 return ((GET_CODE (sym) == LABEL_REF
4620 || (GET_CODE (sym) == SYMBOL_REF
4621 && CONSTANT_POOL_ADDRESS_P (sym)
4622 && aarch64_pcrelative_literal_loads)));
4624 return false;
4626 case LO_SUM:
4627 info->type = ADDRESS_LO_SUM;
4628 info->base = XEXP (x, 0);
4629 info->offset = XEXP (x, 1);
4630 if (allow_reg_index_p
4631 && aarch64_base_register_rtx_p (info->base, strict_p))
4633 rtx sym, offs;
4634 split_const (info->offset, &sym, &offs);
4635 if (GET_CODE (sym) == SYMBOL_REF
4636 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4638 /* The symbol and offset must be aligned to the access size. */
4639 unsigned int align;
4640 unsigned int ref_size;
4642 if (CONSTANT_POOL_ADDRESS_P (sym))
4643 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4644 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4646 tree exp = SYMBOL_REF_DECL (sym);
4647 align = TYPE_ALIGN (TREE_TYPE (exp));
4648 align = aarch64_constant_alignment (exp, align);
4650 else if (SYMBOL_REF_DECL (sym))
4651 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4652 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4653 && SYMBOL_REF_BLOCK (sym) != NULL)
4654 align = SYMBOL_REF_BLOCK (sym)->alignment;
4655 else
4656 align = BITS_PER_UNIT;
4658 ref_size = GET_MODE_SIZE (mode);
4659 if (ref_size == 0)
4660 ref_size = GET_MODE_SIZE (DImode);
4662 return ((INTVAL (offs) & (ref_size - 1)) == 0
4663 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4666 return false;
4668 default:
4669 return false;
4673 /* Return true if the address X is valid for a PRFM instruction.
4674 STRICT_P is true if we should do strict checking with
4675 aarch64_classify_address. */
4677 bool
4678 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4680 struct aarch64_address_info addr;
4682 /* PRFM accepts the same addresses as DImode... */
4683 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4684 if (!res)
4685 return false;
4687 /* ... except writeback forms. */
4688 return addr.type != ADDRESS_REG_WB;
4691 bool
4692 aarch64_symbolic_address_p (rtx x)
4694 rtx offset;
4696 split_const (x, &x, &offset);
4697 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4700 /* Classify the base of symbolic expression X. */
4702 enum aarch64_symbol_type
4703 aarch64_classify_symbolic_expression (rtx x)
4705 rtx offset;
4707 split_const (x, &x, &offset);
4708 return aarch64_classify_symbol (x, offset);
4712 /* Return TRUE if X is a legitimate address for accessing memory in
4713 mode MODE. */
4714 static bool
4715 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4717 struct aarch64_address_info addr;
4719 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4722 /* Return TRUE if X is a legitimate address for accessing memory in
4723 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4724 pair operation. */
4725 bool
4726 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4727 RTX_CODE outer_code, bool strict_p)
4729 struct aarch64_address_info addr;
4731 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4734 /* Split an out-of-range address displacement into a base and offset.
4735 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4736 to increase opportunities for sharing the base address between accesses of different sizes.
4737 Unaligned accesses use the signed 9-bit range; TImode/TFmode use
4738 the intersection of the signed scaled 7-bit and signed 9-bit offset ranges. */
4739 static bool
4740 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4742 HOST_WIDE_INT offset = INTVAL (*disp);
4743 HOST_WIDE_INT base;
4745 if (mode == TImode || mode == TFmode)
4746 base = (offset + 0x100) & ~0x1f8;
4747 else if ((offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4748 base = (offset + 0x100) & ~0x1ff;
4749 else
4750 base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4752 *off = GEN_INT (base);
4753 *disp = GEN_INT (offset - base);
4754 return true;
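/* Worked example (illustrative): a DImode access at displacement 0x9008 is
   out of range for a single LDR/STR (the scaled 12-bit limit is 0x7ff8),
   so the code above splits it as
     base     = 0x9008 & ~0x3ffc = 0x8000
     residual = 0x1008
   and the residual fits the scaled unsigned 12-bit offset range.  Nearby
   accesses of other sizes can share the same 0x8000-anchored base.  */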
4757 /* Return the binary representation of floating point constant VALUE in INTVAL.
4758 If the value cannot be converted, return false without setting INTVAL.
4759 The conversion is done in the mode of VALUE. */
4760 bool
4761 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4764 /* We make a general exception for 0. */
4765 if (aarch64_float_const_zero_rtx_p (value))
4767 *intval = 0;
4768 return true;
4771 machine_mode mode = GET_MODE (value);
4772 if (GET_CODE (value) != CONST_DOUBLE
4773 || !SCALAR_FLOAT_MODE_P (mode)
4774 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4775 /* Only support up to DF mode. */
4776 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4777 return false;
4779 unsigned HOST_WIDE_INT ival = 0;
4781 long res[2];
4782 real_to_target (res,
4783 CONST_DOUBLE_REAL_VALUE (value),
4784 REAL_MODE_FORMAT (mode));
4786 if (mode == DFmode)
4788 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4789 ival = zext_hwi (res[order], 32);
4790 ival |= (zext_hwi (res[1 - order], 32) << 32);
4792 else
4793 ival = zext_hwi (res[0], 32);
4795 *intval = ival;
4796 return true;
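/* Worked example (illustrative): the IEEE double 1.0 has the bit pattern
   0x3ff0000000000000, so for a DFmode CONST_DOUBLE of 1.0 the routine
   above sets *INTVAL to that value; callers such as
   aarch64_float_const_rtx_p then ask how many MOV/MOVK instructions are
   needed to build it as an integer.  */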
4799 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4800 single MOV(+MOVK) followed by an FMOV. */
4801 bool
4802 aarch64_float_const_rtx_p (rtx x)
4804 machine_mode mode = GET_MODE (x);
4805 if (mode == VOIDmode)
4806 return false;
4808 /* Determine whether it's cheaper to write float constants as
4809 mov/movk pairs than as ldr/adrp pairs. */
4810 unsigned HOST_WIDE_INT ival;
4812 if (GET_CODE (x) == CONST_DOUBLE
4813 && SCALAR_FLOAT_MODE_P (mode)
4814 && aarch64_reinterpret_float_as_int (x, &ival))
4816 scalar_int_mode imode = (mode == HFmode
4817 ? SImode
4818 : int_mode_for_mode (mode).require ());
4819 int num_instr = aarch64_internal_mov_immediate
4820 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4821 return num_instr < 3;
4824 return false;
4827 /* Return TRUE if rtx X is the immediate constant 0.0. */
4828 bool
4829 aarch64_float_const_zero_rtx_p (rtx x)
4831 if (GET_MODE (x) == VOIDmode)
4832 return false;
4834 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4835 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4836 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4839 /* Return TRUE if rtx X is an immediate constant that fits in a single
4840 MOVI immediate operation. */
4841 bool
4842 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4844 if (!TARGET_SIMD)
4845 return false;
4847 machine_mode vmode;
4848 scalar_int_mode imode;
4849 unsigned HOST_WIDE_INT ival;
4851 if (GET_CODE (x) == CONST_DOUBLE
4852 && SCALAR_FLOAT_MODE_P (mode))
4854 if (!aarch64_reinterpret_float_as_int (x, &ival))
4855 return false;
4857 /* We make a general exception for 0. */
4858 if (aarch64_float_const_zero_rtx_p (x))
4859 return true;
4861 imode = int_mode_for_mode (mode).require ();
4863 else if (GET_CODE (x) == CONST_INT
4864 && is_a <scalar_int_mode> (mode, &imode))
4865 ival = INTVAL (x);
4866 else
4867 return false;
4869 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
4870 a 128-bit vector mode. */
4871 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4873 vmode = aarch64_simd_container_mode (imode, width);
4874 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4876 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4880 /* Return the fixed registers used for condition codes. */
4882 static bool
4883 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4885 *p1 = CC_REGNUM;
4886 *p2 = INVALID_REGNUM;
4887 return true;
4890 /* This function is used by the call expanders of the machine description.
4891 RESULT is the register in which the result is returned. It's NULL for
4892 "call" and "sibcall".
4893 MEM is the location of the function call.
4894 SIBCALL indicates whether this function call is a normal call or a sibling call.
4895 A different pattern will be generated accordingly. */
4897 void
4898 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4900 rtx call, callee, tmp;
4901 rtvec vec;
4902 machine_mode mode;
4904 gcc_assert (MEM_P (mem));
4905 callee = XEXP (mem, 0);
4906 mode = GET_MODE (callee);
4907 gcc_assert (mode == Pmode);
4909 /* Decide if we should generate indirect calls by loading the
4910 address of the callee into a register before performing
4911 the branch-and-link. */
4912 if (SYMBOL_REF_P (callee)
4913 ? (aarch64_is_long_call_p (callee)
4914 || aarch64_is_noplt_call_p (callee))
4915 : !REG_P (callee))
4916 XEXP (mem, 0) = force_reg (mode, callee);
4918 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4920 if (result != NULL_RTX)
4921 call = gen_rtx_SET (result, call);
4923 if (sibcall)
4924 tmp = ret_rtx;
4925 else
4926 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4928 vec = gen_rtvec (2, call, tmp);
4929 call = gen_rtx_PARALLEL (VOIDmode, vec);
4931 aarch64_emit_call_insn (call);
4934 /* Emit call insn with PAT and do aarch64-specific handling. */
4936 void
4937 aarch64_emit_call_insn (rtx pat)
4939 rtx insn = emit_call_insn (pat);
4941 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4942 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4943 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4946 machine_mode
4947 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4949 /* All floating point compares return CCFP if it is an equality
4950 comparison, and CCFPE otherwise. */
4951 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4953 switch (code)
4955 case EQ:
4956 case NE:
4957 case UNORDERED:
4958 case ORDERED:
4959 case UNLT:
4960 case UNLE:
4961 case UNGT:
4962 case UNGE:
4963 case UNEQ:
4964 case LTGT:
4965 return CCFPmode;
4967 case LT:
4968 case LE:
4969 case GT:
4970 case GE:
4971 return CCFPEmode;
4973 default:
4974 gcc_unreachable ();
4978 /* Equality comparisons of short modes against zero can be performed
4979 using the TST instruction with the appropriate bitmask. */
4980 if (y == const0_rtx && REG_P (x)
4981 && (code == EQ || code == NE)
4982 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4983 return CC_NZmode;
4985 /* Similarly, comparisons of zero_extends from shorter modes can
4986 be performed using an ANDS with an immediate mask. */
4987 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4988 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4989 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4990 && (code == EQ || code == NE))
4991 return CC_NZmode;
4993 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4994 && y == const0_rtx
4995 && (code == EQ || code == NE || code == LT || code == GE)
4996 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4997 || GET_CODE (x) == NEG
4998 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4999 && CONST_INT_P (XEXP (x, 2)))))
5000 return CC_NZmode;
5002 /* A compare with a shifted operand. Because of canonicalization,
5003 the comparison will have to be swapped when we emit the assembly
5004 code. */
5005 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
5006 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
5007 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
5008 || GET_CODE (x) == LSHIFTRT
5009 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
5010 return CC_SWPmode;
5012 /* Similarly for a negated operand, but we can only do this for
5013 equalities. */
5014 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
5015 && (REG_P (y) || GET_CODE (y) == SUBREG)
5016 && (code == EQ || code == NE)
5017 && GET_CODE (x) == NEG)
5018 return CC_Zmode;
5020 /* A test for unsigned overflow. */
5021 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
5022 && code == NE
5023 && GET_CODE (x) == PLUS
5024 && GET_CODE (y) == ZERO_EXTEND)
5025 return CC_Cmode;
5027 /* For everything else, return CCmode. */
5028 return CCmode;
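/* Illustrative examples of the classification above (not exhaustive):
     (compare (reg:DI a) (reg:DI b)), GTU           -> CCmode
     (compare (plus:DI a b) (const_int 0)), EQ/LT   -> CC_NZmode
     (compare (ashift:SI a n) (reg:SI b))           -> CC_SWPmode, operands
                                                       swapped when printed
     (compare (neg:DI a) (reg:DI b)), EQ            -> CC_Zmode
     FP equality/unordered comparisons              -> CCFPmode
     FP <, <=, >, >=                                -> CCFPEmode  */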
5031 static int
5032 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5035 aarch64_get_condition_code (rtx x)
5037 machine_mode mode = GET_MODE (XEXP (x, 0));
5038 enum rtx_code comp_code = GET_CODE (x);
5040 if (GET_MODE_CLASS (mode) != MODE_CC)
5041 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5042 return aarch64_get_condition_code_1 (mode, comp_code);
5045 static int
5046 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5048 switch (mode)
5050 case E_CCFPmode:
5051 case E_CCFPEmode:
5052 switch (comp_code)
5054 case GE: return AARCH64_GE;
5055 case GT: return AARCH64_GT;
5056 case LE: return AARCH64_LS;
5057 case LT: return AARCH64_MI;
5058 case NE: return AARCH64_NE;
5059 case EQ: return AARCH64_EQ;
5060 case ORDERED: return AARCH64_VC;
5061 case UNORDERED: return AARCH64_VS;
5062 case UNLT: return AARCH64_LT;
5063 case UNLE: return AARCH64_LE;
5064 case UNGT: return AARCH64_HI;
5065 case UNGE: return AARCH64_PL;
5066 default: return -1;
5068 break;
5070 case E_CCmode:
5071 switch (comp_code)
5073 case NE: return AARCH64_NE;
5074 case EQ: return AARCH64_EQ;
5075 case GE: return AARCH64_GE;
5076 case GT: return AARCH64_GT;
5077 case LE: return AARCH64_LE;
5078 case LT: return AARCH64_LT;
5079 case GEU: return AARCH64_CS;
5080 case GTU: return AARCH64_HI;
5081 case LEU: return AARCH64_LS;
5082 case LTU: return AARCH64_CC;
5083 default: return -1;
5085 break;
5087 case E_CC_SWPmode:
5088 switch (comp_code)
5090 case NE: return AARCH64_NE;
5091 case EQ: return AARCH64_EQ;
5092 case GE: return AARCH64_LE;
5093 case GT: return AARCH64_LT;
5094 case LE: return AARCH64_GE;
5095 case LT: return AARCH64_GT;
5096 case GEU: return AARCH64_LS;
5097 case GTU: return AARCH64_CC;
5098 case LEU: return AARCH64_CS;
5099 case LTU: return AARCH64_HI;
5100 default: return -1;
5102 break;
5104 case E_CC_NZmode:
5105 switch (comp_code)
5107 case NE: return AARCH64_NE;
5108 case EQ: return AARCH64_EQ;
5109 case GE: return AARCH64_PL;
5110 case LT: return AARCH64_MI;
5111 default: return -1;
5113 break;
5115 case E_CC_Zmode:
5116 switch (comp_code)
5118 case NE: return AARCH64_NE;
5119 case EQ: return AARCH64_EQ;
5120 default: return -1;
5122 break;
5124 case E_CC_Cmode:
5125 switch (comp_code)
5127 case NE: return AARCH64_CS;
5128 case EQ: return AARCH64_CC;
5129 default: return -1;
5131 break;
5133 default:
5134 return -1;
5137 return -1;
5140 bool
5141 aarch64_const_vec_all_same_in_range_p (rtx x,
5142 HOST_WIDE_INT minval,
5143 HOST_WIDE_INT maxval)
5145 HOST_WIDE_INT firstval;
5146 int count, i;
5148 if (GET_CODE (x) != CONST_VECTOR
5149 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5150 return false;
5152 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5153 if (firstval < minval || firstval > maxval)
5154 return false;
5156 count = CONST_VECTOR_NUNITS (x);
5157 for (i = 1; i < count; i++)
5158 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5159 return false;
5161 return true;
5164 bool
5165 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5167 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5171 /* N Z C V. */
5172 #define AARCH64_CC_V 1
5173 #define AARCH64_CC_C (1 << 1)
5174 #define AARCH64_CC_Z (1 << 2)
5175 #define AARCH64_CC_N (1 << 3)
5177 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5178 static const int aarch64_nzcv_codes[] =
5180 0, /* EQ, Z == 1. */
5181 AARCH64_CC_Z, /* NE, Z == 0. */
5182 0, /* CS, C == 1. */
5183 AARCH64_CC_C, /* CC, C == 0. */
5184 0, /* MI, N == 1. */
5185 AARCH64_CC_N, /* PL, N == 0. */
5186 0, /* VS, V == 1. */
5187 AARCH64_CC_V, /* VC, V == 0. */
5188 0, /* HI, C == 1 && Z == 0. */
5189 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5190 AARCH64_CC_V, /* GE, N == V. */
5191 0, /* LT, N != V. */
5192 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5193 0, /* LE, !(Z == 0 && N == V). */
5194 0, /* AL, Any. */
5195 0 /* NV, Any. */
5198 /* Print operand X to file F in a target specific manner according to CODE.
5199 The acceptable formatting commands given by CODE are:
5200 'c': An integer or symbol address without a preceding #
5201 sign.
5202 'e': Print the sign/zero-extend size as a character 8->b,
5203 16->h, 32->w.
5204 'p': Prints N such that 2^N == X (X must be power of 2 and
5205 const int).
5206 'P': Print the number of non-zero bits in X (a const_int).
5207 'H': Print the higher numbered register of a pair (TImode)
5208 of regs.
5209 'm': Print a condition (eq, ne, etc).
5210 'M': Same as 'm', but invert condition.
5211 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5212 'S/T/U/V': Print a FP/SIMD register name for a register list.
5213 The register printed is the FP/SIMD register name
5214 of X + 0/1/2/3 for S/T/U/V.
5215 'R': Print a scalar FP/SIMD register name + 1.
5216 'X': Print bottom 16 bits of integer constant in hex.
5217 'w/x': Print a general register name or the zero register
5218 (32-bit or 64-bit).
5219 '0': Print a normal operand, if it's a general register,
5220 then we assume DImode.
5221 'k': Print NZCV for conditional compare instructions.
5222 'A': Output address constant representing the first
5223 argument of X, specifying a relocation offset
5224 if appropriate.
5225 'L': Output constant address specified by X
5226 with a relocation offset if appropriate.
5227 'G': Prints address of X, specifying a PC relative
5228 relocation mode if appropriate. */
5230 static void
5231 aarch64_print_operand (FILE *f, rtx x, int code)
5233 switch (code)
5235 case 'c':
5236 switch (GET_CODE (x))
5238 case CONST_INT:
5239 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5240 break;
5242 case SYMBOL_REF:
5243 output_addr_const (f, x);
5244 break;
5246 case CONST:
5247 if (GET_CODE (XEXP (x, 0)) == PLUS
5248 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5250 output_addr_const (f, x);
5251 break;
5253 /* Fall through. */
5255 default:
5256 output_operand_lossage ("Unsupported operand for code '%c'", code);
5258 break;
5260 case 'e':
5262 int n;
5264 if (!CONST_INT_P (x)
5265 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5267 output_operand_lossage ("invalid operand for '%%%c'", code);
5268 return;
5271 switch (n)
5273 case 3:
5274 fputc ('b', f);
5275 break;
5276 case 4:
5277 fputc ('h', f);
5278 break;
5279 case 5:
5280 fputc ('w', f);
5281 break;
5282 default:
5283 output_operand_lossage ("invalid operand for '%%%c'", code);
5284 return;
5287 break;
5289 case 'p':
5291 int n;
5293 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5295 output_operand_lossage ("invalid operand for '%%%c'", code);
5296 return;
5299 asm_fprintf (f, "%d", n);
5301 break;
5303 case 'P':
5304 if (!CONST_INT_P (x))
5306 output_operand_lossage ("invalid operand for '%%%c'", code);
5307 return;
5310 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5311 break;
5313 case 'H':
5314 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5316 output_operand_lossage ("invalid operand for '%%%c'", code);
5317 return;
5320 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5321 break;
5323 case 'M':
5324 case 'm':
5326 int cond_code;
5327 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5328 if (x == const_true_rtx)
5330 if (code == 'M')
5331 fputs ("nv", f);
5332 return;
5335 if (!COMPARISON_P (x))
5337 output_operand_lossage ("invalid operand for '%%%c'", code);
5338 return;
5341 cond_code = aarch64_get_condition_code (x);
5342 gcc_assert (cond_code >= 0);
5343 if (code == 'M')
5344 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5345 fputs (aarch64_condition_codes[cond_code], f);
5347 break;
5349 case 'b':
5350 case 'h':
5351 case 's':
5352 case 'd':
5353 case 'q':
5354 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5356 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5357 return;
5359 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5360 break;
5362 case 'S':
5363 case 'T':
5364 case 'U':
5365 case 'V':
5366 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5368 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5369 return;
5371 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5372 break;
5374 case 'R':
5375 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5377 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5378 return;
5380 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5381 break;
5383 case 'X':
5384 if (!CONST_INT_P (x))
5386 output_operand_lossage ("invalid operand for '%%%c'", code);
5387 return;
5389 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5390 break;
5392 case 'w':
5393 case 'x':
5394 if (x == const0_rtx
5395 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5397 asm_fprintf (f, "%czr", code);
5398 break;
5401 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5403 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5404 break;
5407 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5409 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5410 break;
5413 /* Fall through */
5415 case 0:
5416 if (x == NULL)
5418 output_operand_lossage ("missing operand");
5419 return;
5422 switch (GET_CODE (x))
5424 case REG:
5425 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5426 break;
5428 case MEM:
5429 output_address (GET_MODE (x), XEXP (x, 0));
5430 /* Check all memory references are Pmode - even with ILP32. */
5431 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5432 break;
5434 case CONST:
5435 case LABEL_REF:
5436 case SYMBOL_REF:
5437 output_addr_const (asm_out_file, x);
5438 break;
5440 case CONST_INT:
5441 asm_fprintf (f, "%wd", INTVAL (x));
5442 break;
5444 case CONST_VECTOR:
5445 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5447 gcc_assert (
5448 aarch64_const_vec_all_same_in_range_p (x,
5449 HOST_WIDE_INT_MIN,
5450 HOST_WIDE_INT_MAX));
5451 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5453 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5455 fputc ('0', f);
5457 else
5458 gcc_unreachable ();
5459 break;
5461 case CONST_DOUBLE:
5462 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5463 be getting CONST_DOUBLEs holding integers. */
5464 gcc_assert (GET_MODE (x) != VOIDmode);
5465 if (aarch64_float_const_zero_rtx_p (x))
5467 fputc ('0', f);
5468 break;
5470 else if (aarch64_float_const_representable_p (x))
5472 #define buf_size 20
5473 char float_buf[buf_size] = {'\0'};
5474 real_to_decimal_for_mode (float_buf,
5475 CONST_DOUBLE_REAL_VALUE (x),
5476 buf_size, buf_size,
5477 1, GET_MODE (x));
5478 asm_fprintf (asm_out_file, "%s", float_buf);
5479 break;
5480 #undef buf_size
5482 output_operand_lossage ("invalid constant");
5483 return;
5484 default:
5485 output_operand_lossage ("invalid operand");
5486 return;
5488 break;
5490 case 'A':
5491 if (GET_CODE (x) == HIGH)
5492 x = XEXP (x, 0);
5494 switch (aarch64_classify_symbolic_expression (x))
5496 case SYMBOL_SMALL_GOT_4G:
5497 asm_fprintf (asm_out_file, ":got:");
5498 break;
5500 case SYMBOL_SMALL_TLSGD:
5501 asm_fprintf (asm_out_file, ":tlsgd:");
5502 break;
5504 case SYMBOL_SMALL_TLSDESC:
5505 asm_fprintf (asm_out_file, ":tlsdesc:");
5506 break;
5508 case SYMBOL_SMALL_TLSIE:
5509 asm_fprintf (asm_out_file, ":gottprel:");
5510 break;
5512 case SYMBOL_TLSLE24:
5513 asm_fprintf (asm_out_file, ":tprel:");
5514 break;
5516 case SYMBOL_TINY_GOT:
5517 gcc_unreachable ();
5518 break;
5520 default:
5521 break;
5523 output_addr_const (asm_out_file, x);
5524 break;
5526 case 'L':
5527 switch (aarch64_classify_symbolic_expression (x))
5529 case SYMBOL_SMALL_GOT_4G:
5530 asm_fprintf (asm_out_file, ":lo12:");
5531 break;
5533 case SYMBOL_SMALL_TLSGD:
5534 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5535 break;
5537 case SYMBOL_SMALL_TLSDESC:
5538 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5539 break;
5541 case SYMBOL_SMALL_TLSIE:
5542 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5543 break;
5545 case SYMBOL_TLSLE12:
5546 asm_fprintf (asm_out_file, ":tprel_lo12:");
5547 break;
5549 case SYMBOL_TLSLE24:
5550 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5551 break;
5553 case SYMBOL_TINY_GOT:
5554 asm_fprintf (asm_out_file, ":got:");
5555 break;
5557 case SYMBOL_TINY_TLSIE:
5558 asm_fprintf (asm_out_file, ":gottprel:");
5559 break;
5561 default:
5562 break;
5564 output_addr_const (asm_out_file, x);
5565 break;
5567 case 'G':
5568 switch (aarch64_classify_symbolic_expression (x))
5570 case SYMBOL_TLSLE24:
5571 asm_fprintf (asm_out_file, ":tprel_hi12:");
5572 break;
5573 default:
5574 break;
5576 output_addr_const (asm_out_file, x);
5577 break;
5579 case 'k':
5581 HOST_WIDE_INT cond_code;
5583 if (!CONST_INT_P (x))
5585 output_operand_lossage ("invalid operand for '%%%c'", code);
5586 return;
5589 cond_code = INTVAL (x);
5590 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5591 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5593 break;
5595 default:
5596 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5597 return;
5601 static void
5602 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5604 struct aarch64_address_info addr;
5606 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5607 switch (addr.type)
5609 case ADDRESS_REG_IMM:
5610 if (addr.offset == const0_rtx)
5611 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5612 else
5613 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5614 INTVAL (addr.offset));
5615 return;
5617 case ADDRESS_REG_REG:
5618 if (addr.shift == 0)
5619 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5620 reg_names [REGNO (addr.offset)]);
5621 else
5622 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5623 reg_names [REGNO (addr.offset)], addr.shift);
5624 return;
5626 case ADDRESS_REG_UXTW:
5627 if (addr.shift == 0)
5628 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5629 REGNO (addr.offset) - R0_REGNUM);
5630 else
5631 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5632 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5633 return;
5635 case ADDRESS_REG_SXTW:
5636 if (addr.shift == 0)
5637 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5638 REGNO (addr.offset) - R0_REGNUM);
5639 else
5640 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5641 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5642 return;
5644 case ADDRESS_REG_WB:
5645 switch (GET_CODE (x))
5647 case PRE_INC:
5648 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5649 GET_MODE_SIZE (mode));
5650 return;
5651 case POST_INC:
5652 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5653 GET_MODE_SIZE (mode));
5654 return;
5655 case PRE_DEC:
5656 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5657 GET_MODE_SIZE (mode));
5658 return;
5659 case POST_DEC:
5660 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5661 GET_MODE_SIZE (mode));
5662 return;
5663 case PRE_MODIFY:
5664 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5665 INTVAL (addr.offset));
5666 return;
5667 case POST_MODIFY:
5668 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5669 INTVAL (addr.offset));
5670 return;
5671 default:
5672 break;
5674 break;
5676 case ADDRESS_LO_SUM:
5677 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5678 output_addr_const (f, addr.offset);
5679 asm_fprintf (f, "]");
5680 return;
5682 case ADDRESS_SYMBOLIC:
5683 break;
5686 output_addr_const (f, x);
5689 bool
5690 aarch64_label_mentioned_p (rtx x)
5692 const char *fmt;
5693 int i;
5695 if (GET_CODE (x) == LABEL_REF)
5696 return true;
5698 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5699 referencing instruction, but they are constant offsets, not
5700 symbols. */
5701 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5702 return false;
5704 fmt = GET_RTX_FORMAT (GET_CODE (x));
5705 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5707 if (fmt[i] == 'E')
5709 int j;
5711 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5712 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5713 return 1;
5715 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5716 return 1;
5719 return 0;
5722 /* Implement REGNO_REG_CLASS. */
5724 enum reg_class
5725 aarch64_regno_regclass (unsigned regno)
5727 if (GP_REGNUM_P (regno))
5728 return GENERAL_REGS;
5730 if (regno == SP_REGNUM)
5731 return STACK_REG;
5733 if (regno == FRAME_POINTER_REGNUM
5734 || regno == ARG_POINTER_REGNUM)
5735 return POINTER_REGS;
5737 if (FP_REGNUM_P (regno))
5738 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5740 return NO_REGS;
5743 static rtx
5744 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5746 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5747 where mask is selected by alignment and size of the offset.
5748 We try to pick as large a range for the offset as possible to
5749 maximize the chance of a CSE. However, for aligned addresses
5750 we limit the range to 4k so that structures with different-sized
5751 elements are likely to use the same base. We need to be careful
5752 not to split a CONST for some forms of address expression, otherwise
5753 it will generate sub-optimal code. */
5755 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5757 rtx base = XEXP (x, 0);
5758 rtx offset_rtx = XEXP (x, 1);
5759 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5761 if (GET_CODE (base) == PLUS)
5763 rtx op0 = XEXP (base, 0);
5764 rtx op1 = XEXP (base, 1);
5766 /* Force any scaling into a temp for CSE. */
5767 op0 = force_reg (Pmode, op0);
5768 op1 = force_reg (Pmode, op1);
5770 /* Let the pointer register be in op0. */
5771 if (REG_POINTER (op1))
5772 std::swap (op0, op1);
5774 /* If the pointer is virtual or frame related, then we know that
5775 virtual register instantiation or register elimination is going
5776 to apply a second constant. We want the two constants folded
5777 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5778 if (virt_or_elim_regno_p (REGNO (op0)))
5780 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5781 NULL_RTX, true, OPTAB_DIRECT);
5782 return gen_rtx_PLUS (Pmode, base, op1);
5785 /* Otherwise, in order to encourage CSE (and thence loop strength
5786 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5787 base = expand_binop (Pmode, add_optab, op0, op1,
5788 NULL_RTX, true, OPTAB_DIRECT);
5789 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5792 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5793 HOST_WIDE_INT base_offset;
5794 if (GET_MODE_SIZE (mode) > 16)
5795 base_offset = (offset + 0x400) & ~0x7f0;
5796 /* For offsets that aren't a multiple of the access size, the limit is
5797 -256...255. */
5798 else if (offset & (GET_MODE_SIZE (mode) - 1))
5800 base_offset = (offset + 0x100) & ~0x1ff;
5802 /* BLKmode typically uses LDP of X-registers. */
5803 if (mode == BLKmode)
5804 base_offset = (offset + 512) & ~0x3ff;
5806 /* Small negative offsets are supported. */
5807 else if (IN_RANGE (offset, -256, 0))
5808 base_offset = 0;
5809 else if (mode == TImode || mode == TFmode)
5810 base_offset = (offset + 0x100) & ~0x1ff;
5811 /* Use a 12-bit offset scaled by the access size. */
5812 else
5813 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5815 if (base_offset != 0)
5817 base = plus_constant (Pmode, base, base_offset);
5818 base = force_operand (base, NULL_RTX);
5819 return plus_constant (Pmode, base, offset - base_offset);
5823 return x;
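/* Worked example (illustrative): legitimizing (plus (reg X) (const_int
   0x10008)) for a DFmode access takes the final branch above, giving
     base_offset = 0x10008 & (~0xfff * 8) = 0x10000
   so the address becomes
     tmp = X + 0x10000
     [tmp, 8]
   and any double within X + 0x10000 .. X + 0x17ff8 can reuse TMP as its
   base, improving CSE of the anchor.  */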
5826 /* Return the reload icode required for a constant pool in mode. */
5827 static enum insn_code
5828 aarch64_constant_pool_reload_icode (machine_mode mode)
5830 switch (mode)
5832 case E_SFmode:
5833 return CODE_FOR_aarch64_reload_movcpsfdi;
5835 case E_DFmode:
5836 return CODE_FOR_aarch64_reload_movcpdfdi;
5838 case E_TFmode:
5839 return CODE_FOR_aarch64_reload_movcptfdi;
5841 case E_V8QImode:
5842 return CODE_FOR_aarch64_reload_movcpv8qidi;
5844 case E_V16QImode:
5845 return CODE_FOR_aarch64_reload_movcpv16qidi;
5847 case E_V4HImode:
5848 return CODE_FOR_aarch64_reload_movcpv4hidi;
5850 case E_V8HImode:
5851 return CODE_FOR_aarch64_reload_movcpv8hidi;
5853 case E_V2SImode:
5854 return CODE_FOR_aarch64_reload_movcpv2sidi;
5856 case E_V4SImode:
5857 return CODE_FOR_aarch64_reload_movcpv4sidi;
5859 case E_V2DImode:
5860 return CODE_FOR_aarch64_reload_movcpv2didi;
5862 case E_V2DFmode:
5863 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5865 default:
5866 gcc_unreachable ();
5869 gcc_unreachable ();
5871 static reg_class_t
5872 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5873 reg_class_t rclass,
5874 machine_mode mode,
5875 secondary_reload_info *sri)
5878 /* If we have to disable direct literal pool loads and stores because the
5879 function is too big, then we need a scratch register. */
5880 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5881 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5882 || targetm.vector_mode_supported_p (GET_MODE (x)))
5883 && !aarch64_pcrelative_literal_loads)
5885 sri->icode = aarch64_constant_pool_reload_icode (mode);
5886 return NO_REGS;
5889 /* Without the TARGET_SIMD instructions we cannot move a Q register
5890 to a Q register directly. We need a scratch. */
5891 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5892 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5893 && reg_class_subset_p (rclass, FP_REGS))
5895 if (mode == TFmode)
5896 sri->icode = CODE_FOR_aarch64_reload_movtf;
5897 else if (mode == TImode)
5898 sri->icode = CODE_FOR_aarch64_reload_movti;
5899 return NO_REGS;
5902 /* A TFmode or TImode memory access should be handled via an FP_REG
5903 because AArch64 has richer addressing modes for LDR/STR instructions
5904 than LDP/STP instructions. */
5905 if (TARGET_FLOAT && rclass == GENERAL_REGS
5906 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5907 return FP_REGS;
5909 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5910 return GENERAL_REGS;
5912 return NO_REGS;
5915 static bool
5916 aarch64_can_eliminate (const int from, const int to)
5918 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5919 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5921 if (frame_pointer_needed)
5923 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5924 return true;
5925 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5926 return false;
5927 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5928 && !cfun->calls_alloca)
5929 return true;
5930 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5931 return true;
5933 return false;
5936 return true;
5939 HOST_WIDE_INT
5940 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5942 aarch64_layout_frame ();
5944 if (to == HARD_FRAME_POINTER_REGNUM)
5946 if (from == ARG_POINTER_REGNUM)
5947 return cfun->machine->frame.hard_fp_offset;
5949 if (from == FRAME_POINTER_REGNUM)
5950 return cfun->machine->frame.hard_fp_offset
5951 - cfun->machine->frame.locals_offset;
5954 if (to == STACK_POINTER_REGNUM)
5956 if (from == FRAME_POINTER_REGNUM)
5957 return cfun->machine->frame.frame_size
5958 - cfun->machine->frame.locals_offset;
5961 return cfun->machine->frame.frame_size;
5964 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5965 previous frame. */
5968 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5970 if (count != 0)
5971 return const0_rtx;
5972 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5976 static void
5977 aarch64_asm_trampoline_template (FILE *f)
5979 if (TARGET_ILP32)
5981 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5982 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5984 else
5986 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5987 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5989 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5990 assemble_aligned_integer (4, const0_rtx);
5991 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5992 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
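/* A rough sketch of the resulting trampoline layout in the LP64 case,
   assuming IP1 is x17 and the static chain register is x18 (the exact
   register numbers depend on the target headers):

     0:  ldr   x17, .+16      ; load the target function address
     4:  ldr   x18, .+20      ; load the static chain value
     8:  br    x17
     12: .word  0             ; padding up to 16 bytes of code
     16: .dword <fnaddr>      ; filled in by aarch64_trampoline_init
     24: .dword <chain>       ; likewise

   The two literal slots are the ones written below at offsets
   tramp_code_sz and tramp_code_sz + POINTER_BYTES.  */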
5995 static void
5996 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5998 rtx fnaddr, mem, a_tramp;
5999 const int tramp_code_sz = 16;
6001 /* We don't need to copy the trailing D-words; we fill those in below. */
6002 emit_block_move (m_tramp, assemble_trampoline_template (),
6003 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
6004 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
6005 fnaddr = XEXP (DECL_RTL (fndecl), 0);
6006 if (GET_MODE (fnaddr) != ptr_mode)
6007 fnaddr = convert_memory_address (ptr_mode, fnaddr);
6008 emit_move_insn (mem, fnaddr);
6010 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
6011 emit_move_insn (mem, chain_value);
6013 /* XXX We should really define a "clear_cache" pattern and use
6014 gen_clear_cache(). */
6015 a_tramp = XEXP (m_tramp, 0);
6016 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
6017 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
6018 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6019 ptr_mode);
6022 static unsigned char
6023 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6025 switch (regclass)
6027 case CALLER_SAVE_REGS:
6028 case POINTER_REGS:
6029 case GENERAL_REGS:
6030 case ALL_REGS:
6031 case POINTER_AND_FP_REGS:
6032 case FP_REGS:
6033 case FP_LO_REGS:
6034 return
6035 aarch64_vector_mode_p (mode)
6036 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6037 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6038 case STACK_REG:
6039 return 1;
6041 case NO_REGS:
6042 return 0;
6044 default:
6045 break;
6047 gcc_unreachable ();
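/* Worked examples for aarch64_class_max_nregs above (illustrative only,
   assuming UNITS_PER_VREG == 16 and UNITS_PER_WORD == 8 as on AArch64):
   V4SImode in FP_REGS is a 16-byte vector mode, so it needs one
   V-register; TImode in GENERAL_REGS is not a vector mode, so its 16
   bytes need two X-registers; any mode in STACK_REG needs exactly one
   register (SP).  */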
6050 static reg_class_t
6051 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6053 if (regclass == POINTER_REGS)
6054 return GENERAL_REGS;
6056 if (regclass == STACK_REG)
6058 if (REG_P(x)
6059 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6060 return regclass;
6062 return NO_REGS;
6065 /* Register elimination can result in a request for
6066 SP+constant->FP_REGS. We cannot support such operations, which
6067 use SP as the source and an FP_REG as the destination, so reject
6068 them outright. */
6069 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6071 rtx lhs = XEXP (x, 0);
6073 /* Look through a possible SUBREG introduced by ILP32. */
6074 if (GET_CODE (lhs) == SUBREG)
6075 lhs = SUBREG_REG (lhs);
6077 gcc_assert (REG_P (lhs));
6078 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6079 POINTER_REGS));
6080 return NO_REGS;
6083 return regclass;
6086 void
6087 aarch64_asm_output_labelref (FILE* f, const char *name)
6089 asm_fprintf (f, "%U%s", name);
6092 static void
6093 aarch64_elf_asm_constructor (rtx symbol, int priority)
6095 if (priority == DEFAULT_INIT_PRIORITY)
6096 default_ctor_section_asm_out_constructor (symbol, priority);
6097 else
6099 section *s;
6100 /* The priority is known to be in the range [0, 65535], so 18 bytes
6101 would be enough, but the compiler might not know that. To avoid
6102 a -Wformat-truncation false positive, use a larger size. */
6103 char buf[23];
6104 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6105 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6106 switch_to_section (s);
6107 assemble_align (POINTER_SIZE);
6108 assemble_aligned_integer (POINTER_BYTES, symbol);
6112 static void
6113 aarch64_elf_asm_destructor (rtx symbol, int priority)
6115 if (priority == DEFAULT_INIT_PRIORITY)
6116 default_dtor_section_asm_out_destructor (symbol, priority);
6117 else
6119 section *s;
6120 /* The priority is known to be in the range [0, 65535], so 18 bytes
6121 would be enough, but the compiler might not know that. To avoid
6122 a -Wformat-truncation false positive, use a larger size. */
6123 char buf[23];
6124 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6125 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6126 switch_to_section (s);
6127 assemble_align (POINTER_SIZE);
6128 assemble_aligned_integer (POINTER_BYTES, symbol);
6132 const char*
6133 aarch64_output_casesi (rtx *operands)
6135 char buf[100];
6136 char label[100];
6137 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6138 int index;
6139 static const char *const patterns[4][2] =
6142 "ldrb\t%w3, [%0,%w1,uxtw]",
6143 "add\t%3, %4, %w3, sxtb #2"
6146 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6147 "add\t%3, %4, %w3, sxth #2"
6150 "ldr\t%w3, [%0,%w1,uxtw #2]",
6151 "add\t%3, %4, %w3, sxtw #2"
6153 /* We assume that DImode is only generated when not optimizing and
6154 that we don't really need 64-bit address offsets. That would
6155 imply an object file with 8GB of code in a single function! */
6157 "ldr\t%w3, [%0,%w1,uxtw #2]",
6158 "add\t%3, %4, %w3, sxtw #2"
6162 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6164 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6165 index = exact_log2 (GET_MODE_SIZE (mode));
6167 gcc_assert (index >= 0 && index <= 3);
6169 /* Need to implement table size reduction, by changing the code below. */
6170 output_asm_insn (patterns[index][0], operands);
6171 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6172 snprintf (buf, sizeof (buf),
6173 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6174 output_asm_insn (buf, operands);
6175 output_asm_insn (patterns[index][1], operands);
6176 output_asm_insn ("br\t%3", operands);
6177 assemble_label (asm_out_file, label);
6178 return "";
6182 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6183 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6184 operator. */
6187 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6189 if (shift >= 0 && shift <= 3)
6191 int size;
6192 for (size = 8; size <= 32; size *= 2)
6194 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6195 if (mask == bits << shift)
6196 return size;
6199 return 0;
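/* Illustrative uses of aarch64_uxt_size (not from the original source):
   aarch64_uxt_size (2, 0x3fc) returns 8, since 0x3fc == 0xff << 2, i.e.
   a byte zero-extended and shifted left by 2 (UXTB with LSL #2);
   aarch64_uxt_size (1, 0x1fffe) returns 16 (UXTH, LSL #1); a mask such
   as 0xff0 with shift 0 matches no extend width and yields 0.  */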
6202 /* Constant pools are per function only when PC relative
6203 literal loads are true or we are in the large memory
6204 model. */
6206 static inline bool
6207 aarch64_can_use_per_function_literal_pools_p (void)
6209 return (aarch64_pcrelative_literal_loads
6210 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6213 static bool
6214 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6216 /* FIXME: In an ideal world this would work similarly
6217 to the logic in aarch64_select_rtx_section, but this
6218 breaks bootstrap in gccgo. For now we work around
6219 this by returning false here. */
6220 return false;
6223 /* Select appropriate section for constants depending
6224 on where we place literal pools. */
6226 static section *
6227 aarch64_select_rtx_section (machine_mode mode,
6228 rtx x,
6229 unsigned HOST_WIDE_INT align)
6231 if (aarch64_can_use_per_function_literal_pools_p ())
6232 return function_section (current_function_decl);
6234 return default_elf_select_rtx_section (mode, x, align);
6237 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6238 void
6239 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6240 HOST_WIDE_INT offset)
6242 /* When using per-function literal pools, we must ensure that any code
6243 section is aligned to the minimal instruction length, lest we get
6244 errors from the assembler re "unaligned instructions". */
6245 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6246 ASM_OUTPUT_ALIGN (f, 2);
6249 /* Costs. */
6251 /* Helper function for rtx cost calculation. Strip a shift expression
6252 from X. Returns the inner operand if successful, or the original
6253 expression on failure. */
6254 static rtx
6255 aarch64_strip_shift (rtx x)
6257 rtx op = x;
6259 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6260 we can convert both to ROR during final output. */
6261 if ((GET_CODE (op) == ASHIFT
6262 || GET_CODE (op) == ASHIFTRT
6263 || GET_CODE (op) == LSHIFTRT
6264 || GET_CODE (op) == ROTATERT
6265 || GET_CODE (op) == ROTATE)
6266 && CONST_INT_P (XEXP (op, 1)))
6267 return XEXP (op, 0);
6269 if (GET_CODE (op) == MULT
6270 && CONST_INT_P (XEXP (op, 1))
6271 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6272 return XEXP (op, 0);
6274 return x;
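/* Examples (illustrative): aarch64_strip_shift on
   (ashift (reg x1) (const_int 3)) or on (mult (reg x1) (const_int 8))
   returns (reg x1), since a multiply by a power of two is costed as a
   shift; (mult (reg x1) (reg x2)) is returned unchanged.  */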
6277 /* Helper function for rtx cost calculation. Strip an extend
6278 expression from X. Returns the inner operand if successful, or the
6279 original expression on failure. We deal with a number of possible
6280 canonicalization variations here. If STRIP_SHIFT is true, then
6281 we can strip off a shift also. */
6282 static rtx
6283 aarch64_strip_extend (rtx x, bool strip_shift)
6285 scalar_int_mode mode;
6286 rtx op = x;
6288 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6289 return op;
6291 /* Zero and sign extraction of a widened value. */
6292 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6293 && XEXP (op, 2) == const0_rtx
6294 && GET_CODE (XEXP (op, 0)) == MULT
6295 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6296 XEXP (op, 1)))
6297 return XEXP (XEXP (op, 0), 0);
6299 /* It can also be represented (for zero-extend) as an AND with an
6300 immediate. */
6301 if (GET_CODE (op) == AND
6302 && GET_CODE (XEXP (op, 0)) == MULT
6303 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6304 && CONST_INT_P (XEXP (op, 1))
6305 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6306 INTVAL (XEXP (op, 1))) != 0)
6307 return XEXP (XEXP (op, 0), 0);
6309 /* Now handle extended register, as this may also have an optional
6310 left shift by 1..4. */
6311 if (strip_shift
6312 && GET_CODE (op) == ASHIFT
6313 && CONST_INT_P (XEXP (op, 1))
6314 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6315 op = XEXP (op, 0);
6317 if (GET_CODE (op) == ZERO_EXTEND
6318 || GET_CODE (op) == SIGN_EXTEND)
6319 op = XEXP (op, 0);
6321 if (op != x)
6322 return op;
6324 return x;
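/* Examples (illustrative): with STRIP_SHIFT true,
   (ashift (zero_extend (reg w1)) (const_int 2)) strips down to the
   inner register, matching an extended-register operand such as
   "uxtw #2"; the AND form
   (and (mult (reg x1) (const_int 4)) (const_int 0x3fc)) also strips to
   (reg x1), because aarch64_uxt_size recognises the mask as a byte
   zero-extend scaled by 4.  */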
6327 /* Return true iff CODE is a shift supported in combination
6328 with arithmetic instructions. */
6330 static bool
6331 aarch64_shift_p (enum rtx_code code)
6333 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6337 /* Return true iff X is a cheap shift without a sign extend. */
6339 static bool
6340 aarch64_cheap_mult_shift_p (rtx x)
6342 rtx op0, op1;
6344 op0 = XEXP (x, 0);
6345 op1 = XEXP (x, 1);
6347 if (!(aarch64_tune_params.extra_tuning_flags
6348 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6349 return false;
6351 if (GET_CODE (op0) == SIGN_EXTEND)
6352 return false;
6354 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6355 && UINTVAL (op1) <= 4)
6356 return true;
6358 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6359 return false;
6361 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6363 if (l2 > 0 && l2 <= 4)
6364 return true;
6366 return false;
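/* Examples (illustrative): when the tuning flags include
   AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, (ashift (reg x1) (const_int 3))
   and (mult (reg x1) (const_int 16)) are considered cheap (shift amounts
   3 and log2(16) == 4 lie within 1..4), whereas
   (mult (reg x1) (const_int 32)) is not, and neither is any form whose
   first operand is a SIGN_EXTEND.  */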
6369 /* Helper function for rtx cost calculation. Calculate the cost of
6370 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6371 Return the calculated cost of the expression, recursing manually in to
6372 operands where needed. */
6374 static int
6375 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6377 rtx op0, op1;
6378 const struct cpu_cost_table *extra_cost
6379 = aarch64_tune_params.insn_extra_cost;
6380 int cost = 0;
6381 bool compound_p = (outer == PLUS || outer == MINUS);
6382 machine_mode mode = GET_MODE (x);
6384 gcc_checking_assert (code == MULT);
6386 op0 = XEXP (x, 0);
6387 op1 = XEXP (x, 1);
6389 if (VECTOR_MODE_P (mode))
6390 mode = GET_MODE_INNER (mode);
6392 /* Integer multiply/fma. */
6393 if (GET_MODE_CLASS (mode) == MODE_INT)
6395 /* The multiply will be canonicalized as a shift, cost it as such. */
6396 if (aarch64_shift_p (GET_CODE (x))
6397 || (CONST_INT_P (op1)
6398 && exact_log2 (INTVAL (op1)) > 0))
6400 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6401 || GET_CODE (op0) == SIGN_EXTEND;
6402 if (speed)
6404 if (compound_p)
6406 /* If the shift is considered cheap,
6407 then don't add any cost. */
6408 if (aarch64_cheap_mult_shift_p (x))
6410 else if (REG_P (op1))
6411 /* ARITH + shift-by-register. */
6412 cost += extra_cost->alu.arith_shift_reg;
6413 else if (is_extend)
6414 /* ARITH + extended register. We don't have a cost field
6415 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6416 cost += extra_cost->alu.extend_arith;
6417 else
6418 /* ARITH + shift-by-immediate. */
6419 cost += extra_cost->alu.arith_shift;
6421 else
6422 /* LSL (immediate). */
6423 cost += extra_cost->alu.shift;
6426 /* Strip extends as we will have costed them in the case above. */
6427 if (is_extend)
6428 op0 = aarch64_strip_extend (op0, true);
6430 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6432 return cost;
6435 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6436 compound, and let the cases below handle it. After all, MNEG is a
6437 special-case alias of MSUB. */
6438 if (GET_CODE (op0) == NEG)
6440 op0 = XEXP (op0, 0);
6441 compound_p = true;
6444 /* Integer multiplies or FMAs have zero/sign extending variants. */
6445 if ((GET_CODE (op0) == ZERO_EXTEND
6446 && GET_CODE (op1) == ZERO_EXTEND)
6447 || (GET_CODE (op0) == SIGN_EXTEND
6448 && GET_CODE (op1) == SIGN_EXTEND))
6450 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6451 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6453 if (speed)
6455 if (compound_p)
6456 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6457 cost += extra_cost->mult[0].extend_add;
6458 else
6459 /* MUL/SMULL/UMULL. */
6460 cost += extra_cost->mult[0].extend;
6463 return cost;
6466 /* This is either an integer multiply or a MADD. In both cases
6467 we want to recurse and cost the operands. */
6468 cost += rtx_cost (op0, mode, MULT, 0, speed);
6469 cost += rtx_cost (op1, mode, MULT, 1, speed);
6471 if (speed)
6473 if (compound_p)
6474 /* MADD/MSUB. */
6475 cost += extra_cost->mult[mode == DImode].add;
6476 else
6477 /* MUL. */
6478 cost += extra_cost->mult[mode == DImode].simple;
6481 return cost;
6483 else
6485 if (speed)
6487 /* Floating-point FMA/FMUL can also support negations of the
6488 operands, unless the rounding mode is upward or downward, in
6489 which case FNMUL is different from FMUL with operand negation. */
6490 bool neg0 = GET_CODE (op0) == NEG;
6491 bool neg1 = GET_CODE (op1) == NEG;
6492 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6494 if (neg0)
6495 op0 = XEXP (op0, 0);
6496 if (neg1)
6497 op1 = XEXP (op1, 0);
6500 if (compound_p)
6501 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6502 cost += extra_cost->fp[mode == DFmode].fma;
6503 else
6504 /* FMUL/FNMUL. */
6505 cost += extra_cost->fp[mode == DFmode].mult;
6508 cost += rtx_cost (op0, mode, MULT, 0, speed);
6509 cost += rtx_cost (op1, mode, MULT, 1, speed);
6510 return cost;
6514 static int
6515 aarch64_address_cost (rtx x,
6516 machine_mode mode,
6517 addr_space_t as ATTRIBUTE_UNUSED,
6518 bool speed)
6520 enum rtx_code c = GET_CODE (x);
6521 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6522 struct aarch64_address_info info;
6523 int cost = 0;
6524 info.shift = 0;
6526 if (!aarch64_classify_address (&info, x, mode, c, false))
6528 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6530 /* This is a CONST or SYMBOL ref which will be split
6531 in a different way depending on the code model in use.
6532 Cost it through the generic infrastructure. */
6533 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6534 /* Divide through by the cost of one instruction to
6535 bring it to the same units as the address costs. */
6536 cost_symbol_ref /= COSTS_N_INSNS (1);
6537 /* The cost is then the cost of preparing the address,
6538 followed by an immediate (possibly 0) offset. */
6539 return cost_symbol_ref + addr_cost->imm_offset;
6541 else
6543 /* This is most likely a jump table from a case
6544 statement. */
6545 return addr_cost->register_offset;
6549 switch (info.type)
6551 case ADDRESS_LO_SUM:
6552 case ADDRESS_SYMBOLIC:
6553 case ADDRESS_REG_IMM:
6554 cost += addr_cost->imm_offset;
6555 break;
6557 case ADDRESS_REG_WB:
6558 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6559 cost += addr_cost->pre_modify;
6560 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6561 cost += addr_cost->post_modify;
6562 else
6563 gcc_unreachable ();
6565 break;
6567 case ADDRESS_REG_REG:
6568 cost += addr_cost->register_offset;
6569 break;
6571 case ADDRESS_REG_SXTW:
6572 cost += addr_cost->register_sextend;
6573 break;
6575 case ADDRESS_REG_UXTW:
6576 cost += addr_cost->register_zextend;
6577 break;
6579 default:
6580 gcc_unreachable ();
6584 if (info.shift > 0)
6586 /* For the sake of calculating the cost of the shifted register
6587 component, we can treat same sized modes in the same way. */
6588 switch (GET_MODE_BITSIZE (mode))
6590 case 16:
6591 cost += addr_cost->addr_scale_costs.hi;
6592 break;
6594 case 32:
6595 cost += addr_cost->addr_scale_costs.si;
6596 break;
6598 case 64:
6599 cost += addr_cost->addr_scale_costs.di;
6600 break;
6602 /* We can't tell, or this is a 128-bit vector. */
6603 default:
6604 cost += addr_cost->addr_scale_costs.ti;
6605 break;
6609 return cost;
6612 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6613 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6614 to be taken. */
6617 aarch64_branch_cost (bool speed_p, bool predictable_p)
6619 /* When optimizing for speed, use the cost of unpredictable branches. */
6620 const struct cpu_branch_cost *branch_costs =
6621 aarch64_tune_params.branch_costs;
6623 if (!speed_p || predictable_p)
6624 return branch_costs->predictable;
6625 else
6626 return branch_costs->unpredictable;
6629 /* Return true if the RTX X in mode MODE is a zero or sign extract
6630 usable in an ADD or SUB (extended register) instruction. */
6631 static bool
6632 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6634 /* Catch add with a sign extract.
6635 This is add_<optab><mode>_multp2. */
6636 if (GET_CODE (x) == SIGN_EXTRACT
6637 || GET_CODE (x) == ZERO_EXTRACT)
6639 rtx op0 = XEXP (x, 0);
6640 rtx op1 = XEXP (x, 1);
6641 rtx op2 = XEXP (x, 2);
6643 if (GET_CODE (op0) == MULT
6644 && CONST_INT_P (op1)
6645 && op2 == const0_rtx
6646 && CONST_INT_P (XEXP (op0, 1))
6647 && aarch64_is_extend_from_extract (mode,
6648 XEXP (op0, 1),
6649 op1))
6651 return true;
6654 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6655 No shift. */
6656 else if (GET_CODE (x) == SIGN_EXTEND
6657 || GET_CODE (x) == ZERO_EXTEND)
6658 return REG_P (XEXP (x, 0));
6660 return false;
6663 static bool
6664 aarch64_frint_unspec_p (unsigned int u)
6666 switch (u)
6668 case UNSPEC_FRINTZ:
6669 case UNSPEC_FRINTP:
6670 case UNSPEC_FRINTM:
6671 case UNSPEC_FRINTA:
6672 case UNSPEC_FRINTN:
6673 case UNSPEC_FRINTX:
6674 case UNSPEC_FRINTI:
6675 return true;
6677 default:
6678 return false;
6682 /* Return true iff X is an rtx that will match an extr instruction
6683 i.e. as described in the *extr<mode>5_insn family of patterns.
6684 OP0 and OP1 will be set to the operands of the shifts involved
6685 on success and will be NULL_RTX otherwise. */
6687 static bool
6688 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6690 rtx op0, op1;
6691 scalar_int_mode mode;
6692 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6693 return false;
6695 *res_op0 = NULL_RTX;
6696 *res_op1 = NULL_RTX;
6698 if (GET_CODE (x) != IOR)
6699 return false;
6701 op0 = XEXP (x, 0);
6702 op1 = XEXP (x, 1);
6704 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6705 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6707 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6708 if (GET_CODE (op1) == ASHIFT)
6709 std::swap (op0, op1);
6711 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6712 return false;
6714 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6715 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6717 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6718 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6720 *res_op0 = XEXP (op0, 0);
6721 *res_op1 = XEXP (op1, 0);
6722 return true;
6726 return false;
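/* Example (illustrative): in DImode,
   (ior (ashift (reg x1) (const_int 10)) (lshiftrt (reg x2) (const_int 54)))
   matches because 10 + 54 == 64; *RES_OP0 becomes x1 and *RES_OP1
   becomes x2, corresponding roughly to "extr x0, x1, x2, #54", where
   the immediate is the right-shift amount.  */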
6729 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6730 storing it in *COST. Result is true if the total cost of the operation
6731 has now been calculated. */
6732 static bool
6733 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6735 rtx inner;
6736 rtx comparator;
6737 enum rtx_code cmpcode;
6739 if (COMPARISON_P (op0))
6741 inner = XEXP (op0, 0);
6742 comparator = XEXP (op0, 1);
6743 cmpcode = GET_CODE (op0);
6745 else
6747 inner = op0;
6748 comparator = const0_rtx;
6749 cmpcode = NE;
6752 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6754 /* Conditional branch. */
6755 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6756 return true;
6757 else
6759 if (cmpcode == NE || cmpcode == EQ)
6761 if (comparator == const0_rtx)
6763 /* TBZ/TBNZ/CBZ/CBNZ. */
6764 if (GET_CODE (inner) == ZERO_EXTRACT)
6765 /* TBZ/TBNZ. */
6766 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6767 ZERO_EXTRACT, 0, speed);
6768 else
6769 /* CBZ/CBNZ. */
6770 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6772 return true;
6775 else if (cmpcode == LT || cmpcode == GE)
6777 /* TBZ/TBNZ. */
6778 if (comparator == const0_rtx)
6779 return true;
6783 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6785 /* CCMP. */
6786 if (GET_CODE (op1) == COMPARE)
6788 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6789 if (XEXP (op1, 1) == const0_rtx)
6790 *cost += 1;
6791 if (speed)
6793 machine_mode mode = GET_MODE (XEXP (op1, 0));
6794 const struct cpu_cost_table *extra_cost
6795 = aarch64_tune_params.insn_extra_cost;
6797 if (GET_MODE_CLASS (mode) == MODE_INT)
6798 *cost += extra_cost->alu.arith;
6799 else
6800 *cost += extra_cost->fp[mode == DFmode].compare;
6802 return true;
6805 /* It's a conditional operation based on the status flags,
6806 so it must be some flavor of CSEL. */
6808 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6809 if (GET_CODE (op1) == NEG
6810 || GET_CODE (op1) == NOT
6811 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6812 op1 = XEXP (op1, 0);
6813 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6815 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6816 op1 = XEXP (op1, 0);
6817 op2 = XEXP (op2, 0);
6820 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6821 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6822 return true;
6825 /* We don't know what this is, cost all operands. */
6826 return false;
6829 /* Check whether X is a bitfield operation of the form shift + extend that
6830 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6831 operand to which the bitfield operation is applied. Otherwise return
6832 NULL_RTX. */
6834 static rtx
6835 aarch64_extend_bitfield_pattern_p (rtx x)
6837 rtx_code outer_code = GET_CODE (x);
6838 machine_mode outer_mode = GET_MODE (x);
6840 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6841 && outer_mode != SImode && outer_mode != DImode)
6842 return NULL_RTX;
6844 rtx inner = XEXP (x, 0);
6845 rtx_code inner_code = GET_CODE (inner);
6846 machine_mode inner_mode = GET_MODE (inner);
6847 rtx op = NULL_RTX;
6849 switch (inner_code)
6851 case ASHIFT:
6852 if (CONST_INT_P (XEXP (inner, 1))
6853 && (inner_mode == QImode || inner_mode == HImode))
6854 op = XEXP (inner, 0);
6855 break;
6856 case LSHIFTRT:
6857 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6858 && (inner_mode == QImode || inner_mode == HImode))
6859 op = XEXP (inner, 0);
6860 break;
6861 case ASHIFTRT:
6862 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6863 && (inner_mode == QImode || inner_mode == HImode))
6864 op = XEXP (inner, 0);
6865 break;
6866 default:
6867 break;
6870 return op;
6873 /* Return true if the mask and a shift amount from an RTX of the form
6874 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6875 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6877 bool
6878 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6879 rtx shft_amnt)
6881 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6882 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6883 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6884 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
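/* Example (illustrative): for SImode, MASK == 0xff00 and SHFT_AMNT == 8
   are accepted, since (0xff00 >> 8) + 1 is a power of two and the low 8
   bits of the mask are clear; (x << 8) & 0xff00 then maps to
   "ubfiz w0, w0, #8, #8".  A mask such as 0xff01 is rejected because it
   has bits below the shift amount set.  */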
6887 /* Calculate the cost of calculating X, storing it in *COST. Result
6888 is true if the total cost of the operation has now been calculated. */
6889 static bool
6890 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6891 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6893 rtx op0, op1, op2;
6894 const struct cpu_cost_table *extra_cost
6895 = aarch64_tune_params.insn_extra_cost;
6896 int code = GET_CODE (x);
6897 scalar_int_mode int_mode;
6899 /* By default, assume that everything has equivalent cost to the
6900 cheapest instruction. Any additional costs are applied as a delta
6901 above this default. */
6902 *cost = COSTS_N_INSNS (1);
6904 switch (code)
6906 case SET:
6907 /* The cost depends entirely on the operands to SET. */
6908 *cost = 0;
6909 op0 = SET_DEST (x);
6910 op1 = SET_SRC (x);
6912 switch (GET_CODE (op0))
6914 case MEM:
6915 if (speed)
6917 rtx address = XEXP (op0, 0);
6918 if (VECTOR_MODE_P (mode))
6919 *cost += extra_cost->ldst.storev;
6920 else if (GET_MODE_CLASS (mode) == MODE_INT)
6921 *cost += extra_cost->ldst.store;
6922 else if (mode == SFmode)
6923 *cost += extra_cost->ldst.storef;
6924 else if (mode == DFmode)
6925 *cost += extra_cost->ldst.stored;
6927 *cost +=
6928 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6929 0, speed));
6932 *cost += rtx_cost (op1, mode, SET, 1, speed);
6933 return true;
6935 case SUBREG:
6936 if (! REG_P (SUBREG_REG (op0)))
6937 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6939 /* Fall through. */
6940 case REG:
6941 /* The cost is one per vector-register copied. */
6942 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6944 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
6945 *cost = COSTS_N_INSNS (nregs);
6947 /* const0_rtx is in general free, but we will use an
6948 instruction to set a register to 0. */
6949 else if (REG_P (op1) || op1 == const0_rtx)
6951 /* The cost is 1 per register copied. */
6952 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
6953 *cost = COSTS_N_INSNS (nregs);
6955 else
6956 /* Cost is just the cost of the RHS of the set. */
6957 *cost += rtx_cost (op1, mode, SET, 1, speed);
6958 return true;
6960 case ZERO_EXTRACT:
6961 case SIGN_EXTRACT:
6962 /* Bit-field insertion. Strip any redundant widening of
6963 the RHS to meet the width of the target. */
6964 if (GET_CODE (op1) == SUBREG)
6965 op1 = SUBREG_REG (op1);
6966 if ((GET_CODE (op1) == ZERO_EXTEND
6967 || GET_CODE (op1) == SIGN_EXTEND)
6968 && CONST_INT_P (XEXP (op0, 1))
6969 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6970 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6971 op1 = XEXP (op1, 0);
6973 if (CONST_INT_P (op1))
6975 /* MOV immediate is assumed to always be cheap. */
6976 *cost = COSTS_N_INSNS (1);
6978 else
6980 /* BFM. */
6981 if (speed)
6982 *cost += extra_cost->alu.bfi;
6983 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6986 return true;
6988 default:
6989 /* We can't make sense of this, assume default cost. */
6990 *cost = COSTS_N_INSNS (1);
6991 return false;
6993 return false;
6995 case CONST_INT:
6996 /* If an instruction can incorporate a constant within the
6997 instruction, the instruction's expression avoids calling
6998 rtx_cost() on the constant. If rtx_cost() is called on a
6999 constant, then it is usually because the constant must be
7000 moved into a register by one or more instructions.
7002 The exception is constant 0, which can be expressed
7003 as XZR/WZR and is therefore free. The exception to this is
7004 if we have (set (reg) (const0_rtx)) in which case we must cost
7005 the move. However, we can catch that when we cost the SET, so
7006 we don't need to consider that here. */
7007 if (x == const0_rtx)
7008 *cost = 0;
7009 else
7011 /* To an approximation, the cost of building any other constant is
7012 proportional to the number of instructions
7013 required to build that constant. This is true whether we
7014 are compiling for SPEED or otherwise. */
7015 if (!is_a <scalar_int_mode> (mode, &int_mode))
7016 int_mode = word_mode;
7017 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7018 (NULL_RTX, x, false, int_mode));
7020 return true;
7022 case CONST_DOUBLE:
7024 /* First determine number of instructions to do the move
7025 as an integer constant. */
7026 if (!aarch64_float_const_representable_p (x)
7027 && !aarch64_can_const_movi_rtx_p (x, mode)
7028 && aarch64_float_const_rtx_p (x))
7030 unsigned HOST_WIDE_INT ival;
7031 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7032 gcc_assert (succeed);
7034 scalar_int_mode imode = (mode == HFmode
7035 ? SImode
7036 : int_mode_for_mode (mode).require ());
7037 int ncost = aarch64_internal_mov_immediate
7038 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7039 *cost += COSTS_N_INSNS (ncost);
7040 return true;
7043 if (speed)
7045 /* mov[df,sf]_aarch64. */
7046 if (aarch64_float_const_representable_p (x))
7047 /* FMOV (scalar immediate). */
7048 *cost += extra_cost->fp[mode == DFmode].fpconst;
7049 else if (!aarch64_float_const_zero_rtx_p (x))
7051 /* This will be a load from memory. */
7052 if (mode == DFmode)
7053 *cost += extra_cost->ldst.loadd;
7054 else
7055 *cost += extra_cost->ldst.loadf;
7057 else
7058 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7059 or MOV v0.s[0], wzr - neither of which is modeled by the
7060 cost tables. Just use the default cost. */
7065 return true;
7067 case MEM:
7068 if (speed)
7070 /* For loads we want the base cost of a load, plus an
7071 approximation for the additional cost of the addressing
7072 mode. */
7073 rtx address = XEXP (x, 0);
7074 if (VECTOR_MODE_P (mode))
7075 *cost += extra_cost->ldst.loadv;
7076 else if (GET_MODE_CLASS (mode) == MODE_INT)
7077 *cost += extra_cost->ldst.load;
7078 else if (mode == SFmode)
7079 *cost += extra_cost->ldst.loadf;
7080 else if (mode == DFmode)
7081 *cost += extra_cost->ldst.loadd;
7083 *cost +=
7084 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7085 0, speed));
7088 return true;
7090 case NEG:
7091 op0 = XEXP (x, 0);
7093 if (VECTOR_MODE_P (mode))
7095 if (speed)
7097 /* FNEG. */
7098 *cost += extra_cost->vect.alu;
7100 return false;
7103 if (GET_MODE_CLASS (mode) == MODE_INT)
7105 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7106 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7108 /* CSETM. */
7109 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7110 return true;
7113 /* Cost this as SUB wzr, X. */
7114 op0 = CONST0_RTX (mode);
7115 op1 = XEXP (x, 0);
7116 goto cost_minus;
7119 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7121 /* Support (neg(fma...)) as a single instruction only if
7122 sign of zeros is unimportant. This matches the decision
7123 making in aarch64.md. */
7124 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7126 /* FNMADD. */
7127 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7128 return true;
7130 if (GET_CODE (op0) == MULT)
7132 /* FNMUL. */
7133 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7134 return true;
7136 if (speed)
7137 /* FNEG. */
7138 *cost += extra_cost->fp[mode == DFmode].neg;
7139 return false;
7142 return false;
7144 case CLRSB:
7145 case CLZ:
7146 if (speed)
7148 if (VECTOR_MODE_P (mode))
7149 *cost += extra_cost->vect.alu;
7150 else
7151 *cost += extra_cost->alu.clz;
7154 return false;
7156 case COMPARE:
7157 op0 = XEXP (x, 0);
7158 op1 = XEXP (x, 1);
7160 if (op1 == const0_rtx
7161 && GET_CODE (op0) == AND)
7163 x = op0;
7164 mode = GET_MODE (op0);
7165 goto cost_logic;
7168 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7170 /* TODO: A write to the CC flags possibly costs extra; this
7171 needs encoding in the cost tables. */
7173 mode = GET_MODE (op0);
7174 /* ANDS. */
7175 if (GET_CODE (op0) == AND)
7177 x = op0;
7178 goto cost_logic;
7181 if (GET_CODE (op0) == PLUS)
7183 /* ADDS (and CMN alias). */
7184 x = op0;
7185 goto cost_plus;
7188 if (GET_CODE (op0) == MINUS)
7190 /* SUBS. */
7191 x = op0;
7192 goto cost_minus;
7195 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7196 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7197 && CONST_INT_P (XEXP (op0, 2)))
7199 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7200 Handle it here directly rather than going to cost_logic
7201 since we know the immediate generated for the TST is valid
7202 so we can avoid creating an intermediate rtx for it only
7203 for costing purposes. */
7204 if (speed)
7205 *cost += extra_cost->alu.logical;
7207 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7208 ZERO_EXTRACT, 0, speed);
7209 return true;
7212 if (GET_CODE (op1) == NEG)
7214 /* CMN. */
7215 if (speed)
7216 *cost += extra_cost->alu.arith;
7218 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7219 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7220 return true;
7223 /* CMP.
7225 Compare can freely swap the order of operands, and
7226 canonicalization puts the more complex operation first.
7227 But the integer MINUS logic expects the shift/extend
7228 operation in op1. */
7229 if (! (REG_P (op0)
7230 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7232 op0 = XEXP (x, 1);
7233 op1 = XEXP (x, 0);
7235 goto cost_minus;
7238 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7240 /* FCMP. */
7241 if (speed)
7242 *cost += extra_cost->fp[mode == DFmode].compare;
7244 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7246 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7247 /* FCMP supports constant 0.0 for no extra cost. */
7248 return true;
7250 return false;
7253 if (VECTOR_MODE_P (mode))
7255 /* Vector compare. */
7256 if (speed)
7257 *cost += extra_cost->vect.alu;
7259 if (aarch64_float_const_zero_rtx_p (op1))
7261 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7262 cost. */
7263 return true;
7265 return false;
7267 return false;
7269 case MINUS:
7271 op0 = XEXP (x, 0);
7272 op1 = XEXP (x, 1);
7274 cost_minus:
7275 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7277 /* Detect valid immediates. */
7278 if ((GET_MODE_CLASS (mode) == MODE_INT
7279 || (GET_MODE_CLASS (mode) == MODE_CC
7280 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7281 && CONST_INT_P (op1)
7282 && aarch64_uimm12_shift (INTVAL (op1)))
7284 if (speed)
7285 /* SUB(S) (immediate). */
7286 *cost += extra_cost->alu.arith;
7287 return true;
7290 /* Look for SUB (extended register). */
7291 if (is_a <scalar_int_mode> (mode, &int_mode)
7292 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7294 if (speed)
7295 *cost += extra_cost->alu.extend_arith;
7297 op1 = aarch64_strip_extend (op1, true);
7298 *cost += rtx_cost (op1, VOIDmode,
7299 (enum rtx_code) GET_CODE (op1), 0, speed);
7300 return true;
7303 rtx new_op1 = aarch64_strip_extend (op1, false);
7305 /* Cost this as an FMA-alike operation. */
7306 if ((GET_CODE (new_op1) == MULT
7307 || aarch64_shift_p (GET_CODE (new_op1)))
7308 && code != COMPARE)
7310 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7311 (enum rtx_code) code,
7312 speed);
7313 return true;
7316 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7318 if (speed)
7320 if (VECTOR_MODE_P (mode))
7322 /* Vector SUB. */
7323 *cost += extra_cost->vect.alu;
7325 else if (GET_MODE_CLASS (mode) == MODE_INT)
7327 /* SUB(S). */
7328 *cost += extra_cost->alu.arith;
7330 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7332 /* FSUB. */
7333 *cost += extra_cost->fp[mode == DFmode].addsub;
7336 return true;
7339 case PLUS:
7341 rtx new_op0;
7343 op0 = XEXP (x, 0);
7344 op1 = XEXP (x, 1);
7346 cost_plus:
7347 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7348 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7350 /* CSINC. */
7351 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7352 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7353 return true;
7356 if (GET_MODE_CLASS (mode) == MODE_INT
7357 && CONST_INT_P (op1)
7358 && aarch64_uimm12_shift (INTVAL (op1)))
7360 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7362 if (speed)
7363 /* ADD (immediate). */
7364 *cost += extra_cost->alu.arith;
7365 return true;
7368 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7370 /* Look for ADD (extended register). */
7371 if (is_a <scalar_int_mode> (mode, &int_mode)
7372 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7374 if (speed)
7375 *cost += extra_cost->alu.extend_arith;
7377 op0 = aarch64_strip_extend (op0, true);
7378 *cost += rtx_cost (op0, VOIDmode,
7379 (enum rtx_code) GET_CODE (op0), 0, speed);
7380 return true;
7383 /* Strip any extend, leave shifts behind as we will
7384 cost them through mult_cost. */
7385 new_op0 = aarch64_strip_extend (op0, false);
7387 if (GET_CODE (new_op0) == MULT
7388 || aarch64_shift_p (GET_CODE (new_op0)))
7390 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7391 speed);
7392 return true;
7395 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7397 if (speed)
7399 if (VECTOR_MODE_P (mode))
7401 /* Vector ADD. */
7402 *cost += extra_cost->vect.alu;
7404 else if (GET_MODE_CLASS (mode) == MODE_INT)
7406 /* ADD. */
7407 *cost += extra_cost->alu.arith;
7409 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7411 /* FADD. */
7412 *cost += extra_cost->fp[mode == DFmode].addsub;
7415 return true;
7418 case BSWAP:
7419 *cost = COSTS_N_INSNS (1);
7421 if (speed)
7423 if (VECTOR_MODE_P (mode))
7424 *cost += extra_cost->vect.alu;
7425 else
7426 *cost += extra_cost->alu.rev;
7428 return false;
7430 case IOR:
7431 if (aarch_rev16_p (x))
7433 *cost = COSTS_N_INSNS (1);
7435 if (speed)
7437 if (VECTOR_MODE_P (mode))
7438 *cost += extra_cost->vect.alu;
7439 else
7440 *cost += extra_cost->alu.rev;
7442 return true;
7445 if (aarch64_extr_rtx_p (x, &op0, &op1))
7447 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7448 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7449 if (speed)
7450 *cost += extra_cost->alu.shift;
7452 return true;
7454 /* Fall through. */
7455 case XOR:
7456 case AND:
7457 cost_logic:
7458 op0 = XEXP (x, 0);
7459 op1 = XEXP (x, 1);
7461 if (VECTOR_MODE_P (mode))
7463 if (speed)
7464 *cost += extra_cost->vect.alu;
7465 return true;
7468 if (code == AND
7469 && GET_CODE (op0) == MULT
7470 && CONST_INT_P (XEXP (op0, 1))
7471 && CONST_INT_P (op1)
7472 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7473 INTVAL (op1)) != 0)
7475 /* This is a UBFM/SBFM. */
7476 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7477 if (speed)
7478 *cost += extra_cost->alu.bfx;
7479 return true;
7482 if (is_int_mode (mode, &int_mode))
7484 if (CONST_INT_P (op1))
7486 /* We have a mask + shift version of a UBFIZ
7487 i.e. the *andim_ashift<mode>_bfiz pattern. */
7488 if (GET_CODE (op0) == ASHIFT
7489 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7490 XEXP (op0, 1)))
7492 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7493 (enum rtx_code) code, 0, speed);
7494 if (speed)
7495 *cost += extra_cost->alu.bfx;
7497 return true;
7499 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7501 /* We possibly get the immediate for free, this is not
7502 modelled. */
7503 *cost += rtx_cost (op0, int_mode,
7504 (enum rtx_code) code, 0, speed);
7505 if (speed)
7506 *cost += extra_cost->alu.logical;
7508 return true;
7511 else
7513 rtx new_op0 = op0;
7515 /* Handle ORN, EON, or BIC. */
7516 if (GET_CODE (op0) == NOT)
7517 op0 = XEXP (op0, 0);
7519 new_op0 = aarch64_strip_shift (op0);
7521 /* If we had a shift on op0 then this is a logical-shift-
7522 by-register/immediate operation. Otherwise, this is just
7523 a logical operation. */
7524 if (speed)
7526 if (new_op0 != op0)
7528 /* Shift by immediate. */
7529 if (CONST_INT_P (XEXP (op0, 1)))
7530 *cost += extra_cost->alu.log_shift;
7531 else
7532 *cost += extra_cost->alu.log_shift_reg;
7534 else
7535 *cost += extra_cost->alu.logical;
7538 /* In both cases we want to cost both operands. */
7539 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7540 0, speed);
7541 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7542 1, speed);
7544 return true;
7547 return false;
7549 case NOT:
7550 x = XEXP (x, 0);
7551 op0 = aarch64_strip_shift (x);
7553 if (VECTOR_MODE_P (mode))
7555 /* Vector NOT. */
7556 *cost += extra_cost->vect.alu;
7557 return false;
7560 /* MVN-shifted-reg. */
7561 if (op0 != x)
7563 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7565 if (speed)
7566 *cost += extra_cost->alu.log_shift;
7568 return true;
7570 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7571 Handle the second form here taking care that 'a' in the above can
7572 be a shift. */
7573 else if (GET_CODE (op0) == XOR)
7575 rtx newop0 = XEXP (op0, 0);
7576 rtx newop1 = XEXP (op0, 1);
7577 rtx op0_stripped = aarch64_strip_shift (newop0);
7579 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7580 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7582 if (speed)
7584 if (op0_stripped != newop0)
7585 *cost += extra_cost->alu.log_shift;
7586 else
7587 *cost += extra_cost->alu.logical;
7590 return true;
7592 /* MVN. */
7593 if (speed)
7594 *cost += extra_cost->alu.logical;
7596 return false;
7598 case ZERO_EXTEND:
7600 op0 = XEXP (x, 0);
7601 /* If a value is written in SI mode, then zero extended to DI
7602 mode, the operation will in general be free as a write to
7603 a 'w' register implicitly zeroes the upper bits of an 'x'
7604 register. However, if this is
7606 (set (reg) (zero_extend (reg)))
7608 we must cost the explicit register move. */
7609 if (mode == DImode
7610 && GET_MODE (op0) == SImode
7611 && outer == SET)
7613 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7615 /* If OP_COST is non-zero, then the cost of the zero extend
7616 is effectively the cost of the inner operation. Otherwise
7617 we have a MOV instruction and we take the cost from the MOV
7618 itself. This is true independently of whether we are
7619 optimizing for space or time. */
7620 if (op_cost)
7621 *cost = op_cost;
7623 return true;
7625 else if (MEM_P (op0))
7627 /* All loads can zero extend to any size for free. */
7628 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7629 return true;
7632 op0 = aarch64_extend_bitfield_pattern_p (x);
7633 if (op0)
7635 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7636 if (speed)
7637 *cost += extra_cost->alu.bfx;
7638 return true;
7641 if (speed)
7643 if (VECTOR_MODE_P (mode))
7645 /* UMOV. */
7646 *cost += extra_cost->vect.alu;
7648 else
7650 /* We generate an AND instead of UXTB/UXTH. */
7651 *cost += extra_cost->alu.logical;
7654 return false;
7656 case SIGN_EXTEND:
7657 if (MEM_P (XEXP (x, 0)))
7659 /* LDRSH. */
7660 if (speed)
7662 rtx address = XEXP (XEXP (x, 0), 0);
7663 *cost += extra_cost->ldst.load_sign_extend;
7665 *cost +=
7666 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7667 0, speed));
7669 return true;
7672 op0 = aarch64_extend_bitfield_pattern_p (x);
7673 if (op0)
7675 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7676 if (speed)
7677 *cost += extra_cost->alu.bfx;
7678 return true;
7681 if (speed)
7683 if (VECTOR_MODE_P (mode))
7684 *cost += extra_cost->vect.alu;
7685 else
7686 *cost += extra_cost->alu.extend;
7688 return false;
7690 case ASHIFT:
7691 op0 = XEXP (x, 0);
7692 op1 = XEXP (x, 1);
7694 if (CONST_INT_P (op1))
7696 if (speed)
7698 if (VECTOR_MODE_P (mode))
7700 /* Vector shift (immediate). */
7701 *cost += extra_cost->vect.alu;
7703 else
7705 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7706 aliases. */
7707 *cost += extra_cost->alu.shift;
7711 /* We can incorporate zero/sign extend for free. */
7712 if (GET_CODE (op0) == ZERO_EXTEND
7713 || GET_CODE (op0) == SIGN_EXTEND)
7714 op0 = XEXP (op0, 0);
7716 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7717 return true;
7719 else
7721 if (VECTOR_MODE_P (mode))
7723 if (speed)
7724 /* Vector shift (register). */
7725 *cost += extra_cost->vect.alu;
7727 else
7729 if (speed)
7730 /* LSLV. */
7731 *cost += extra_cost->alu.shift_reg;
7733 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7734 && CONST_INT_P (XEXP (op1, 1))
7735 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7737 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7738 /* We already demanded XEXP (op1, 0) to be REG_P, so
7739 don't recurse into it. */
7740 return true;
7743 return false; /* All arguments need to be in registers. */
7746 case ROTATE:
7747 case ROTATERT:
7748 case LSHIFTRT:
7749 case ASHIFTRT:
7750 op0 = XEXP (x, 0);
7751 op1 = XEXP (x, 1);
7753 if (CONST_INT_P (op1))
7755 /* ASR (immediate) and friends. */
7756 if (speed)
7758 if (VECTOR_MODE_P (mode))
7759 *cost += extra_cost->vect.alu;
7760 else
7761 *cost += extra_cost->alu.shift;
7764 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7765 return true;
7767 else
7769 if (VECTOR_MODE_P (mode))
7771 if (speed)
7772 /* Vector shift (register). */
7773 *cost += extra_cost->vect.alu;
7775 else
7777 if (speed)
7778 /* ASR (register) and friends. */
7779 *cost += extra_cost->alu.shift_reg;
7781 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7782 && CONST_INT_P (XEXP (op1, 1))
7783 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7785 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7786 /* We already demanded XEXP (op1, 0) to be REG_P, so
7787 don't recurse into it. */
7788 return true;
7791 return false; /* All arguments need to be in registers. */
7794 case SYMBOL_REF:
7796 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7797 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7799 /* LDR. */
7800 if (speed)
7801 *cost += extra_cost->ldst.load;
7803 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7804 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7806 /* ADRP, followed by ADD. */
7807 *cost += COSTS_N_INSNS (1);
7808 if (speed)
7809 *cost += 2 * extra_cost->alu.arith;
7811 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7812 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7814 /* ADR. */
7815 if (speed)
7816 *cost += extra_cost->alu.arith;
7819 if (flag_pic)
7821 /* One extra load instruction, after accessing the GOT. */
7822 *cost += COSTS_N_INSNS (1);
7823 if (speed)
7824 *cost += extra_cost->ldst.load;
7826 return true;
7828 case HIGH:
7829 case LO_SUM:
7830 /* ADRP/ADD (immediate). */
7831 if (speed)
7832 *cost += extra_cost->alu.arith;
7833 return true;
7835 case ZERO_EXTRACT:
7836 case SIGN_EXTRACT:
7837 /* UBFX/SBFX. */
7838 if (speed)
7840 if (VECTOR_MODE_P (mode))
7841 *cost += extra_cost->vect.alu;
7842 else
7843 *cost += extra_cost->alu.bfx;
7846 /* We can trust that the immediates used will be correct (there
7847 are no by-register forms), so we need only cost op0. */
7848 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7849 return true;
7851 case MULT:
7852 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7853 /* aarch64_rtx_mult_cost always handles recursion to its
7854 operands. */
7855 return true;
7857 case MOD:
7858 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7859 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7860 an unconditional negate. This case should only ever be reached through
7861 the set_smod_pow2_cheap check in expmed.c. */
7862 if (CONST_INT_P (XEXP (x, 1))
7863 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7864 && (mode == SImode || mode == DImode))
7866 /* We expand to 4 instructions. Reset the baseline. */
7867 *cost = COSTS_N_INSNS (4);
7869 if (speed)
7870 *cost += 2 * extra_cost->alu.logical
7871 + 2 * extra_cost->alu.arith;
7873 return true;
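/* A rough sketch of the 4-instruction expansion costed above, for a
   signed SImode "x % 4" (the exact sequence is determined by the
   expander in aarch64.md, so treat this as illustrative only):

     negs  w1, w0            ; w1 = -x, flags set from the negation
     and   w0, w0, #3        ; remainder for x >= 0
     and   w1, w1, #3        ; remainder magnitude for x < 0
     csneg w0, w0, w1, mi    ; select, negating the second operand  */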
7876 /* Fall-through. */
7877 case UMOD:
7878 if (speed)
7880 /* Slightly prefer UMOD over SMOD. */
7881 if (VECTOR_MODE_P (mode))
7882 *cost += extra_cost->vect.alu;
7883 else if (GET_MODE_CLASS (mode) == MODE_INT)
7884 *cost += (extra_cost->mult[mode == DImode].add
7885 + extra_cost->mult[mode == DImode].idiv
7886 + (code == MOD ? 1 : 0));
7888 return false; /* All arguments need to be in registers. */
7890 case DIV:
7891 case UDIV:
7892 case SQRT:
7893 if (speed)
7895 if (VECTOR_MODE_P (mode))
7896 *cost += extra_cost->vect.alu;
7897 else if (GET_MODE_CLASS (mode) == MODE_INT)
7898 /* There is no integer SQRT, so only DIV and UDIV can get
7899 here. */
7900 *cost += (extra_cost->mult[mode == DImode].idiv
7901 /* Slightly prefer UDIV over SDIV. */
7902 + (code == DIV ? 1 : 0));
7903 else
7904 *cost += extra_cost->fp[mode == DFmode].div;
7906 return false; /* All arguments need to be in registers. */
7908 case IF_THEN_ELSE:
7909 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7910 XEXP (x, 2), cost, speed);
7912 case EQ:
7913 case NE:
7914 case GT:
7915 case GTU:
7916 case LT:
7917 case LTU:
7918 case GE:
7919 case GEU:
7920 case LE:
7921 case LEU:
7923 return false; /* All arguments must be in registers. */
7925 case FMA:
7926 op0 = XEXP (x, 0);
7927 op1 = XEXP (x, 1);
7928 op2 = XEXP (x, 2);
7930 if (speed)
7932 if (VECTOR_MODE_P (mode))
7933 *cost += extra_cost->vect.alu;
7934 else
7935 *cost += extra_cost->fp[mode == DFmode].fma;
7938 /* FMSUB, FNMADD, and FNMSUB are free. */
7939 if (GET_CODE (op0) == NEG)
7940 op0 = XEXP (op0, 0);
7942 if (GET_CODE (op2) == NEG)
7943 op2 = XEXP (op2, 0);
7945 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7946 and the by-element operand as operand 0. */
7947 if (GET_CODE (op1) == NEG)
7948 op1 = XEXP (op1, 0);
7950 /* Catch vector-by-element operations. The by-element operand can
7951 either be (vec_duplicate (vec_select (x))) or just
7952 (vec_select (x)), depending on whether we are multiplying by
7953 a vector or a scalar.
7955 Canonicalization is not very good in these cases: FMA4 will put the
7956 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7957 if (GET_CODE (op0) == VEC_DUPLICATE)
7958 op0 = XEXP (op0, 0);
7959 else if (GET_CODE (op1) == VEC_DUPLICATE)
7960 op1 = XEXP (op1, 0);
7962 if (GET_CODE (op0) == VEC_SELECT)
7963 op0 = XEXP (op0, 0);
7964 else if (GET_CODE (op1) == VEC_SELECT)
7965 op1 = XEXP (op1, 0);
7967 /* If the remaining parameters are not registers,
7968 get the cost to put them into registers. */
7969 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7970 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7971 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7972 return true;
7974 case FLOAT:
7975 case UNSIGNED_FLOAT:
7976 if (speed)
7977 *cost += extra_cost->fp[mode == DFmode].fromint;
7978 return false;
7980 case FLOAT_EXTEND:
7981 if (speed)
7983 if (VECTOR_MODE_P (mode))
7985 /* Vector widening conversion. */
7986 *cost += extra_cost->vect.alu;
7988 else
7989 *cost += extra_cost->fp[mode == DFmode].widen;
7991 return false;
7993 case FLOAT_TRUNCATE:
7994 if (speed)
7996 if (VECTOR_MODE_P (mode))
7998 /* Vector narrowing conversion. */
7999 *cost += extra_cost->vect.alu;
8001 else
8002 *cost += extra_cost->fp[mode == DFmode].narrow;
8004 return false;
8006 case FIX:
8007 case UNSIGNED_FIX:
8008 x = XEXP (x, 0);
8009 /* Strip the rounding part. They will all be implemented
8010 by the fcvt* family of instructions anyway. */
8011 if (GET_CODE (x) == UNSPEC)
8013 unsigned int uns_code = XINT (x, 1);
8015 if (uns_code == UNSPEC_FRINTA
8016 || uns_code == UNSPEC_FRINTM
8017 || uns_code == UNSPEC_FRINTN
8018 || uns_code == UNSPEC_FRINTP
8019 || uns_code == UNSPEC_FRINTZ)
8020 x = XVECEXP (x, 0, 0);
8023 if (speed)
8025 if (VECTOR_MODE_P (mode))
8026 *cost += extra_cost->vect.alu;
8027 else
8028 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8031 /* We can combine fmul by a power of 2 followed by a fcvt into a single
8032 fixed-point fcvt. */
8033 if (GET_CODE (x) == MULT
8034 && ((VECTOR_MODE_P (mode)
8035 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8036 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8038 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8039 0, speed);
8040 return true;
8043 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8044 return true;
8046 case ABS:
8047 if (VECTOR_MODE_P (mode))
8049 /* ABS (vector). */
8050 if (speed)
8051 *cost += extra_cost->vect.alu;
8053 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8055 op0 = XEXP (x, 0);
8057 /* FABD, which is analogous to FADD. */
8058 if (GET_CODE (op0) == MINUS)
8060 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8061 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8062 if (speed)
8063 *cost += extra_cost->fp[mode == DFmode].addsub;
8065 return true;
8067 /* Simple FABS is analogous to FNEG. */
8068 if (speed)
8069 *cost += extra_cost->fp[mode == DFmode].neg;
8071 else
8073 /* Integer ABS will either be split into
8074 two arithmetic instructions, or will be an ABS
8075 (scalar), which we don't model. */
8076 *cost = COSTS_N_INSNS (2);
8077 if (speed)
8078 *cost += 2 * extra_cost->alu.arith;
8080 return false;
8082 case SMAX:
8083 case SMIN:
8084 if (speed)
8086 if (VECTOR_MODE_P (mode))
8087 *cost += extra_cost->vect.alu;
8088 else
8090 /* FMAXNM/FMINNM/FMAX/FMIN.
8091 TODO: This may not be accurate for all implementations, but
8092 we do not model this in the cost tables. */
8093 *cost += extra_cost->fp[mode == DFmode].addsub;
8096 return false;
8098 case UNSPEC:
8099 /* The floating point round to integer frint* instructions. */
8100 if (aarch64_frint_unspec_p (XINT (x, 1)))
8102 if (speed)
8103 *cost += extra_cost->fp[mode == DFmode].roundint;
8105 return false;
8108 if (XINT (x, 1) == UNSPEC_RBIT)
8110 if (speed)
8111 *cost += extra_cost->alu.rev;
8113 return false;
8115 break;
8117 case TRUNCATE:
8119 /* Decompose <su>muldi3_highpart. */
8120 if (/* (truncate:DI */
8121 mode == DImode
8122 /* (lshiftrt:TI */
8123 && GET_MODE (XEXP (x, 0)) == TImode
8124 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8125 /* (mult:TI */
8126 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8127 /* (ANY_EXTEND:TI (reg:DI))
8128 (ANY_EXTEND:TI (reg:DI))) */
8129 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8130 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8131 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8132 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8133 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8134 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8135 /* (const_int 64) */
8136 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8137 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8139 /* UMULH/SMULH. */
8140 if (speed)
8141 *cost += extra_cost->mult[mode == DImode].extend;
8142 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8143 mode, MULT, 0, speed);
8144 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8145 mode, MULT, 1, speed);
8146 return true;
8149 /* Fall through. */
8150 default:
8151 break;
8154 if (dump_file
8155 && flag_aarch64_verbose_cost)
8156 fprintf (dump_file,
8157 "\nFailed to cost RTX. Assuming default cost.\n");
8159 return true;
8162 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
8163 calculated for X. This cost is stored in *COST. Returns true
8164 if the total cost of X was calculated. */
8165 static bool
8166 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8167 int param, int *cost, bool speed)
8169 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8171 if (dump_file
8172 && flag_aarch64_verbose_cost)
8174 print_rtl_single (dump_file, x);
8175 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8176 speed ? "Hot" : "Cold",
8177 *cost, result ? "final" : "partial");
8180 return result;
8183 static int
8184 aarch64_register_move_cost (machine_mode mode,
8185 reg_class_t from_i, reg_class_t to_i)
8187 enum reg_class from = (enum reg_class) from_i;
8188 enum reg_class to = (enum reg_class) to_i;
8189 const struct cpu_regmove_cost *regmove_cost
8190 = aarch64_tune_params.regmove_cost;
8192 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8193 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8194 to = GENERAL_REGS;
8196 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8197 from = GENERAL_REGS;
8199 /* The cost of moving between a GPR and the stack register is the same as GP2GP. */
8200 if ((from == GENERAL_REGS && to == STACK_REG)
8201 || (to == GENERAL_REGS && from == STACK_REG))
8202 return regmove_cost->GP2GP;
8204 /* To/from the stack register, we move via the GPRs. */
8205 if (to == STACK_REG || from == STACK_REG)
8206 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8207 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8209 if (GET_MODE_SIZE (mode) == 16)
8211 /* 128-bit operations on general registers require 2 instructions. */
8212 if (from == GENERAL_REGS && to == GENERAL_REGS)
8213 return regmove_cost->GP2GP * 2;
8214 else if (from == GENERAL_REGS)
8215 return regmove_cost->GP2FP * 2;
8216 else if (to == GENERAL_REGS)
8217 return regmove_cost->FP2GP * 2;
8219 /* When AdvSIMD instructions are disabled it is not possible to move
8220 a 128-bit value directly between Q registers. This is handled in
8221 secondary reload. A general register is used as a scratch to move
8222 the upper DI value and the lower DI value is moved directly,
8223 hence the cost is the sum of three moves. */
8224 if (! TARGET_SIMD)
8225 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8227 return regmove_cost->FP2FP;
8230 if (from == GENERAL_REGS && to == GENERAL_REGS)
8231 return regmove_cost->GP2GP;
8232 else if (from == GENERAL_REGS)
8233 return regmove_cost->GP2FP;
8234 else if (to == GENERAL_REGS)
8235 return regmove_cost->FP2GP;
8237 return regmove_cost->FP2FP;
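/* For exposition, a sketch of how the rules above compose for two
   representative queries (assuming the generic cost tables): a 128-bit move
   between a general register and an FP register needs two 64-bit transfers,
   and a move involving the stack register is routed through the GPRs:

     aarch64_register_move_cost (TImode, GENERAL_REGS, FP_REGS)
       == regmove_cost->GP2FP * 2;

     aarch64_register_move_cost (DImode, STACK_REG, FP_REGS)
       == regmove_cost->GP2GP        (STACK_REG -> GENERAL_REGS)
          + regmove_cost->GP2FP;     (GENERAL_REGS -> FP_REGS)  */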
8240 static int
8241 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8242 reg_class_t rclass ATTRIBUTE_UNUSED,
8243 bool in ATTRIBUTE_UNUSED)
8245 return aarch64_tune_params.memmov_cost;
8248 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8249 to optimize 1.0/sqrt. */
8251 static bool
8252 use_rsqrt_p (machine_mode mode)
8254 return (!flag_trapping_math
8255 && flag_unsafe_math_optimizations
8256 && ((aarch64_tune_params.approx_modes->recip_sqrt
8257 & AARCH64_APPROX_MODE (mode))
8258 || flag_mrecip_low_precision_sqrt));
8261 /* Function to decide when to use the approximate reciprocal square root
8262 builtin. */
8264 static tree
8265 aarch64_builtin_reciprocal (tree fndecl)
8267 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8269 if (!use_rsqrt_p (mode))
8270 return NULL_TREE;
8271 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8274 typedef rtx (*rsqrte_type) (rtx, rtx);
8276 /* Select reciprocal square root initial estimate insn depending on machine
8277 mode. */
8279 static rsqrte_type
8280 get_rsqrte_type (machine_mode mode)
8282 switch (mode)
8284 case E_DFmode: return gen_aarch64_rsqrtedf;
8285 case E_SFmode: return gen_aarch64_rsqrtesf;
8286 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8287 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8288 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8289 default: gcc_unreachable ();
8293 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8295 /* Select reciprocal square root series step insn depending on machine mode. */
8297 static rsqrts_type
8298 get_rsqrts_type (machine_mode mode)
8300 switch (mode)
8302 case E_DFmode: return gen_aarch64_rsqrtsdf;
8303 case E_SFmode: return gen_aarch64_rsqrtssf;
8304 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8305 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8306 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8307 default: gcc_unreachable ();
8311 /* Emit instruction sequence to compute either the approximate square root
8312 or its approximate reciprocal, depending on the flag RECP, and return
8313 whether the sequence was emitted or not. */
8315 bool
8316 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8318 machine_mode mode = GET_MODE (dst);
8320 if (GET_MODE_INNER (mode) == HFmode)
8322 gcc_assert (!recp);
8323 return false;
8326 if (!recp)
8328 if (!(flag_mlow_precision_sqrt
8329 || (aarch64_tune_params.approx_modes->sqrt
8330 & AARCH64_APPROX_MODE (mode))))
8331 return false;
8333 if (flag_finite_math_only
8334 || flag_trapping_math
8335 || !flag_unsafe_math_optimizations
8336 || optimize_function_for_size_p (cfun))
8337 return false;
8339 else
8340 /* Caller assumes we cannot fail. */
8341 gcc_assert (use_rsqrt_p (mode));
8343 machine_mode mmsk = mode_for_int_vector (mode).require ();
8344 rtx xmsk = gen_reg_rtx (mmsk);
8345 if (!recp)
8346 /* When calculating the approximate square root, compare the
8347 argument with 0.0 and create a mask. */
8348 emit_insn (gen_rtx_SET (xmsk,
8349 gen_rtx_NEG (mmsk,
8350 gen_rtx_EQ (mmsk, src,
8351 CONST0_RTX (mode)))));
8353 /* Estimate the approximate reciprocal square root. */
8354 rtx xdst = gen_reg_rtx (mode);
8355 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8357 /* Iterate over the series twice for SF and thrice for DF. */
8358 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8360 /* Optionally iterate over the series one time fewer for faster performance,
8361 at the cost of some accuracy. */
8362 if ((recp && flag_mrecip_low_precision_sqrt)
8363 || (!recp && flag_mlow_precision_sqrt))
8364 iterations--;
8366 /* Iterate over the series to calculate the approximate reciprocal square
8367 root. */
8368 rtx x1 = gen_reg_rtx (mode);
8369 while (iterations--)
8371 rtx x2 = gen_reg_rtx (mode);
8372 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8374 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8376 if (iterations > 0)
8377 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8380 if (!recp)
8382 /* Qualify the approximate reciprocal square root when the argument is
8383 0.0 by squashing the intermediate result to 0.0. */
8384 rtx xtmp = gen_reg_rtx (mmsk);
8385 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8386 gen_rtx_SUBREG (mmsk, xdst, 0)));
8387 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8389 /* Calculate the approximate square root. */
8390 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8393 /* Finalize the approximation. */
8394 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8396 return true;
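/* For reference, an illustrative note rather than part of the original
   sequence description: the loop above is the standard Newton-Raphson
   refinement of an initial 1/sqrt(d) estimate.  FRSQRTS computes
   (3 - a * b) / 2, so with x2 = x_n * x_n and x1 = FRSQRTS (d, x2) each pass
   performs

     x_(n+1) = x_n * (3 - d * x_n * x_n) / 2

   which roughly doubles the number of correct bits, hence two iterations for
   SF and three for DF.  When RECP is false the result is multiplied by d at
   the end (sqrt(d) == d * 1/sqrt(d)) and the mask computed earlier forces the
   result for a 0.0 input to 0.0.  */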
8399 typedef rtx (*recpe_type) (rtx, rtx);
8401 /* Select reciprocal initial estimate insn depending on machine mode. */
8403 static recpe_type
8404 get_recpe_type (machine_mode mode)
8406 switch (mode)
8408 case E_SFmode: return (gen_aarch64_frecpesf);
8409 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8410 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8411 case E_DFmode: return (gen_aarch64_frecpedf);
8412 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8413 default: gcc_unreachable ();
8417 typedef rtx (*recps_type) (rtx, rtx, rtx);
8419 /* Select reciprocal series step insn depending on machine mode. */
8421 static recps_type
8422 get_recps_type (machine_mode mode)
8424 switch (mode)
8426 case E_SFmode: return (gen_aarch64_frecpssf);
8427 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8428 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8429 case E_DFmode: return (gen_aarch64_frecpsdf);
8430 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8431 default: gcc_unreachable ();
8435 /* Emit the instruction sequence to compute the approximation for the division
8436 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8438 bool
8439 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8441 machine_mode mode = GET_MODE (quo);
8443 if (GET_MODE_INNER (mode) == HFmode)
8444 return false;
8446 bool use_approx_division_p = (flag_mlow_precision_div
8447 || (aarch64_tune_params.approx_modes->division
8448 & AARCH64_APPROX_MODE (mode)));
8450 if (!flag_finite_math_only
8451 || flag_trapping_math
8452 || !flag_unsafe_math_optimizations
8453 || optimize_function_for_size_p (cfun)
8454 || !use_approx_division_p)
8455 return false;
8457 /* Estimate the approximate reciprocal. */
8458 rtx xrcp = gen_reg_rtx (mode);
8459 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8461 /* Iterate over the series twice for SF and thrice for DF. */
8462 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8464 /* Optionally iterate over the series one time fewer for faster performance,
8465 at the cost of some accuracy. */
8466 if (flag_mlow_precision_div)
8467 iterations--;
8469 /* Iterate over the series to calculate the approximate reciprocal. */
8470 rtx xtmp = gen_reg_rtx (mode);
8471 while (iterations--)
8473 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8475 if (iterations > 0)
8476 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8479 if (num != CONST1_RTX (mode))
8481 /* As the approximate reciprocal of DEN is already calculated, only
8482 calculate the approximate division when NUM is not 1.0. */
8483 rtx xnum = force_reg (mode, num);
8484 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8487 /* Finalize the approximation. */
8488 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8489 return true;
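/* For reference, an illustrative note rather than part of the original
   sequence description: the loop above is the Newton-Raphson refinement of an
   initial 1/den estimate.  FRECPS computes 2 - a * b, so with
   xtmp = FRECPS (x_n, den) each pass performs

     x_(n+1) = x_n * (2 - den * x_n)

   with the final multiply by xtmp folded into the last emit_set_insn.  The
   quotient is then formed as num * (1/den), skipping the extra multiply when
   NUM is 1.0.  */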
8492 /* Return the number of instructions that can be issued per cycle. */
8493 static int
8494 aarch64_sched_issue_rate (void)
8496 return aarch64_tune_params.issue_rate;
8499 static int
8500 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8502 int issue_rate = aarch64_sched_issue_rate ();
8504 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8508 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8509 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8510 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8512 static int
8513 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8514 int ready_index)
8516 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8520 /* Vectorizer cost model target hooks. */
8522 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8523 static int
8524 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8525 tree vectype,
8526 int misalign ATTRIBUTE_UNUSED)
8528 unsigned elements;
8529 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8530 bool fp = false;
8532 if (vectype != NULL)
8533 fp = FLOAT_TYPE_P (vectype);
8535 switch (type_of_cost)
8537 case scalar_stmt:
8538 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8540 case scalar_load:
8541 return costs->scalar_load_cost;
8543 case scalar_store:
8544 return costs->scalar_store_cost;
8546 case vector_stmt:
8547 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8549 case vector_load:
8550 return costs->vec_align_load_cost;
8552 case vector_store:
8553 return costs->vec_store_cost;
8555 case vec_to_scalar:
8556 return costs->vec_to_scalar_cost;
8558 case scalar_to_vec:
8559 return costs->scalar_to_vec_cost;
8561 case unaligned_load:
8562 case vector_gather_load:
8563 return costs->vec_unalign_load_cost;
8565 case unaligned_store:
8566 case vector_scatter_store:
8567 return costs->vec_unalign_store_cost;
8569 case cond_branch_taken:
8570 return costs->cond_taken_branch_cost;
8572 case cond_branch_not_taken:
8573 return costs->cond_not_taken_branch_cost;
8575 case vec_perm:
8576 return costs->vec_permute_cost;
8578 case vec_promote_demote:
8579 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8581 case vec_construct:
8582 elements = TYPE_VECTOR_SUBPARTS (vectype);
8583 return elements / 2 + 1;
8585 default:
8586 gcc_unreachable ();
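/* Illustrative example of the vec_construct rule above (for exposition only):
   the cost scales with the number of lanes, e.g. building a V4SF from scalars
   is costed as 4 / 2 + 1 == 3, and a V16QI as 16 / 2 + 1 == 9.  */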
8590 /* Implement targetm.vectorize.add_stmt_cost. */
8591 static unsigned
8592 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8593 struct _stmt_vec_info *stmt_info, int misalign,
8594 enum vect_cost_model_location where)
8596 unsigned *cost = (unsigned *) data;
8597 unsigned retval = 0;
8599 if (flag_vect_cost_model)
8601 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8602 int stmt_cost =
8603 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8605 /* Statements in an inner loop relative to the loop being
8606 vectorized are weighted more heavily. The value here is
8607 arbitrary and could potentially be improved with analysis. */
8608 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8609 count *= 50; /* FIXME */
8611 retval = (unsigned) (count * stmt_cost);
8612 cost[where] += retval;
8615 return retval;
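/* Illustrative example of the weighting above (for exposition only): with the
   cost model enabled, a single vector_load statement that sits in an inner
   loop relative to the loop being vectorized contributes
   50 * vec_align_load_cost to cost[vect_body], whereas the same statement
   outside the inner loop contributes just vec_align_load_cost.  */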
8618 static void initialize_aarch64_code_model (struct gcc_options *);
8620 /* Parse the TO_PARSE string and put the architecture struct that it
8621 selects into RES and the architectural features into ISA_FLAGS.
8622 Return an aarch64_parse_opt_result describing the parse result.
8623 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8625 static enum aarch64_parse_opt_result
8626 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8627 unsigned long *isa_flags)
8629 char *ext;
8630 const struct processor *arch;
8631 char *str = (char *) alloca (strlen (to_parse) + 1);
8632 size_t len;
8634 strcpy (str, to_parse);
8636 ext = strchr (str, '+');
8638 if (ext != NULL)
8639 len = ext - str;
8640 else
8641 len = strlen (str);
8643 if (len == 0)
8644 return AARCH64_PARSE_MISSING_ARG;
8647 /* Loop through the list of supported ARCHes to find a match. */
8648 for (arch = all_architectures; arch->name != NULL; arch++)
8650 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8652 unsigned long isa_temp = arch->flags;
8654 if (ext != NULL)
8656 /* TO_PARSE string contains at least one extension. */
8657 enum aarch64_parse_opt_result ext_res
8658 = aarch64_parse_extension (ext, &isa_temp);
8660 if (ext_res != AARCH64_PARSE_OK)
8661 return ext_res;
8663 /* Extension parsing was successful. Confirm the result
8664 arch and ISA flags. */
8665 *res = arch;
8666 *isa_flags = isa_temp;
8667 return AARCH64_PARSE_OK;
8671 /* ARCH name not found in list. */
8672 return AARCH64_PARSE_INVALID_ARG;
8675 /* Parse the TO_PARSE string and put the CPU it selects into RES and the
8676 architectural flags into ISA_FLAGS. Return an aarch64_parse_opt_result
8677 describing the parse result. If there is an error parsing, RES and
8678 ISA_FLAGS are left unchanged. */
8680 static enum aarch64_parse_opt_result
8681 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8682 unsigned long *isa_flags)
8684 char *ext;
8685 const struct processor *cpu;
8686 char *str = (char *) alloca (strlen (to_parse) + 1);
8687 size_t len;
8689 strcpy (str, to_parse);
8691 ext = strchr (str, '+');
8693 if (ext != NULL)
8694 len = ext - str;
8695 else
8696 len = strlen (str);
8698 if (len == 0)
8699 return AARCH64_PARSE_MISSING_ARG;
8702 /* Loop through the list of supported CPUs to find a match. */
8703 for (cpu = all_cores; cpu->name != NULL; cpu++)
8705 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8707 unsigned long isa_temp = cpu->flags;
8710 if (ext != NULL)
8712 /* TO_PARSE string contains at least one extension. */
8713 enum aarch64_parse_opt_result ext_res
8714 = aarch64_parse_extension (ext, &isa_temp);
8716 if (ext_res != AARCH64_PARSE_OK)
8717 return ext_res;
8719 /* Extension parsing was successful. Confirm the result
8720 cpu and ISA flags. */
8721 *res = cpu;
8722 *isa_flags = isa_temp;
8723 return AARCH64_PARSE_OK;
8727 /* CPU name not found in list. */
8728 return AARCH64_PARSE_INVALID_ARG;
8731 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8732 Return an aarch64_parse_opt_result describing the parse result.
8733 If the parsing fails, RES does not change. */
8735 static enum aarch64_parse_opt_result
8736 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8738 const struct processor *cpu;
8739 char *str = (char *) alloca (strlen (to_parse) + 1);
8741 strcpy (str, to_parse);
8743 /* Loop through the list of supported CPUs to find a match. */
8744 for (cpu = all_cores; cpu->name != NULL; cpu++)
8746 if (strcmp (cpu->name, str) == 0)
8748 *res = cpu;
8749 return AARCH64_PARSE_OK;
8753 /* CPU name not found in list. */
8754 return AARCH64_PARSE_INVALID_ARG;
8757 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8758 described in FLAG. If it is, return the index bit for that fusion type.
8759 If not, error (printing OPTION_NAME) and return zero. */
8761 static unsigned int
8762 aarch64_parse_one_option_token (const char *token,
8763 size_t length,
8764 const struct aarch64_flag_desc *flag,
8765 const char *option_name)
8767 for (; flag->name != NULL; flag++)
8769 if (length == strlen (flag->name)
8770 && !strncmp (flag->name, token, length))
8771 return flag->flag;
8774 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8775 return 0;
8778 /* Parse OPTION which is a comma-separated list of flags to enable.
8779 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8780 default state we inherit from the CPU tuning structures. OPTION_NAME
8781 gives the top-level option we are parsing in the -moverride string,
8782 for use in error messages. */
8784 static unsigned int
8785 aarch64_parse_boolean_options (const char *option,
8786 const struct aarch64_flag_desc *flags,
8787 unsigned int initial_state,
8788 const char *option_name)
8790 const char separator = '.';
8791 const char* specs = option;
8792 const char* ntoken = option;
8793 unsigned int found_flags = initial_state;
8795 while ((ntoken = strchr (specs, separator)))
8797 size_t token_length = ntoken - specs;
8798 unsigned token_ops = aarch64_parse_one_option_token (specs,
8799 token_length,
8800 flags,
8801 option_name);
8802 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8803 in the token stream, reset the supported operations. So:
8805 adrp+add.cmp+branch.none.adrp+add
8807 would have the result of turning on only adrp+add fusion. */
8808 if (!token_ops)
8809 found_flags = 0;
8811 found_flags |= token_ops;
8812 specs = ++ntoken;
8815 /* We ended with a trailing separator; the string is ill-formed. */
8816 if (!(*specs))
8818 error ("%s string ill-formed\n", option_name);
8819 return 0;
8822 /* We still have one more token to parse. */
8823 size_t token_length = strlen (specs);
8824 unsigned token_ops = aarch64_parse_one_option_token (specs,
8825 token_length,
8826 flags,
8827 option_name);
8828 if (!token_ops)
8829 found_flags = 0;
8831 found_flags |= token_ops;
8832 return found_flags;
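/* Illustrative walk-through of the function above (for exposition only):
   parsing "adrp+add.cmp+branch.none.adrp+add" against the fusion flags
   proceeds token by token; found_flags accumulates adrp+add and cmp+branch,
   is reset to zero by "none", and finishes with only adrp+add set, matching
   the behaviour described in the comment inside the loop.  */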
8835 /* Support for overriding instruction fusion. */
8837 static void
8838 aarch64_parse_fuse_string (const char *fuse_string,
8839 struct tune_params *tune)
8841 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8842 aarch64_fusible_pairs,
8843 tune->fusible_ops,
8844 "fuse=");
8847 /* Support for overriding other tuning flags. */
8849 static void
8850 aarch64_parse_tune_string (const char *tune_string,
8851 struct tune_params *tune)
8853 tune->extra_tuning_flags
8854 = aarch64_parse_boolean_options (tune_string,
8855 aarch64_tuning_flags,
8856 tune->extra_tuning_flags,
8857 "tune=");
8860 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8861 we understand. If it is, extract the option string and hand off to
8862 the appropriate function. */
8864 void
8865 aarch64_parse_one_override_token (const char* token,
8866 size_t length,
8867 struct tune_params *tune)
8869 const struct aarch64_tuning_override_function *fn
8870 = aarch64_tuning_override_functions;
8872 const char *option_part = strchr (token, '=');
8873 if (!option_part)
8875 error ("tuning string missing in option (%s)", token);
8876 return;
8879 /* Get the length of the option name. */
8880 length = option_part - token;
8881 /* Skip the '=' to get to the option string. */
8882 option_part++;
8884 for (; fn->name != NULL; fn++)
8886 if (!strncmp (fn->name, token, length))
8888 fn->parse_override (option_part, tune);
8889 return;
8893 error ("unknown tuning option (%s)", token);
8894 return;
8897 /* Validate and clamp the requested TLS size against the selected code model. */
8899 static void
8900 initialize_aarch64_tls_size (struct gcc_options *opts)
8902 if (aarch64_tls_size == 0)
8903 aarch64_tls_size = 24;
8905 switch (opts->x_aarch64_cmodel_var)
8907 case AARCH64_CMODEL_TINY:
8908 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
8909 needs two instructions to address, so we clamp the size to 24. */
8910 if (aarch64_tls_size > 24)
8911 aarch64_tls_size = 24;
8912 break;
8913 case AARCH64_CMODEL_SMALL:
8914 /* The maximum TLS size allowed under small is 4G. */
8915 if (aarch64_tls_size > 32)
8916 aarch64_tls_size = 32;
8917 break;
8918 case AARCH64_CMODEL_LARGE:
8919 /* The maximum TLS size allowed under large is 16E.
8920 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
8921 if (aarch64_tls_size > 48)
8922 aarch64_tls_size = 48;
8923 break;
8924 default:
8925 gcc_unreachable ();
8928 return;
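/* Illustrative example of the clamping above (for exposition only): with
   -mcmodel=tiny a request of -mtls-size=32 is clamped to 24 here, and with
   -mcmodel=small a request of -mtls-size=48 is clamped to 32.  */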
8931 /* Parse STRING looking for options in the format:
8932 string :: option:string
8933 option :: name=substring
8934 name :: {a-z}
8935 substring :: defined by option. */
8937 static void
8938 aarch64_parse_override_string (const char* input_string,
8939 struct tune_params* tune)
8941 const char separator = ':';
8942 size_t string_length = strlen (input_string) + 1;
8943 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8944 char *string = string_root;
8945 strncpy (string, input_string, string_length);
8946 string[string_length - 1] = '\0';
8948 char* ntoken = string;
8950 while ((ntoken = strchr (string, separator)))
8952 size_t token_length = ntoken - string;
8953 /* Make this substring look like a string. */
8954 *ntoken = '\0';
8955 aarch64_parse_one_override_token (string, token_length, tune);
8956 string = ++ntoken;
8959 /* One last option to parse. */
8960 aarch64_parse_one_override_token (string, strlen (string), tune);
8961 free (string_root);
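/* Illustrative example of the format parsed above (for exposition only): an
   option such as

     -moverride=fuse=adrp+add.cmp+branch:tune=rename_fma_regs

   is split on ':' into the tokens "fuse=adrp+add.cmp+branch" and
   "tune=rename_fma_regs", each handed to aarch64_parse_one_override_token.
   The tuning-flag name here is purely for illustration; the accepted names
   come from aarch64_tuning_flags and aarch64_fusible_pairs.  */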
8965 static void
8966 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8968 /* PR 70044: We have to be careful about being called multiple times for the
8969 same function. This means all changes should be repeatable. */
8971 /* If the frame pointer is enabled, set it to a special value that behaves
8972 similarly to frame pointer omission. If we don't do this, all leaf functions
8973 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
8974 If flag_omit_frame_pointer has this special value, we must force the
8975 frame pointer if not in a leaf function. We also need to force it in a
8976 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
8977 if (opts->x_flag_omit_frame_pointer == 0)
8978 opts->x_flag_omit_frame_pointer = 2;
8980 /* If not optimizing for size, set the default
8981 alignment to what the target wants. */
8982 if (!opts->x_optimize_size)
8984 if (opts->x_align_loops <= 0)
8985 opts->x_align_loops = aarch64_tune_params.loop_align;
8986 if (opts->x_align_jumps <= 0)
8987 opts->x_align_jumps = aarch64_tune_params.jump_align;
8988 if (opts->x_align_functions <= 0)
8989 opts->x_align_functions = aarch64_tune_params.function_align;
8992 /* We default to no pc-relative literal loads. */
8994 aarch64_pcrelative_literal_loads = false;
8996 /* If -mpc-relative-literal-loads is set on the command line, this
8997 implies that the user asked for PC relative literal loads. */
8998 if (opts->x_pcrelative_literal_loads == 1)
8999 aarch64_pcrelative_literal_loads = true;
9001 /* In the tiny memory model it makes no sense to disallow PC relative
9002 literal pool loads. */
9003 if (aarch64_cmodel == AARCH64_CMODEL_TINY
9004 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9005 aarch64_pcrelative_literal_loads = true;
9007 /* When enabling the lower precision Newton series for the square root, also
9008 enable it for the reciprocal square root, since the latter is an
9009 intermediary step for the former. */
9010 if (flag_mlow_precision_sqrt)
9011 flag_mrecip_low_precision_sqrt = true;
9014 /* 'Unpack' the internal tuning structs and update the options
9015 in OPTS. The caller must have set up selected_tune and selected_arch
9016 as all the other target-specific codegen decisions are
9017 derived from them. */
9019 void
9020 aarch64_override_options_internal (struct gcc_options *opts)
9022 aarch64_tune_flags = selected_tune->flags;
9023 aarch64_tune = selected_tune->sched_core;
9024 /* Make a copy of the tuning parameters attached to the core, which
9025 we may later overwrite. */
9026 aarch64_tune_params = *(selected_tune->tune);
9027 aarch64_architecture_version = selected_arch->architecture_version;
9029 if (opts->x_aarch64_override_tune_string)
9030 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9031 &aarch64_tune_params);
9033 /* This target defaults to strict volatile bitfields. */
9034 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9035 opts->x_flag_strict_volatile_bitfields = 1;
9037 initialize_aarch64_code_model (opts);
9038 initialize_aarch64_tls_size (opts);
9040 int queue_depth = 0;
9041 switch (aarch64_tune_params.autoprefetcher_model)
9043 case tune_params::AUTOPREFETCHER_OFF:
9044 queue_depth = -1;
9045 break;
9046 case tune_params::AUTOPREFETCHER_WEAK:
9047 queue_depth = 0;
9048 break;
9049 case tune_params::AUTOPREFETCHER_STRONG:
9050 queue_depth = max_insn_queue_index + 1;
9051 break;
9052 default:
9053 gcc_unreachable ();
9056 /* We don't mind passing in global_options_set here as we don't use
9057 the *options_set structs anyway. */
9058 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9059 queue_depth,
9060 opts->x_param_values,
9061 global_options_set.x_param_values);
9063 /* Set up parameters to be used in prefetching algorithm. Do not
9064 override the defaults unless we are tuning for a core we have
9065 researched values for. */
9066 if (aarch64_tune_params.prefetch->num_slots > 0)
9067 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9068 aarch64_tune_params.prefetch->num_slots,
9069 opts->x_param_values,
9070 global_options_set.x_param_values);
9071 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9072 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9073 aarch64_tune_params.prefetch->l1_cache_size,
9074 opts->x_param_values,
9075 global_options_set.x_param_values);
9076 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9077 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9078 aarch64_tune_params.prefetch->l1_cache_line_size,
9079 opts->x_param_values,
9080 global_options_set.x_param_values);
9081 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9082 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9083 aarch64_tune_params.prefetch->l2_cache_size,
9084 opts->x_param_values,
9085 global_options_set.x_param_values);
9087 /* Use the alternative scheduling-pressure algorithm by default. */
9088 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
9089 opts->x_param_values,
9090 global_options_set.x_param_values);
9092 /* Enable software prefetching at the specified optimization level for
9093 CPUs that have prefetch tuning data. Lower the optimization level threshold by 1
9094 when profiling is enabled. */
9095 if (opts->x_flag_prefetch_loop_arrays < 0
9096 && !opts->x_optimize_size
9097 && aarch64_tune_params.prefetch->default_opt_level >= 0
9098 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9099 opts->x_flag_prefetch_loop_arrays = 1;
9101 aarch64_override_options_after_change_1 (opts);
9104 /* Print a hint with a suggestion for a core or architecture name that
9105 most closely resembles what the user passed in STR. ARCH is true if
9106 the user is asking for an architecture name. ARCH is false if the user
9107 is asking for a core name. */
9109 static void
9110 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9112 auto_vec<const char *> candidates;
9113 const struct processor *entry = arch ? all_architectures : all_cores;
9114 for (; entry->name != NULL; entry++)
9115 candidates.safe_push (entry->name);
9116 char *s;
9117 const char *hint = candidates_list_and_hint (str, s, candidates);
9118 if (hint)
9119 inform (input_location, "valid arguments are: %s;"
9120 " did you mean %qs?", s, hint);
9121 XDELETEVEC (s);
9124 /* Print a hint with a suggestion for a core name that most closely resembles
9125 what the user passed in STR. */
9127 inline static void
9128 aarch64_print_hint_for_core (const char *str)
9130 aarch64_print_hint_for_core_or_arch (str, false);
9133 /* Print a hint with a suggestion for an architecture name that most closely
9134 resembles what the user passed in STR. */
9136 inline static void
9137 aarch64_print_hint_for_arch (const char *str)
9139 aarch64_print_hint_for_core_or_arch (str, true);
9142 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9143 specified in STR and throw errors if appropriate. Put the results if
9144 they are valid in RES and ISA_FLAGS. Return whether the option is
9145 valid. */
9147 static bool
9148 aarch64_validate_mcpu (const char *str, const struct processor **res,
9149 unsigned long *isa_flags)
9151 enum aarch64_parse_opt_result parse_res
9152 = aarch64_parse_cpu (str, res, isa_flags);
9154 if (parse_res == AARCH64_PARSE_OK)
9155 return true;
9157 switch (parse_res)
9159 case AARCH64_PARSE_MISSING_ARG:
9160 error ("missing cpu name in %<-mcpu=%s%>", str);
9161 break;
9162 case AARCH64_PARSE_INVALID_ARG:
9163 error ("unknown value %qs for -mcpu", str);
9164 aarch64_print_hint_for_core (str);
9165 break;
9166 case AARCH64_PARSE_INVALID_FEATURE:
9167 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9168 break;
9169 default:
9170 gcc_unreachable ();
9173 return false;
9176 /* Validate a command-line -march option. Parse the arch and extensions
9177 (if any) specified in STR and throw errors if appropriate. Put the
9178 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9179 option is valid. */
9181 static bool
9182 aarch64_validate_march (const char *str, const struct processor **res,
9183 unsigned long *isa_flags)
9185 enum aarch64_parse_opt_result parse_res
9186 = aarch64_parse_arch (str, res, isa_flags);
9188 if (parse_res == AARCH64_PARSE_OK)
9189 return true;
9191 switch (parse_res)
9193 case AARCH64_PARSE_MISSING_ARG:
9194 error ("missing arch name in %<-march=%s%>", str);
9195 break;
9196 case AARCH64_PARSE_INVALID_ARG:
9197 error ("unknown value %qs for -march", str);
9198 aarch64_print_hint_for_arch (str);
9199 break;
9200 case AARCH64_PARSE_INVALID_FEATURE:
9201 error ("invalid feature modifier in %<-march=%s%>", str);
9202 break;
9203 default:
9204 gcc_unreachable ();
9207 return false;
9210 /* Validate a command-line -mtune option. Parse the cpu
9211 specified in STR and throw errors if appropriate. Put the
9212 result, if it is valid, in RES. Return whether the option is
9213 valid. */
9215 static bool
9216 aarch64_validate_mtune (const char *str, const struct processor **res)
9218 enum aarch64_parse_opt_result parse_res
9219 = aarch64_parse_tune (str, res);
9221 if (parse_res == AARCH64_PARSE_OK)
9222 return true;
9224 switch (parse_res)
9226 case AARCH64_PARSE_MISSING_ARG:
9227 error ("missing cpu name in %<-mtune=%s%>", str);
9228 break;
9229 case AARCH64_PARSE_INVALID_ARG:
9230 error ("unknown value %qs for -mtune", str);
9231 aarch64_print_hint_for_core (str);
9232 break;
9233 default:
9234 gcc_unreachable ();
9236 return false;
9239 /* Return the CPU corresponding to the enum CPU.
9240 If it doesn't specify a cpu, return the default. */
9242 static const struct processor *
9243 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9245 if (cpu != aarch64_none)
9246 return &all_cores[cpu];
9248 /* The & 0x3f is to extract the bottom 6 bits that encode the
9249 default cpu as selected by the --with-cpu GCC configure option
9250 in config.gcc.
9251 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9252 flags mechanism should be reworked to make it more sane. */
9253 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9256 /* Return the architecture corresponding to the enum ARCH.
9257 If it doesn't specify a valid architecture, return the default. */
9259 static const struct processor *
9260 aarch64_get_arch (enum aarch64_arch arch)
9262 if (arch != aarch64_no_arch)
9263 return &all_architectures[arch];
9265 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9267 return &all_architectures[cpu->arch];
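/* A note inferred from the code above and from aarch64_override_options
   below (not an authoritative description): TARGET_CPU_DEFAULT packs the
   configure-time default so that the bottom 6 bits (extracted with & 0x3f)
   give the index of the default CPU in all_cores, while the remaining bits
   (extracted with >> 6) give the default ISA flags.  */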
9270 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9271 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9272 tuning structs. In particular it must set selected_tune and
9273 aarch64_isa_flags that define the available ISA features and tuning
9274 decisions. It must also set selected_arch as this will be used to
9275 output the .arch asm tags for each function. */
9277 static void
9278 aarch64_override_options (void)
9280 unsigned long cpu_isa = 0;
9281 unsigned long arch_isa = 0;
9282 aarch64_isa_flags = 0;
9284 bool valid_cpu = true;
9285 bool valid_tune = true;
9286 bool valid_arch = true;
9288 selected_cpu = NULL;
9289 selected_arch = NULL;
9290 selected_tune = NULL;
9292 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9293 If either of -march or -mtune is given, they override their
9294 respective component of -mcpu. */
9295 if (aarch64_cpu_string)
9296 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9297 &cpu_isa);
9299 if (aarch64_arch_string)
9300 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9301 &arch_isa);
9303 if (aarch64_tune_string)
9304 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9306 /* If the user did not specify a processor, choose the default
9307 one for them. This will be the CPU set during configuration using
9308 --with-cpu, otherwise it is "generic". */
9309 if (!selected_cpu)
9311 if (selected_arch)
9313 selected_cpu = &all_cores[selected_arch->ident];
9314 aarch64_isa_flags = arch_isa;
9315 explicit_arch = selected_arch->arch;
9317 else
9319 /* Get default configure-time CPU. */
9320 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9321 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9324 if (selected_tune)
9325 explicit_tune_core = selected_tune->ident;
9327 /* If both -mcpu and -march are specified check that they are architecturally
9328 compatible, warn if they're not and prefer the -march ISA flags. */
9329 else if (selected_arch)
9331 if (selected_arch->arch != selected_cpu->arch)
9333 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9334 all_architectures[selected_cpu->arch].name,
9335 selected_arch->name);
9337 aarch64_isa_flags = arch_isa;
9338 explicit_arch = selected_arch->arch;
9339 explicit_tune_core = selected_tune ? selected_tune->ident
9340 : selected_cpu->ident;
9342 else
9344 /* -mcpu but no -march. */
9345 aarch64_isa_flags = cpu_isa;
9346 explicit_tune_core = selected_tune ? selected_tune->ident
9347 : selected_cpu->ident;
9348 gcc_assert (selected_cpu);
9349 selected_arch = &all_architectures[selected_cpu->arch];
9350 explicit_arch = selected_arch->arch;
9353 /* Set the arch as well, as we will need it when outputting
9354 the .arch directive in assembly. */
9355 if (!selected_arch)
9357 gcc_assert (selected_cpu);
9358 selected_arch = &all_architectures[selected_cpu->arch];
9361 if (!selected_tune)
9362 selected_tune = selected_cpu;
9364 #ifndef HAVE_AS_MABI_OPTION
9365 /* The compiler may have been configured with 2.23.* binutils, which does
9366 not have support for ILP32. */
9367 if (TARGET_ILP32)
9368 error ("Assembler does not support -mabi=ilp32");
9369 #endif
9371 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9372 sorry ("Return address signing is only supported for -mabi=lp64");
9374 /* Make sure we properly set up the explicit options. */
9375 if ((aarch64_cpu_string && valid_cpu)
9376 || (aarch64_tune_string && valid_tune))
9377 gcc_assert (explicit_tune_core != aarch64_none);
9379 if ((aarch64_cpu_string && valid_cpu)
9380 || (aarch64_arch_string && valid_arch))
9381 gcc_assert (explicit_arch != aarch64_no_arch);
9383 aarch64_override_options_internal (&global_options);
9385 /* Save these options as the default ones in case we push and pop them later
9386 while processing functions with potential target attributes. */
9387 target_option_default_node = target_option_current_node
9388 = build_target_option_node (&global_options);
9391 /* Implement targetm.override_options_after_change. */
9393 static void
9394 aarch64_override_options_after_change (void)
9396 aarch64_override_options_after_change_1 (&global_options);
9399 static struct machine_function *
9400 aarch64_init_machine_status (void)
9402 struct machine_function *machine;
9403 machine = ggc_cleared_alloc<machine_function> ();
9404 return machine;
9407 void
9408 aarch64_init_expanders (void)
9410 init_machine_status = aarch64_init_machine_status;
9413 /* A checking mechanism for the implementation of the various code models. */
9414 static void
9415 initialize_aarch64_code_model (struct gcc_options *opts)
9417 if (opts->x_flag_pic)
9419 switch (opts->x_aarch64_cmodel_var)
9421 case AARCH64_CMODEL_TINY:
9422 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9423 break;
9424 case AARCH64_CMODEL_SMALL:
9425 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9426 aarch64_cmodel = (flag_pic == 2
9427 ? AARCH64_CMODEL_SMALL_PIC
9428 : AARCH64_CMODEL_SMALL_SPIC);
9429 #else
9430 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9431 #endif
9432 break;
9433 case AARCH64_CMODEL_LARGE:
9434 sorry ("code model %qs with -f%s", "large",
9435 opts->x_flag_pic > 1 ? "PIC" : "pic");
9436 break;
9437 default:
9438 gcc_unreachable ();
9441 else
9442 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9445 /* Implement TARGET_OPTION_SAVE. */
9447 static void
9448 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9450 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9453 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9454 using the information saved in PTR. */
9456 static void
9457 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9459 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9460 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9461 opts->x_explicit_arch = ptr->x_explicit_arch;
9462 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9463 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9465 aarch64_override_options_internal (opts);
9468 /* Implement TARGET_OPTION_PRINT. */
9470 static void
9471 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9473 const struct processor *cpu
9474 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9475 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9476 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9477 std::string extension
9478 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9480 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9481 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9482 arch->name, extension.c_str ());
9485 static GTY(()) tree aarch64_previous_fndecl;
9487 void
9488 aarch64_reset_previous_fndecl (void)
9490 aarch64_previous_fndecl = NULL;
9493 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9494 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9495 make sure optab availability predicates are recomputed when necessary. */
9497 void
9498 aarch64_save_restore_target_globals (tree new_tree)
9500 if (TREE_TARGET_GLOBALS (new_tree))
9501 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9502 else if (new_tree == target_option_default_node)
9503 restore_target_globals (&default_target_globals);
9504 else
9505 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9508 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9509 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9510 of the function, if such exists. This function may be called multiple
9511 times on a single function so use aarch64_previous_fndecl to avoid
9512 setting up identical state. */
9514 static void
9515 aarch64_set_current_function (tree fndecl)
9517 if (!fndecl || fndecl == aarch64_previous_fndecl)
9518 return;
9520 tree old_tree = (aarch64_previous_fndecl
9521 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9522 : NULL_TREE);
9524 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9526 /* If current function has no attributes but the previous one did,
9527 use the default node. */
9528 if (!new_tree && old_tree)
9529 new_tree = target_option_default_node;
9531 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9532 the default have been handled by aarch64_save_restore_target_globals from
9533 aarch64_pragma_target_parse. */
9534 if (old_tree == new_tree)
9535 return;
9537 aarch64_previous_fndecl = fndecl;
9539 /* First set the target options. */
9540 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9542 aarch64_save_restore_target_globals (new_tree);
9545 /* Enum describing the various ways we can handle attributes.
9546 In many cases we can reuse the generic option handling machinery. */
9548 enum aarch64_attr_opt_type
9550 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9551 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9552 aarch64_attr_enum, /* Attribute sets an enum variable. */
9553 aarch64_attr_custom /* Attribute requires a custom handling function. */
9556 /* All the information needed to handle a target attribute.
9557 NAME is the name of the attribute.
9558 ATTR_TYPE specifies the type of behavior of the attribute as described
9559 in the definition of enum aarch64_attr_opt_type.
9560 ALLOW_NEG is true if the attribute supports a "no-" form.
9561 HANDLER is the function that takes the attribute string as an argument.
9562 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
9563 OPT_NUM is the enum specifying the option that the attribute modifies.
9564 This is needed for attributes that mirror the behavior of a command-line
9565 option; that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9566 aarch64_attr_enum. */
9568 struct aarch64_attribute_info
9570 const char *name;
9571 enum aarch64_attr_opt_type attr_type;
9572 bool allow_neg;
9573 bool (*handler) (const char *);
9574 enum opt_code opt_num;
9577 /* Handle the ARCH_STR argument to the arch= target attribute. */
9579 static bool
9580 aarch64_handle_attr_arch (const char *str)
9582 const struct processor *tmp_arch = NULL;
9583 enum aarch64_parse_opt_result parse_res
9584 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9586 if (parse_res == AARCH64_PARSE_OK)
9588 gcc_assert (tmp_arch);
9589 selected_arch = tmp_arch;
9590 explicit_arch = selected_arch->arch;
9591 return true;
9594 switch (parse_res)
9596 case AARCH64_PARSE_MISSING_ARG:
9597 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
9598 break;
9599 case AARCH64_PARSE_INVALID_ARG:
9600 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
9601 aarch64_print_hint_for_arch (str);
9602 break;
9603 case AARCH64_PARSE_INVALID_FEATURE:
9604 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
9605 break;
9606 default:
9607 gcc_unreachable ();
9610 return false;
9613 /* Handle the argument CPU_STR to the cpu= target attribute. */
9615 static bool
9616 aarch64_handle_attr_cpu (const char *str)
9618 const struct processor *tmp_cpu = NULL;
9619 enum aarch64_parse_opt_result parse_res
9620 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9622 if (parse_res == AARCH64_PARSE_OK)
9624 gcc_assert (tmp_cpu);
9625 selected_tune = tmp_cpu;
9626 explicit_tune_core = selected_tune->ident;
9628 selected_arch = &all_architectures[tmp_cpu->arch];
9629 explicit_arch = selected_arch->arch;
9630 return true;
9633 switch (parse_res)
9635 case AARCH64_PARSE_MISSING_ARG:
9636 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
9637 break;
9638 case AARCH64_PARSE_INVALID_ARG:
9639 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
9640 aarch64_print_hint_for_core (str);
9641 break;
9642 case AARCH64_PARSE_INVALID_FEATURE:
9643 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
9644 break;
9645 default:
9646 gcc_unreachable ();
9649 return false;
9652 /* Handle the argument STR to the tune= target attribute. */
9654 static bool
9655 aarch64_handle_attr_tune (const char *str)
9657 const struct processor *tmp_tune = NULL;
9658 enum aarch64_parse_opt_result parse_res
9659 = aarch64_parse_tune (str, &tmp_tune);
9661 if (parse_res == AARCH64_PARSE_OK)
9663 gcc_assert (tmp_tune);
9664 selected_tune = tmp_tune;
9665 explicit_tune_core = selected_tune->ident;
9666 return true;
9669 switch (parse_res)
9671 case AARCH64_PARSE_INVALID_ARG:
9672 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
9673 aarch64_print_hint_for_core (str);
9674 break;
9675 default:
9676 gcc_unreachable ();
9679 return false;
9682 /* Parse an architecture extensions target attribute string specified in STR.
9683 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9684 if successful. Update aarch64_isa_flags to reflect the ISA features
9685 modified. */
9687 static bool
9688 aarch64_handle_attr_isa_flags (char *str)
9690 enum aarch64_parse_opt_result parse_res;
9691 unsigned long isa_flags = aarch64_isa_flags;
9693 /* We allow "+nothing" in the beginning to clear out all architectural
9694 features if the user wants to handpick specific features. */
9695 if (strncmp ("+nothing", str, 8) == 0)
9697 isa_flags = 0;
9698 str += 8;
9701 parse_res = aarch64_parse_extension (str, &isa_flags);
9703 if (parse_res == AARCH64_PARSE_OK)
9705 aarch64_isa_flags = isa_flags;
9706 return true;
9709 switch (parse_res)
9711 case AARCH64_PARSE_MISSING_ARG:
9712 error ("missing value in %<target()%> pragma or attribute");
9713 break;
9715 case AARCH64_PARSE_INVALID_FEATURE:
9716 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
9717 break;
9719 default:
9720 gcc_unreachable ();
9723 return false;
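/* Illustrative example of the handling above (for exposition only): an
   attribute string such as "+nothing+simd" first clears isa_flags via the
   "+nothing" prefix and then re-enables only the named features, leaving the
   function with just the ISA flags implied by +simd.  */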
9726 /* The target attributes that we support. On top of these we also support just
9727 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9728 handled explicitly in aarch64_process_one_target_attr. */
9730 static const struct aarch64_attribute_info aarch64_attributes[] =
9732 { "general-regs-only", aarch64_attr_mask, false, NULL,
9733 OPT_mgeneral_regs_only },
9734 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9735 OPT_mfix_cortex_a53_835769 },
9736 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9737 OPT_mfix_cortex_a53_843419 },
9738 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9739 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9740 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9741 OPT_momit_leaf_frame_pointer },
9742 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9743 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9744 OPT_march_ },
9745 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9746 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9747 OPT_mtune_ },
9748 { "sign-return-address", aarch64_attr_enum, false, NULL,
9749 OPT_msign_return_address_ },
9750 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9753 /* Parse ARG_STR which contains the definition of one target attribute.
9754 Show appropriate errors, if any, and return true if the attribute is valid. */
9756 static bool
9757 aarch64_process_one_target_attr (char *arg_str)
9759 bool invert = false;
9761 size_t len = strlen (arg_str);
9763 if (len == 0)
9765 error ("malformed %<target()%> pragma or attribute");
9766 return false;
9769 char *str_to_check = (char *) alloca (len + 1);
9770 strcpy (str_to_check, arg_str);
9772 /* Skip leading whitespace. */
9773 while (*str_to_check == ' ' || *str_to_check == '\t')
9774 str_to_check++;
9776 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9777 It is easier to detect and handle it explicitly here rather than going
9778 through the machinery for the rest of the target attributes in this
9779 function. */
9780 if (*str_to_check == '+')
9781 return aarch64_handle_attr_isa_flags (str_to_check);
9783 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9785 invert = true;
9786 str_to_check += 3;
9788 char *arg = strchr (str_to_check, '=');
9790 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9791 and point ARG to "foo". */
9792 if (arg)
9794 *arg = '\0';
9795 arg++;
9797 const struct aarch64_attribute_info *p_attr;
9798 bool found = false;
9799 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9801 /* If the names don't match up, or the user has given an argument
9802 to an attribute that doesn't accept one, or didn't give an argument
9803 to an attribute that expects one, fail to match. */
9804 if (strcmp (str_to_check, p_attr->name) != 0)
9805 continue;
9807 found = true;
9808 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9809 || p_attr->attr_type == aarch64_attr_enum;
9811 if (attr_need_arg_p ^ (arg != NULL))
9813 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
9814 return false;
9817 /* If the name matches but the attribute does not allow "no-" versions
9818 then we can't match. */
9819 if (invert && !p_attr->allow_neg)
9821 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
9822 return false;
9825 switch (p_attr->attr_type)
9827 /* Has a custom handler registered.
9828 For example, cpu=, arch=, tune=. */
9829 case aarch64_attr_custom:
9830 gcc_assert (p_attr->handler);
9831 if (!p_attr->handler (arg))
9832 return false;
9833 break;
9835 /* Either set or unset a boolean option. */
9836 case aarch64_attr_bool:
9838 struct cl_decoded_option decoded;
9840 generate_option (p_attr->opt_num, NULL, !invert,
9841 CL_TARGET, &decoded);
9842 aarch64_handle_option (&global_options, &global_options_set,
9843 &decoded, input_location);
9844 break;
9846 /* Set or unset a bit in the target_flags. aarch64_handle_option
9847 should know what mask to apply given the option number. */
9848 case aarch64_attr_mask:
9850 struct cl_decoded_option decoded;
9851 /* We only need to specify the option number.
9852 aarch64_handle_option will know which mask to apply. */
9853 decoded.opt_index = p_attr->opt_num;
9854 decoded.value = !invert;
9855 aarch64_handle_option (&global_options, &global_options_set,
9856 &decoded, input_location);
9857 break;
9859 /* Use the option setting machinery to set an option to an enum. */
9860 case aarch64_attr_enum:
9862 gcc_assert (arg);
9863 bool valid;
9864 int value;
9865 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9866 &value, CL_TARGET);
9867 if (valid)
9869 set_option (&global_options, NULL, p_attr->opt_num, value,
9870 NULL, DK_UNSPECIFIED, input_location,
9871 global_dc);
9873 else
9875 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
9877 break;
9879 default:
9880 gcc_unreachable ();
9884 /* If we reached here we either have found an attribute and validated
9885 it or didn't match any. If we matched an attribute but its arguments
9886 were malformed we will have returned false already. */
9887 return found;
9890 /* Count how many times the character C appears in
9891 NULL-terminated string STR. */
9893 static unsigned int
9894 num_occurences_in_str (char c, char *str)
9896 unsigned int res = 0;
9897 while (*str != '\0')
9899 if (*str == c)
9900 res++;
9902 str++;
9905 return res;
9908 /* Parse the tree in ARGS that contains the target attribute information
9909 and update the global target options space. */
9911 bool
9912 aarch64_process_target_attr (tree args)
9914 if (TREE_CODE (args) == TREE_LIST)
9918 tree head = TREE_VALUE (args);
9919 if (head)
9921 if (!aarch64_process_target_attr (head))
9922 return false;
9924 args = TREE_CHAIN (args);
9925 } while (args);
9927 return true;
9930 if (TREE_CODE (args) != STRING_CST)
9932 error ("attribute %<target%> argument not a string");
9933 return false;
9936 size_t len = strlen (TREE_STRING_POINTER (args));
9937 char *str_to_check = (char *) alloca (len + 1);
9938 strcpy (str_to_check, TREE_STRING_POINTER (args));
9940 if (len == 0)
9942 error ("malformed %<target()%> pragma or attribute");
9943 return false;
9946 /* Used to catch empty strings between commas, e.g.
9947 attribute ((target ("attr1,,attr2"))). */
9948 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9950 /* Handle multiple target attributes separated by ','. */
9951 char *token = strtok (str_to_check, ",");
9953 unsigned int num_attrs = 0;
9954 while (token)
9956 num_attrs++;
9957 if (!aarch64_process_one_target_attr (token))
9959 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
9960 return false;
9963 token = strtok (NULL, ",");
9966 if (num_attrs != num_commas + 1)
9968 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
9969 return false;
9972 return true;
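/* Illustrative example of the processing above (for exposition only): an
   attribute such as
   __attribute__ ((target ("arch=armv8-a,no-omit-leaf-frame-pointer")))
   is split on ',' by the strtok loop into two tokens, each processed by
   aarch64_process_one_target_attr, while the num_commas check rejects
   ill-formed strings with empty tokens such as "attr1,,attr2".  */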
9975 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9976 process attribute ((target ("..."))). */
9978 static bool
9979 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9981 struct cl_target_option cur_target;
9982 bool ret;
9983 tree old_optimize;
9984 tree new_target, new_optimize;
9985 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9987 /* If what we're processing is the current pragma string then the
9988 target option node is already stored in target_option_current_node
9989 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9990 having to re-parse the string. This is especially useful to keep
9991 arm_neon.h compile times down since that header contains a lot
9992 of intrinsics enclosed in pragmas. */
9993 if (!existing_target && args == current_target_pragma)
9995 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9996 return true;
9998 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10000 old_optimize = build_optimization_node (&global_options);
10001 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10003 /* If the function changed the optimization levels as well as setting
10004 target options, start with the optimizations specified. */
10005 if (func_optimize && func_optimize != old_optimize)
10006 cl_optimization_restore (&global_options,
10007 TREE_OPTIMIZATION (func_optimize));
10009 /* Save the current target options to restore at the end. */
10010 cl_target_option_save (&cur_target, &global_options);
10012 /* If fndecl already has some target attributes applied to it, unpack
10013 them so that we add this attribute on top of them, rather than
10014 overwriting them. */
10015 if (existing_target)
10017 struct cl_target_option *existing_options
10018 = TREE_TARGET_OPTION (existing_target);
10020 if (existing_options)
10021 cl_target_option_restore (&global_options, existing_options);
10023 else
10024 cl_target_option_restore (&global_options,
10025 TREE_TARGET_OPTION (target_option_current_node));
10027 ret = aarch64_process_target_attr (args);
10029 /* Set up any additional state. */
10030 if (ret)
10032 aarch64_override_options_internal (&global_options);
10033 /* Initialize SIMD builtins if we haven't already.
10034 Set current_target_pragma to NULL for the duration so that
10035 the builtin initialization code doesn't try to tag the functions
10036 being built with the attributes specified by any current pragma, thus
10037 going into an infinite recursion. */
10038 if (TARGET_SIMD)
10040 tree saved_current_target_pragma = current_target_pragma;
10041 current_target_pragma = NULL;
10042 aarch64_init_simd_builtins ();
10043 current_target_pragma = saved_current_target_pragma;
10045 new_target = build_target_option_node (&global_options);
10047 else
10048 new_target = NULL;
10050 new_optimize = build_optimization_node (&global_options);
10052 if (fndecl && ret)
10054 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10056 if (old_optimize != new_optimize)
10057 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10060 cl_target_option_restore (&global_options, &cur_target);
10062 if (old_optimize != new_optimize)
10063 cl_optimization_restore (&global_options,
10064 TREE_OPTIMIZATION (old_optimize));
10065 return ret;
10068 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10069 tri-bool options (yes, no, don't care) and the default value is
10070 DEF, determine whether to reject inlining. */
10072 static bool
10073 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10074 int dont_care, int def)
10076 /* If the callee doesn't care, always allow inlining. */
10077 if (callee == dont_care)
10078 return true;
10080 /* If the caller doesn't care, always allow inlining. */
10081 if (caller == dont_care)
10082 return true;
10084 /* Otherwise, allow inlining if either the callee and caller values
10085 agree, or if the callee is using the default value. */
10086 return (callee == caller || callee == def);
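/* Illustrative example (not part of the original sources): with
   DONT_CARE == 2 and DEF == 1, caller == 1 / callee == 0 rejects inlining
   (the callee neither matches the caller nor uses the default), whereas
   caller == 0 / callee == 1 is accepted because the callee uses the
   default value.  */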
10089 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10090 to inline CALLEE into CALLER based on target-specific info.
10091 Make sure that the caller and callee have compatible architectural
10092 features. Then go through the other possible target attributes
10093 and see if they can block inlining. Try not to reject always_inline
10094 callees unless they are incompatible architecturally. */
10096 static bool
10097 aarch64_can_inline_p (tree caller, tree callee)
10099 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10100 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10102 /* If callee has no option attributes, then it is ok to inline. */
10103 if (!callee_tree)
10104 return true;
10106 struct cl_target_option *caller_opts
10107 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10108 : target_option_default_node);
10110 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10113 /* Callee's ISA flags should be a subset of the caller's. */
10114 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10115 != callee_opts->x_aarch64_isa_flags)
10116 return false;
10118 /* Allow non-strict-align functions to be inlined into strict-align
10119 ones, but not the other way around. */
10120 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10121 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10122 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10123 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10124 return false;
10126 bool always_inline = lookup_attribute ("always_inline",
10127 DECL_ATTRIBUTES (callee));
10129 /* If the architectural features match up and the callee is always_inline
10130 then the other attributes don't matter. */
10131 if (always_inline)
10132 return true;
10134 if (caller_opts->x_aarch64_cmodel_var
10135 != callee_opts->x_aarch64_cmodel_var)
10136 return false;
10138 if (caller_opts->x_aarch64_tls_dialect
10139 != callee_opts->x_aarch64_tls_dialect)
10140 return false;
10142 /* Honour explicit requests to work around errata. */
10143 if (!aarch64_tribools_ok_for_inlining_p (
10144 caller_opts->x_aarch64_fix_a53_err835769,
10145 callee_opts->x_aarch64_fix_a53_err835769,
10146 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10147 return false;
10149 if (!aarch64_tribools_ok_for_inlining_p (
10150 caller_opts->x_aarch64_fix_a53_err843419,
10151 callee_opts->x_aarch64_fix_a53_err843419,
10152 2, TARGET_FIX_ERR_A53_843419))
10153 return false;
10155 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10156 caller and callee and they don't match up, reject inlining. */
10157 if (!aarch64_tribools_ok_for_inlining_p (
10158 caller_opts->x_flag_omit_leaf_frame_pointer,
10159 callee_opts->x_flag_omit_leaf_frame_pointer,
10160 2, 1))
10161 return false;
10163 /* If the callee has specific tuning overrides, respect them. */
10164 if (callee_opts->x_aarch64_override_tune_string != NULL
10165 && caller_opts->x_aarch64_override_tune_string == NULL)
10166 return false;
10168 /* If the user specified tuning override strings for the
10169 caller and callee and they don't match up, reject inlining.
10170 We just do a string compare here, we don't analyze the meaning
10171 of the string, as it would be too costly for little gain. */
10172 if (callee_opts->x_aarch64_override_tune_string
10173 && caller_opts->x_aarch64_override_tune_string
10174 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10175 caller_opts->x_aarch64_override_tune_string) != 0))
10176 return false;
10178 return true;
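/* Illustrative example (not part of the original sources, assuming the
   "+crc" extension is given through the target attribute): a callee declared
   with __attribute__ ((target ("+crc"))) carries AARCH64_FL_CRC in its
   x_aarch64_isa_flags, so it is rejected for inlining into a caller built
   without +crc, because the callee's ISA flags are then not a subset of the
   caller's.  */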
10181 /* Return true if SYMBOL_REF X binds locally. */
10183 static bool
10184 aarch64_symbol_binds_local_p (const_rtx x)
10186 return (SYMBOL_REF_DECL (x)
10187 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10188 : SYMBOL_REF_LOCAL_P (x));
10191 /* Return true if SYMBOL_REF X is thread local. */
10192 static bool
10193 aarch64_tls_symbol_p (rtx x)
10195 if (! TARGET_HAVE_TLS)
10196 return false;
10198 if (GET_CODE (x) != SYMBOL_REF)
10199 return false;
10201 return SYMBOL_REF_TLS_MODEL (x) != 0;
10204 /* Classify a TLS symbol into one of the TLS kinds. */
10205 enum aarch64_symbol_type
10206 aarch64_classify_tls_symbol (rtx x)
10208 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10210 switch (tls_kind)
10212 case TLS_MODEL_GLOBAL_DYNAMIC:
10213 case TLS_MODEL_LOCAL_DYNAMIC:
10214 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10216 case TLS_MODEL_INITIAL_EXEC:
10217 switch (aarch64_cmodel)
10219 case AARCH64_CMODEL_TINY:
10220 case AARCH64_CMODEL_TINY_PIC:
10221 return SYMBOL_TINY_TLSIE;
10222 default:
10223 return SYMBOL_SMALL_TLSIE;
10226 case TLS_MODEL_LOCAL_EXEC:
10227 if (aarch64_tls_size == 12)
10228 return SYMBOL_TLSLE12;
10229 else if (aarch64_tls_size == 24)
10230 return SYMBOL_TLSLE24;
10231 else if (aarch64_tls_size == 32)
10232 return SYMBOL_TLSLE32;
10233 else if (aarch64_tls_size == 48)
10234 return SYMBOL_TLSLE48;
10235 else
10236 gcc_unreachable ();
10238 case TLS_MODEL_EMULATED:
10239 case TLS_MODEL_NONE:
10240 return SYMBOL_FORCE_TO_MEM;
10242 default:
10243 gcc_unreachable ();
10247 /* Return the method that should be used to access SYMBOL_REF or
10248 LABEL_REF X. */
10250 enum aarch64_symbol_type
10251 aarch64_classify_symbol (rtx x, rtx offset)
10253 if (GET_CODE (x) == LABEL_REF)
10255 switch (aarch64_cmodel)
10257 case AARCH64_CMODEL_LARGE:
10258 return SYMBOL_FORCE_TO_MEM;
10260 case AARCH64_CMODEL_TINY_PIC:
10261 case AARCH64_CMODEL_TINY:
10262 return SYMBOL_TINY_ABSOLUTE;
10264 case AARCH64_CMODEL_SMALL_SPIC:
10265 case AARCH64_CMODEL_SMALL_PIC:
10266 case AARCH64_CMODEL_SMALL:
10267 return SYMBOL_SMALL_ABSOLUTE;
10269 default:
10270 gcc_unreachable ();
10274 if (GET_CODE (x) == SYMBOL_REF)
10276 if (aarch64_tls_symbol_p (x))
10277 return aarch64_classify_tls_symbol (x);
10279 switch (aarch64_cmodel)
10281 case AARCH64_CMODEL_TINY:
10282 /* When we retrieve symbol + offset address, we have to make sure
10283 the offset does not cause overflow of the final address. But
10284 we have no way of knowing the address of symbol at compile time
10285 so we can't accurately say if the distance between the PC and
10286 symbol + offset is outside the addressable range of +/-1M in the
10287 TINY code model. So we rely on images not being greater than
10288 1M and cap the offset at 1M and anything beyond 1M will have to
10289 be loaded using an alternative mechanism. Furthermore if the
10290 symbol is a weak reference to something that isn't known to
10291 resolve to a symbol in this module, then force to memory. */
10292 if ((SYMBOL_REF_WEAK (x)
10293 && !aarch64_symbol_binds_local_p (x))
10294 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10295 return SYMBOL_FORCE_TO_MEM;
10296 return SYMBOL_TINY_ABSOLUTE;
10298 case AARCH64_CMODEL_SMALL:
10299 /* Same reasoning as the tiny code model, but the offset cap here is
10300 4G. */
10301 if ((SYMBOL_REF_WEAK (x)
10302 && !aarch64_symbol_binds_local_p (x))
10303 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10304 HOST_WIDE_INT_C (4294967264)))
10305 return SYMBOL_FORCE_TO_MEM;
10306 return SYMBOL_SMALL_ABSOLUTE;
10308 case AARCH64_CMODEL_TINY_PIC:
10309 if (!aarch64_symbol_binds_local_p (x))
10310 return SYMBOL_TINY_GOT;
10311 return SYMBOL_TINY_ABSOLUTE;
10313 case AARCH64_CMODEL_SMALL_SPIC:
10314 case AARCH64_CMODEL_SMALL_PIC:
10315 if (!aarch64_symbol_binds_local_p (x))
10316 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10317 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10318 return SYMBOL_SMALL_ABSOLUTE;
10320 case AARCH64_CMODEL_LARGE:
10321 /* This is alright even in PIC code as the constant
10322 pool reference is always PC relative and within
10323 the same translation unit. */
10324 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10325 return SYMBOL_SMALL_ABSOLUTE;
10326 else
10327 return SYMBOL_FORCE_TO_MEM;
10329 default:
10330 gcc_unreachable ();
10334 /* By default push everything into the constant pool. */
10335 return SYMBOL_FORCE_TO_MEM;
10338 bool
10339 aarch64_constant_address_p (rtx x)
10341 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10344 bool
10345 aarch64_legitimate_pic_operand_p (rtx x)
10347 if (GET_CODE (x) == SYMBOL_REF
10348 || (GET_CODE (x) == CONST
10349 && GET_CODE (XEXP (x, 0)) == PLUS
10350 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10351 return false;
10353 return true;
10356 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
10357 that should be rematerialized rather than spilled. */
10359 static bool
10360 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10362 /* Support CSE and rematerialization of common constants. */
10363 if (CONST_INT_P (x) || CONST_DOUBLE_P (x) || GET_CODE (x) == CONST_VECTOR)
10364 return true;
10366 /* Do not allow vector struct mode constants. We could support
10367 0 and -1 easily, but they need support in aarch64-simd.md. */
10368 if (aarch64_vect_struct_mode_p (mode))
10369 return false;
10371 /* Do not allow wide int constants - this requires support in movti. */
10372 if (CONST_WIDE_INT_P (x))
10373 return false;
10375 /* Do not allow const (plus (anchor_symbol, const_int)). */
10376 if (GET_CODE (x) == CONST)
10378 rtx offset;
10380 split_const (x, &x, &offset);
10382 if (SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
10383 return false;
10386 if (GET_CODE (x) == HIGH)
10387 x = XEXP (x, 0);
10389 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10390 so spilling them is better than rematerialization. */
10391 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10392 return true;
10394 /* Label references are always constant. */
10395 if (GET_CODE (x) == LABEL_REF)
10396 return true;
10398 return false;
10402 aarch64_load_tp (rtx target)
10404 if (!target
10405 || GET_MODE (target) != Pmode
10406 || !register_operand (target, Pmode))
10407 target = gen_reg_rtx (Pmode);
10409 /* Can return in any reg. */
10410 emit_insn (gen_aarch64_load_tp_hard (target));
10411 return target;
10414 /* On AAPCS systems, this is the "struct __va_list". */
10415 static GTY(()) tree va_list_type;
10417 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10418 Return the type to use as __builtin_va_list.
10420 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10422 struct __va_list
10424 void *__stack;
10425 void *__gr_top;
10426 void *__vr_top;
10427 int __gr_offs;
10428 int __vr_offs;
10429 }; */
10431 static tree
10432 aarch64_build_builtin_va_list (void)
10434 tree va_list_name;
10435 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10437 /* Create the type. */
10438 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10439 /* Give it the required name. */
10440 va_list_name = build_decl (BUILTINS_LOCATION,
10441 TYPE_DECL,
10442 get_identifier ("__va_list"),
10443 va_list_type);
10444 DECL_ARTIFICIAL (va_list_name) = 1;
10445 TYPE_NAME (va_list_type) = va_list_name;
10446 TYPE_STUB_DECL (va_list_type) = va_list_name;
10448 /* Create the fields. */
10449 f_stack = build_decl (BUILTINS_LOCATION,
10450 FIELD_DECL, get_identifier ("__stack"),
10451 ptr_type_node);
10452 f_grtop = build_decl (BUILTINS_LOCATION,
10453 FIELD_DECL, get_identifier ("__gr_top"),
10454 ptr_type_node);
10455 f_vrtop = build_decl (BUILTINS_LOCATION,
10456 FIELD_DECL, get_identifier ("__vr_top"),
10457 ptr_type_node);
10458 f_groff = build_decl (BUILTINS_LOCATION,
10459 FIELD_DECL, get_identifier ("__gr_offs"),
10460 integer_type_node);
10461 f_vroff = build_decl (BUILTINS_LOCATION,
10462 FIELD_DECL, get_identifier ("__vr_offs"),
10463 integer_type_node);
10465 /* Tell tree-stdarg pass about our internal offset fields.
10466 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10467 purposes, to identify whether the code is updating the va_list internal
10468 offset fields in an irregular way. */
10469 va_list_gpr_counter_field = f_groff;
10470 va_list_fpr_counter_field = f_vroff;
10472 DECL_ARTIFICIAL (f_stack) = 1;
10473 DECL_ARTIFICIAL (f_grtop) = 1;
10474 DECL_ARTIFICIAL (f_vrtop) = 1;
10475 DECL_ARTIFICIAL (f_groff) = 1;
10476 DECL_ARTIFICIAL (f_vroff) = 1;
10478 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10479 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10480 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10481 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10482 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10484 TYPE_FIELDS (va_list_type) = f_stack;
10485 DECL_CHAIN (f_stack) = f_grtop;
10486 DECL_CHAIN (f_grtop) = f_vrtop;
10487 DECL_CHAIN (f_vrtop) = f_groff;
10488 DECL_CHAIN (f_groff) = f_vroff;
10490 /* Compute its layout. */
10491 layout_type (va_list_type);
10493 return va_list_type;
10496 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10497 static void
10498 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10500 const CUMULATIVE_ARGS *cum;
10501 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10502 tree stack, grtop, vrtop, groff, vroff;
10503 tree t;
10504 int gr_save_area_size = cfun->va_list_gpr_size;
10505 int vr_save_area_size = cfun->va_list_fpr_size;
10506 int vr_offset;
10508 cum = &crtl->args.info;
10509 if (cfun->va_list_gpr_size)
10510 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10511 cfun->va_list_gpr_size);
10512 if (cfun->va_list_fpr_size)
10513 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10514 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10516 if (!TARGET_FLOAT)
10518 gcc_assert (cum->aapcs_nvrn == 0);
10519 vr_save_area_size = 0;
10522 f_stack = TYPE_FIELDS (va_list_type_node);
10523 f_grtop = DECL_CHAIN (f_stack);
10524 f_vrtop = DECL_CHAIN (f_grtop);
10525 f_groff = DECL_CHAIN (f_vrtop);
10526 f_vroff = DECL_CHAIN (f_groff);
10528 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10529 NULL_TREE);
10530 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10531 NULL_TREE);
10532 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10533 NULL_TREE);
10534 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10535 NULL_TREE);
10536 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10537 NULL_TREE);
10539 /* Emit code to initialize STACK, which points to the next varargs stack
10540 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10541 by named arguments. STACK is 8-byte aligned. */
10542 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10543 if (cum->aapcs_stack_size > 0)
10544 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10545 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10546 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10548 /* Emit code to initialize GRTOP, the top of the GR save area.
10549 virtual_incoming_args_rtx should have been 16 byte aligned. */
10550 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10551 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10552 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10554 /* Emit code to initialize VRTOP, the top of the VR save area.
10555 This address is gr_save_area_bytes below GRTOP, rounded
10556 down to the next 16-byte boundary. */
10557 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10558 vr_offset = ROUND_UP (gr_save_area_size,
10559 STACK_BOUNDARY / BITS_PER_UNIT);
10561 if (vr_offset)
10562 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10563 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10564 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10566 /* Emit code to initialize GROFF, the offset from GRTOP of the
10567 next GPR argument. */
10568 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10569 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10570 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10572 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10573 of the next VR argument. */
10574 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10575 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10576 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
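/* Worked example (illustrative, assuming the tree-stdarg pass has not
   narrowed cfun->va_list_gpr_size/va_list_fpr_size): for

     void f (int n, ...);

   the named argument consumes only X0, so gr_save_area_size is
   7 * UNITS_PER_WORD == 56 and, with TARGET_FLOAT, vr_save_area_size is
   8 * UNITS_PER_VREG == 128.  __gr_offs is therefore initialized to -56 and
   __vr_offs to -128, while __gr_top and __vr_top point at the tops of the
   respective save areas.  */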
10579 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10581 static tree
10582 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10583 gimple_seq *post_p ATTRIBUTE_UNUSED)
10585 tree addr;
10586 bool indirect_p;
10587 bool is_ha; /* is HFA or HVA. */
10588 bool dw_align; /* double-word align. */
10589 machine_mode ag_mode = VOIDmode;
10590 int nregs;
10591 machine_mode mode;
10593 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10594 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10595 HOST_WIDE_INT size, rsize, adjust, align;
10596 tree t, u, cond1, cond2;
10598 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10599 if (indirect_p)
10600 type = build_pointer_type (type);
10602 mode = TYPE_MODE (type);
10604 f_stack = TYPE_FIELDS (va_list_type_node);
10605 f_grtop = DECL_CHAIN (f_stack);
10606 f_vrtop = DECL_CHAIN (f_grtop);
10607 f_groff = DECL_CHAIN (f_vrtop);
10608 f_vroff = DECL_CHAIN (f_groff);
10610 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10611 f_stack, NULL_TREE);
10612 size = int_size_in_bytes (type);
10613 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10615 dw_align = false;
10616 adjust = 0;
10617 if (aarch64_vfp_is_call_or_return_candidate (mode,
10618 type,
10619 &ag_mode,
10620 &nregs,
10621 &is_ha))
10623 /* TYPE passed in fp/simd registers. */
10624 if (!TARGET_FLOAT)
10625 aarch64_err_no_fpadvsimd (mode, "varargs");
10627 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10628 unshare_expr (valist), f_vrtop, NULL_TREE);
10629 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10630 unshare_expr (valist), f_vroff, NULL_TREE);
10632 rsize = nregs * UNITS_PER_VREG;
10634 if (is_ha)
10636 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10637 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10639 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10640 && size < UNITS_PER_VREG)
10642 adjust = UNITS_PER_VREG - size;
10645 else
10647 /* TYPE passed in general registers. */
10648 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10649 unshare_expr (valist), f_grtop, NULL_TREE);
10650 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10651 unshare_expr (valist), f_groff, NULL_TREE);
10652 rsize = ROUND_UP (size, UNITS_PER_WORD);
10653 nregs = rsize / UNITS_PER_WORD;
10655 if (align > 8)
10656 dw_align = true;
10658 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10659 && size < UNITS_PER_WORD)
10661 adjust = UNITS_PER_WORD - size;
10665 /* Get a local temporary for the field value. */
10666 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10668 /* Emit code to branch if off >= 0. */
10669 t = build2 (GE_EXPR, boolean_type_node, off,
10670 build_int_cst (TREE_TYPE (off), 0));
10671 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10673 if (dw_align)
10675 /* Emit: offs = (offs + 15) & -16. */
10676 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10677 build_int_cst (TREE_TYPE (off), 15));
10678 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10679 build_int_cst (TREE_TYPE (off), -16));
10680 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10682 else
10683 roundup = NULL;
10685 /* Update ap.__[g|v]r_offs */
10686 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10687 build_int_cst (TREE_TYPE (off), rsize));
10688 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10690 /* String up. */
10691 if (roundup)
10692 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10694 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10695 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10696 build_int_cst (TREE_TYPE (f_off), 0));
10697 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10699 /* String up: make sure the assignment happens before the use. */
10700 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10701 COND_EXPR_ELSE (cond1) = t;
10703 /* Prepare the trees handling the argument that is passed on the stack;
10704 the top level node will be stored in ON_STACK. */
10705 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10706 if (align > 8)
10708 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10709 t = fold_convert (intDI_type_node, arg);
10710 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10711 build_int_cst (TREE_TYPE (t), 15));
10712 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10713 build_int_cst (TREE_TYPE (t), -16));
10714 t = fold_convert (TREE_TYPE (arg), t);
10715 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10717 else
10718 roundup = NULL;
10719 /* Advance ap.__stack */
10720 t = fold_convert (intDI_type_node, arg);
10721 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10722 build_int_cst (TREE_TYPE (t), size + 7));
10723 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10724 build_int_cst (TREE_TYPE (t), -8));
10725 t = fold_convert (TREE_TYPE (arg), t);
10726 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10727 /* String up roundup and advance. */
10728 if (roundup)
10729 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10730 /* String up with arg */
10731 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10732 /* Big-endianness related address adjustment. */
10733 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10734 && size < UNITS_PER_WORD)
10736 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10737 size_int (UNITS_PER_WORD - size));
10738 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10741 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10742 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10744 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10745 t = off;
10746 if (adjust)
10747 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10748 build_int_cst (TREE_TYPE (off), adjust));
10750 t = fold_convert (sizetype, t);
10751 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10753 if (is_ha)
10755 /* type ha; // treat as "struct {ftype field[n];}"
10756 ... [computing offs]
10757 for (i = 0; i <nregs; ++i, offs += 16)
10758 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10759 return ha; */
10760 int i;
10761 tree tmp_ha, field_t, field_ptr_t;
10763 /* Declare a local variable. */
10764 tmp_ha = create_tmp_var_raw (type, "ha");
10765 gimple_add_tmp_var (tmp_ha);
10767 /* Establish the base type. */
10768 switch (ag_mode)
10770 case E_SFmode:
10771 field_t = float_type_node;
10772 field_ptr_t = float_ptr_type_node;
10773 break;
10774 case E_DFmode:
10775 field_t = double_type_node;
10776 field_ptr_t = double_ptr_type_node;
10777 break;
10778 case E_TFmode:
10779 field_t = long_double_type_node;
10780 field_ptr_t = long_double_ptr_type_node;
10781 break;
10782 case E_HFmode:
10783 field_t = aarch64_fp16_type_node;
10784 field_ptr_t = aarch64_fp16_ptr_type_node;
10785 break;
10786 case E_V2SImode:
10787 case E_V4SImode:
10789 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10790 field_t = build_vector_type_for_mode (innertype, ag_mode);
10791 field_ptr_t = build_pointer_type (field_t);
10793 break;
10794 default:
10795 gcc_assert (0);
10798 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
10799 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10800 addr = t;
10801 t = fold_convert (field_ptr_t, addr);
10802 t = build2 (MODIFY_EXPR, field_t,
10803 build1 (INDIRECT_REF, field_t, tmp_ha),
10804 build1 (INDIRECT_REF, field_t, t));
10806 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10807 for (i = 1; i < nregs; ++i)
10809 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10810 u = fold_convert (field_ptr_t, addr);
10811 u = build2 (MODIFY_EXPR, field_t,
10812 build2 (MEM_REF, field_t, tmp_ha,
10813 build_int_cst (field_ptr_t,
10814 (i *
10815 int_size_in_bytes (field_t)))),
10816 build1 (INDIRECT_REF, field_t, u));
10817 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10820 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10821 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10824 COND_EXPR_ELSE (cond2) = t;
10825 addr = fold_convert (build_pointer_type (type), cond1);
10826 addr = build_va_arg_indirect_ref (addr);
10828 if (indirect_p)
10829 addr = build_va_arg_indirect_ref (addr);
10831 return addr;
10834 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10836 static void
10837 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10838 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10839 int no_rtl)
10841 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10842 CUMULATIVE_ARGS local_cum;
10843 int gr_saved = cfun->va_list_gpr_size;
10844 int vr_saved = cfun->va_list_fpr_size;
10846 /* The caller has advanced CUM up to, but not beyond, the last named
10847 argument. Advance a local copy of CUM past the last "real" named
10848 argument, to find out how many registers are left over. */
10849 local_cum = *cum;
10850 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10852 /* Find out how many registers we need to save.
10853 Honor the tree-stdarg analysis results. */
10854 if (cfun->va_list_gpr_size)
10855 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10856 cfun->va_list_gpr_size / UNITS_PER_WORD);
10857 if (cfun->va_list_fpr_size)
10858 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10859 cfun->va_list_fpr_size / UNITS_PER_VREG);
10861 if (!TARGET_FLOAT)
10863 gcc_assert (local_cum.aapcs_nvrn == 0);
10864 vr_saved = 0;
10867 if (!no_rtl)
10869 if (gr_saved > 0)
10871 rtx ptr, mem;
10873 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10874 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10875 - gr_saved * UNITS_PER_WORD);
10876 mem = gen_frame_mem (BLKmode, ptr);
10877 set_mem_alias_set (mem, get_varargs_alias_set ());
10879 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10880 mem, gr_saved);
10882 if (vr_saved > 0)
10884 /* We can't use move_block_from_reg, because it will use
10885 the wrong mode, storing D regs only. */
10886 machine_mode mode = TImode;
10887 int off, i, vr_start;
10889 /* Set OFF to the offset from virtual_incoming_args_rtx of
10890 the first vector register. The VR save area lies below
10891 the GR one, and is aligned to 16 bytes. */
10892 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10893 STACK_BOUNDARY / BITS_PER_UNIT);
10894 off -= vr_saved * UNITS_PER_VREG;
10896 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10897 for (i = 0; i < vr_saved; ++i)
10899 rtx ptr, mem;
10901 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10902 mem = gen_frame_mem (mode, ptr);
10903 set_mem_alias_set (mem, get_varargs_alias_set ());
10904 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10905 off += UNITS_PER_VREG;
10910 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10911 any complication of having crtl->args.pretend_args_size changed. */
10912 cfun->machine->frame.saved_varargs_size
10913 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10914 STACK_BOUNDARY / BITS_PER_UNIT)
10915 + vr_saved * UNITS_PER_VREG);
10918 static void
10919 aarch64_conditional_register_usage (void)
10921 int i;
10922 if (!TARGET_FLOAT)
10924 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10926 fixed_regs[i] = 1;
10927 call_used_regs[i] = 1;
10932 /* Walk down the type tree of TYPE counting consecutive base elements.
10933 If *MODEP is VOIDmode, then set it to the first valid floating point
10934 type. If a non-floating point type is found, or if a floating point
10935 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10936 otherwise return the count in the sub-tree. */
10937 static int
10938 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10940 machine_mode mode;
10941 HOST_WIDE_INT size;
10943 switch (TREE_CODE (type))
10945 case REAL_TYPE:
10946 mode = TYPE_MODE (type);
10947 if (mode != DFmode && mode != SFmode
10948 && mode != TFmode && mode != HFmode)
10949 return -1;
10951 if (*modep == VOIDmode)
10952 *modep = mode;
10954 if (*modep == mode)
10955 return 1;
10957 break;
10959 case COMPLEX_TYPE:
10960 mode = TYPE_MODE (TREE_TYPE (type));
10961 if (mode != DFmode && mode != SFmode
10962 && mode != TFmode && mode != HFmode)
10963 return -1;
10965 if (*modep == VOIDmode)
10966 *modep = mode;
10968 if (*modep == mode)
10969 return 2;
10971 break;
10973 case VECTOR_TYPE:
10974 /* Use V2SImode and V4SImode as representatives of all 64-bit
10975 and 128-bit vector types. */
10976 size = int_size_in_bytes (type);
10977 switch (size)
10979 case 8:
10980 mode = V2SImode;
10981 break;
10982 case 16:
10983 mode = V4SImode;
10984 break;
10985 default:
10986 return -1;
10989 if (*modep == VOIDmode)
10990 *modep = mode;
10992 /* Vector modes are considered to be opaque: two vectors are
10993 equivalent for the purposes of being homogeneous aggregates
10994 if they are the same size. */
10995 if (*modep == mode)
10996 return 1;
10998 break;
11000 case ARRAY_TYPE:
11002 int count;
11003 tree index = TYPE_DOMAIN (type);
11005 /* Can't handle incomplete types nor sizes that are not
11006 fixed. */
11007 if (!COMPLETE_TYPE_P (type)
11008 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11009 return -1;
11011 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11012 if (count == -1
11013 || !index
11014 || !TYPE_MAX_VALUE (index)
11015 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11016 || !TYPE_MIN_VALUE (index)
11017 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11018 || count < 0)
11019 return -1;
11021 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11022 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11024 /* There must be no padding. */
11025 if (wi::to_wide (TYPE_SIZE (type))
11026 != count * GET_MODE_BITSIZE (*modep))
11027 return -1;
11029 return count;
11032 case RECORD_TYPE:
11034 int count = 0;
11035 int sub_count;
11036 tree field;
11038 /* Can't handle incomplete types nor sizes that are not
11039 fixed. */
11040 if (!COMPLETE_TYPE_P (type)
11041 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11042 return -1;
11044 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11046 if (TREE_CODE (field) != FIELD_DECL)
11047 continue;
11049 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11050 if (sub_count < 0)
11051 return -1;
11052 count += sub_count;
11055 /* There must be no padding. */
11056 if (wi::to_wide (TYPE_SIZE (type))
11057 != count * GET_MODE_BITSIZE (*modep))
11058 return -1;
11060 return count;
11063 case UNION_TYPE:
11064 case QUAL_UNION_TYPE:
11066 /* These aren't very interesting except in a degenerate case. */
11067 int count = 0;
11068 int sub_count;
11069 tree field;
11071 /* Can't handle incomplete types nor sizes that are not
11072 fixed. */
11073 if (!COMPLETE_TYPE_P (type)
11074 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11075 return -1;
11077 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11079 if (TREE_CODE (field) != FIELD_DECL)
11080 continue;
11082 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11083 if (sub_count < 0)
11084 return -1;
11085 count = count > sub_count ? count : sub_count;
11088 /* There must be no padding. */
11089 if (wi::to_wide (TYPE_SIZE (type))
11090 != count * GET_MODE_BITSIZE (*modep))
11091 return -1;
11093 return count;
11096 default:
11097 break;
11100 return -1;
11103 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11104 type as described in AAPCS64 \S 4.1.2.
11106 See the comment above aarch64_composite_type_p for the notes on MODE. */
11108 static bool
11109 aarch64_short_vector_p (const_tree type,
11110 machine_mode mode)
11112 HOST_WIDE_INT size = -1;
11114 if (type && TREE_CODE (type) == VECTOR_TYPE)
11115 size = int_size_in_bytes (type);
11116 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11117 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11118 size = GET_MODE_SIZE (mode);
11120 return (size == 8 || size == 16);
11123 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11124 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11125 array types. The C99 floating-point complex types are also considered
11126 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11127 types, which are GCC extensions and out of the scope of AAPCS64, are
11128 treated as composite types here as well.
11130 Note that MODE itself is not sufficient in determining whether a type
11131 is such a composite type or not. This is because
11132 stor-layout.c:compute_record_mode may have already changed the MODE
11133 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11134 structure with only one field may have its MODE set to the mode of the
11135 field. Also an integer mode whose size matches the size of the
11136 RECORD_TYPE type may be used to substitute the original mode
11137 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11138 solely relied on. */
11140 static bool
11141 aarch64_composite_type_p (const_tree type,
11142 machine_mode mode)
11144 if (aarch64_short_vector_p (type, mode))
11145 return false;
11147 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11148 return true;
11150 if (mode == BLKmode
11151 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11152 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11153 return true;
11155 return false;
11158 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11159 shall be passed or returned in simd/fp register(s) (providing these
11160 parameter passing registers are available).
11162 Upon successful return, *COUNT returns the number of needed registers,
11163 *BASE_MODE returns the mode of the individual register and when IS_HA
11164 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11165 floating-point aggregate or a homogeneous short-vector aggregate. */
11167 static bool
11168 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11169 const_tree type,
11170 machine_mode *base_mode,
11171 int *count,
11172 bool *is_ha)
11174 machine_mode new_mode = VOIDmode;
11175 bool composite_p = aarch64_composite_type_p (type, mode);
11177 if (is_ha != NULL) *is_ha = false;
11179 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11180 || aarch64_short_vector_p (type, mode))
11182 *count = 1;
11183 new_mode = mode;
11185 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11187 if (is_ha != NULL) *is_ha = true;
11188 *count = 2;
11189 new_mode = GET_MODE_INNER (mode);
11191 else if (type && composite_p)
11193 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11195 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11197 if (is_ha != NULL) *is_ha = true;
11198 *count = ag_count;
11200 else
11201 return false;
11203 else
11204 return false;
11206 *base_mode = new_mode;
11207 return true;
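/* Illustrative example (not part of the original sources): for

     struct hfa { double a; double b; double c; };

   aapcs_vfp_sub_candidate reports three DFmode elements, which is within
   HA_MAX_NUM_FLDS, so this function returns true with *base_mode == DFmode,
   *count == 3 and *is_ha set; the argument is then passed in three
   consecutive FP/SIMD registers when they are available.  */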
11210 /* Implement TARGET_STRUCT_VALUE_RTX. */
11212 static rtx
11213 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11214 int incoming ATTRIBUTE_UNUSED)
11216 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11219 /* Implements target hook vector_mode_supported_p. */
11220 static bool
11221 aarch64_vector_mode_supported_p (machine_mode mode)
11223 if (TARGET_SIMD
11224 && (mode == V4SImode || mode == V8HImode
11225 || mode == V16QImode || mode == V2DImode
11226 || mode == V2SImode || mode == V4HImode
11227 || mode == V8QImode || mode == V2SFmode
11228 || mode == V4SFmode || mode == V2DFmode
11229 || mode == V4HFmode || mode == V8HFmode
11230 || mode == V1DFmode))
11231 return true;
11233 return false;
11236 /* Return appropriate SIMD container
11237 for MODE within a vector of WIDTH bits. */
11238 static machine_mode
11239 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11241 gcc_assert (width == 64 || width == 128);
11242 if (TARGET_SIMD)
11244 if (width == 128)
11245 switch (mode)
11247 case E_DFmode:
11248 return V2DFmode;
11249 case E_SFmode:
11250 return V4SFmode;
11251 case E_HFmode:
11252 return V8HFmode;
11253 case E_SImode:
11254 return V4SImode;
11255 case E_HImode:
11256 return V8HImode;
11257 case E_QImode:
11258 return V16QImode;
11259 case E_DImode:
11260 return V2DImode;
11261 default:
11262 break;
11264 else
11265 switch (mode)
11267 case E_SFmode:
11268 return V2SFmode;
11269 case E_HFmode:
11270 return V4HFmode;
11271 case E_SImode:
11272 return V2SImode;
11273 case E_HImode:
11274 return V4HImode;
11275 case E_QImode:
11276 return V8QImode;
11277 default:
11278 break;
11281 return word_mode;
11284 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11285 static machine_mode
11286 aarch64_preferred_simd_mode (scalar_mode mode)
11288 return aarch64_simd_container_mode (mode, 128);
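/* For example (illustrative): with TARGET_SIMD enabled,
   aarch64_preferred_simd_mode (SImode) returns V4SImode and
   aarch64_preferred_simd_mode (DFmode) returns V2DFmode; without SIMD the
   fallback is word_mode.  */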
11291 /* Return the bitmask of possible vector sizes for the vectorizer
11292 to iterate over. */
11293 static unsigned int
11294 aarch64_autovectorize_vector_sizes (void)
11296 return (16 | 8);
11299 /* Implement TARGET_MANGLE_TYPE. */
11301 static const char *
11302 aarch64_mangle_type (const_tree type)
11304 /* The AArch64 ABI documents say that "__va_list" has to be
11305 mangled as if it is in the "std" namespace. */
11306 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11307 return "St9__va_list";
11309 /* Half-precision float. */
11310 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11311 return "Dh";
11313 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11314 builtin types. */
11315 if (TYPE_NAME (type) != NULL)
11316 return aarch64_mangle_builtin_type (type);
11318 /* Use the default mangling. */
11319 return NULL;
11322 /* Find the first rtx_insn before insn that will generate an assembly
11323 instruction. */
11325 static rtx_insn *
11326 aarch64_prev_real_insn (rtx_insn *insn)
11328 if (!insn)
11329 return NULL;
11333 insn = prev_real_insn (insn);
11335 while (insn && recog_memoized (insn) < 0);
11337 return insn;
11340 static bool
11341 is_madd_op (enum attr_type t1)
11343 unsigned int i;
11344 /* A number of these may be AArch32 only. */
11345 enum attr_type mlatypes[] = {
11346 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11347 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11348 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11351 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11353 if (t1 == mlatypes[i])
11354 return true;
11357 return false;
11360 /* Check if there is a register dependency between a load and the insn
11361 for which we hold recog_data. */
11363 static bool
11364 dep_between_memop_and_curr (rtx memop)
11366 rtx load_reg;
11367 int opno;
11369 gcc_assert (GET_CODE (memop) == SET);
11371 if (!REG_P (SET_DEST (memop)))
11372 return false;
11374 load_reg = SET_DEST (memop);
11375 for (opno = 1; opno < recog_data.n_operands; opno++)
11377 rtx operand = recog_data.operand[opno];
11378 if (REG_P (operand)
11379 && reg_overlap_mentioned_p (load_reg, operand))
11380 return true;
11383 return false;
11387 /* When working around the Cortex-A53 erratum 835769,
11388 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11389 instruction and has a preceding memory instruction such that a NOP
11390 should be inserted between them. */
11392 bool
11393 aarch64_madd_needs_nop (rtx_insn* insn)
11395 enum attr_type attr_type;
11396 rtx_insn *prev;
11397 rtx body;
11399 if (!TARGET_FIX_ERR_A53_835769)
11400 return false;
11402 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11403 return false;
11405 attr_type = get_attr_type (insn);
11406 if (!is_madd_op (attr_type))
11407 return false;
11409 prev = aarch64_prev_real_insn (insn);
11410 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11411 Restore recog state to INSN to avoid state corruption. */
11412 extract_constrain_insn_cached (insn);
11414 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11415 return false;
11417 body = single_set (prev);
11419 /* If the previous insn is a memory op and there is no dependency between
11420 it and the DImode madd, emit a NOP between them. If body is NULL then we
11421 have a complex memory operation, probably a load/store pair.
11422 Be conservative for now and emit a NOP. */
11423 if (GET_MODE (recog_data.operand[0]) == DImode
11424 && (!body || !dep_between_memop_and_curr (body)))
11425 return true;
11427 return false;
11432 /* Implement FINAL_PRESCAN_INSN. */
11434 void
11435 aarch64_final_prescan_insn (rtx_insn *insn)
11437 if (aarch64_madd_needs_nop (insn))
11438 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
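/* Illustrative example (not part of the original sources): with
   -mfix-cortex-a53-835769, a 64-bit multiply-accumulate that directly
   follows a memory operation, e.g.

       ldr  x1, [x2]
       madd x0, x3, x4, x5

   is printed with a padding NOP between the two instructions:

       ldr  x1, [x2]
       nop // between mem op and mult-accumulate
       madd x0, x3, x4, x5  */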
11442 /* Return the equivalent letter for size. */
11443 static char
11444 sizetochar (int size)
11446 switch (size)
11448 case 64: return 'd';
11449 case 32: return 's';
11450 case 16: return 'h';
11451 case 8 : return 'b';
11452 default: gcc_unreachable ();
11456 /* Return true iff x is a uniform vector of floating-point
11457 constants, and the constant can be represented in
11458 quarter-precision form. Note, as aarch64_float_const_representable_p
11459 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11460 static bool
11461 aarch64_vect_float_const_representable_p (rtx x)
11463 rtx elt;
11464 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11465 && const_vec_duplicate_p (x, &elt)
11466 && aarch64_float_const_representable_p (elt));
11469 /* Return true for valid and false for invalid. */
11470 bool
11471 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11472 struct simd_immediate_info *info,
11473 enum simd_immediate_check which)
11475 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11476 matches = 1; \
11477 for (i = 0; i < idx; i += (STRIDE)) \
11478 if (!(TEST)) \
11479 matches = 0; \
11480 if (matches) \
11482 immtype = (CLASS); \
11483 elsize = (ELSIZE); \
11484 eshift = (SHIFT); \
11485 emvn = (NEG); \
11486 break; \
11489 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11490 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11491 unsigned char bytes[16];
11492 int immtype = -1, matches;
11493 unsigned int invmask = inverse ? 0xff : 0;
11494 int eshift, emvn;
11496 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11498 if (! (aarch64_simd_imm_zero_p (op, mode)
11499 || aarch64_vect_float_const_representable_p (op)))
11500 return false;
11502 if (info)
11504 rtx elt = CONST_VECTOR_ELT (op, 0);
11505 scalar_float_mode elt_mode
11506 = as_a <scalar_float_mode> (GET_MODE (elt));
11508 info->value = elt;
11509 info->element_width = GET_MODE_BITSIZE (elt_mode);
11510 info->mvn = false;
11511 info->shift = 0;
11514 return true;
11517 /* Splat vector constant out into a byte vector. */
11518 for (i = 0; i < n_elts; i++)
11520 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11521 it must be laid out in the vector register in reverse order. */
11522 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11523 unsigned HOST_WIDE_INT elpart;
11525 gcc_assert (CONST_INT_P (el));
11526 elpart = INTVAL (el);
11528 for (unsigned int byte = 0; byte < innersize; byte++)
11530 bytes[idx++] = (elpart & 0xff) ^ invmask;
11531 elpart >>= BITS_PER_UNIT;
11536 /* Sanity check. */
11537 gcc_assert (idx == GET_MODE_SIZE (mode));
11541 if (which & AARCH64_CHECK_ORR)
11543 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11544 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11546 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11547 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11549 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11550 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11552 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11553 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11555 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11557 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11560 if (which & AARCH64_CHECK_BIC)
11562 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11563 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11565 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11566 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11568 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11569 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11571 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11572 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11574 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11576 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11579 /* Shifting ones / 8-bit / 64-bit variants are only checked
11580 for 'ALL' (MOVI/MVNI). */
11581 if (which == AARCH64_CHECK_MOV)
11583 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11584 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11586 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11587 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11589 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11590 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11592 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11593 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11595 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11597 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11598 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11601 while (0);
11603 if (immtype == -1)
11604 return false;
11606 if (info)
11608 info->element_width = elsize;
11609 info->mvn = emvn != 0;
11610 info->shift = eshift;
11612 unsigned HOST_WIDE_INT imm = 0;
11614 if (immtype >= 12 && immtype <= 15)
11615 info->msl = true;
11617 /* Un-invert bytes of recognized vector, if necessary. */
11618 if (invmask != 0)
11619 for (i = 0; i < idx; i++)
11620 bytes[i] ^= invmask;
11622 if (immtype == 17)
11624 /* FIXME: Broken on 32-bit H_W_I hosts. */
11625 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11627 for (i = 0; i < 8; i++)
11628 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11629 << (i * BITS_PER_UNIT);
11632 info->value = GEN_INT (imm);
11634 else
11636 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11637 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11639 /* Construct 'abcdefgh' because the assembler cannot handle
11640 generic constants. */
11641 if (info->mvn)
11642 imm = ~imm;
11643 imm = (imm >> info->shift) & 0xff;
11644 info->value = GEN_INT (imm);
11648 return true;
11649 #undef CHECK
11652 /* Check if immediate shift constants are within range. */
11653 bool
11654 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11656 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11657 if (left)
11658 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11659 else
11660 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
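/* For example (illustrative): in V4SImode the element width is 32 bits, so
   an immediate left-shift count must lie in [0, 31] and an immediate
   right-shift count in [1, 32].  */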
11663 /* Return true if X is a uniform vector where all elements
11664 are either the floating-point constant 0.0 or the
11665 integer constant 0. */
11666 bool
11667 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11669 return x == CONST0_RTX (mode);
11673 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11674 operation of width WIDTH at bit position POS. */
11677 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11679 gcc_assert (CONST_INT_P (width));
11680 gcc_assert (CONST_INT_P (pos));
11682 unsigned HOST_WIDE_INT mask
11683 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11684 return GEN_INT (mask << UINTVAL (pos));
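/* For example (illustrative): WIDTH == 8 and POS == 16 give the mask
   ((unsigned HOST_WIDE_INT) 0xff) << 16 == 0xff0000, i.e. exactly the bits
   selected by the zero extract.  */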
11687 bool
11688 aarch64_mov_operand_p (rtx x, machine_mode mode)
11690 if (GET_CODE (x) == HIGH
11691 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11692 return true;
11694 if (CONST_INT_P (x))
11695 return true;
11697 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11698 return true;
11700 return aarch64_classify_symbolic_expression (x)
11701 == SYMBOL_TINY_ABSOLUTE;
11704 /* Return a const_int vector of VAL. */
11706 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11708 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
11709 return gen_const_vec_duplicate (mode, c);
11712 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11714 bool
11715 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11717 machine_mode vmode;
11719 vmode = aarch64_preferred_simd_mode (mode);
11720 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11721 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11724 /* Construct and return a PARALLEL RTX vector with elements numbering the
11725 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11726 the vector - from the perspective of the architecture. This does not
11727 line up with GCC's perspective on lane numbers, so we end up with
11728 different masks depending on our target endianness. The diagram
11729 below may help. We must draw the distinction when building masks
11730 which select one half of the vector. An instruction selecting
11731 architectural low-lanes for a big-endian target, must be described using
11732 a mask selecting GCC high-lanes.
11734 Big-Endian Little-Endian
11736 GCC 0 1 2 3 3 2 1 0
11737 | x | x | x | x | | x | x | x | x |
11738 Architecture 3 2 1 0 3 2 1 0
11740 Low Mask: { 2, 3 } { 0, 1 }
11741 High Mask: { 0, 1 } { 2, 3 }
11743 MODE Is the mode of the vector and NUNITS is the number of units in it. */
11746 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
11748 rtvec v = rtvec_alloc (nunits / 2);
11749 int high_base = nunits / 2;
11750 int low_base = 0;
11751 int base;
11752 rtx t1;
11753 int i;
11755 if (BYTES_BIG_ENDIAN)
11756 base = high ? low_base : high_base;
11757 else
11758 base = high ? high_base : low_base;
11760 for (i = 0; i < nunits / 2; i++)
11761 RTVEC_ELT (v, i) = GEN_INT (base + i);
11763 t1 = gen_rtx_PARALLEL (mode, v);
11764 return t1;
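/* Illustrative example (not part of the original sources): for V4SImode,
   aarch64_simd_vect_par_cnst_half (V4SImode, 4, false) produces
   (parallel [(const_int 2) (const_int 3)]) on big-endian and
   (parallel [(const_int 0) (const_int 1)]) on little-endian, matching the
   "Low Mask" row in the diagram above.  */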
11767 /* Check OP for validity as a PARALLEL RTX vector with elements
11768 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11769 from the perspective of the architecture. See the diagram above
11770 aarch64_simd_vect_par_cnst_half for more details. */
11772 bool
11773 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11774 bool high)
11776 if (!VECTOR_MODE_P (mode))
11777 return false;
11779 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, GET_MODE_NUNITS (mode),
11780 high);
11781 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11782 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11783 int i = 0;
11785 if (count_op != count_ideal)
11786 return false;
11788 for (i = 0; i < count_ideal; i++)
11790 rtx elt_op = XVECEXP (op, 0, i);
11791 rtx elt_ideal = XVECEXP (ideal, 0, i);
11793 if (!CONST_INT_P (elt_op)
11794 || INTVAL (elt_ideal) != INTVAL (elt_op))
11795 return false;
11797 return true;
11800 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11801 HIGH (exclusive). */
11802 void
11803 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11804 const_tree exp)
11806 HOST_WIDE_INT lane;
11807 gcc_assert (CONST_INT_P (operand));
11808 lane = INTVAL (operand);
11810 if (lane < low || lane >= high)
11812 if (exp)
11813 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11814 else
11815 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11819 /* Perform endian correction on lane number N, which indexes a vector
11820 of mode MODE, and return the result as an SImode rtx. */
11823 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
11825 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
11828 /* Return TRUE if OP is a valid vector addressing mode. */
11829 bool
11830 aarch64_simd_mem_operand_p (rtx op)
11832 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11833 || REG_P (XEXP (op, 0)));
11836 /* Emit a register copy from operand to operand, taking care not to
11837 early-clobber source registers in the process.
11839 COUNT is the number of components into which the copy needs to be
11840 decomposed. */
11841 void
11842 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11843 unsigned int count)
11845 unsigned int i;
11846 int rdest = REGNO (operands[0]);
11847 int rsrc = REGNO (operands[1]);
11849 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11850 || rdest < rsrc)
11851 for (i = 0; i < count; i++)
11852 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11853 gen_rtx_REG (mode, rsrc + i));
11854 else
11855 for (i = 0; i < count; i++)
11856 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11857 gen_rtx_REG (mode, rsrc + count - i - 1));
11860 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11861 one of the VSTRUCT modes: OI, CI, or XI. */
11863 aarch64_simd_attr_length_rglist (machine_mode mode)
11865 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
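/* Worked example (assuming UNITS_PER_VREG == 16): OImode is two vector
   registers (32 bytes), so the length is 2 * 4 == 8 bytes, i.e. two
   4-byte instructions; CImode gives 12 and XImode 16.  */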
11868 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11869 alignment of a vector to 128 bits. */
11870 static HOST_WIDE_INT
11871 aarch64_simd_vector_alignment (const_tree type)
11873 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11874 return MIN (align, 128);
11877 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11878 static bool
11879 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11881 if (is_packed)
11882 return false;
11884 /* We guarantee alignment for vectors up to 128-bits. */
11885 if (tree_int_cst_compare (TYPE_SIZE (type),
11886 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11887 return false;
11889 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11890 return true;
11893 /* Return true if the vector misalignment factor is supported by the
11894 target. */
11895 static bool
11896 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11897 const_tree type, int misalignment,
11898 bool is_packed)
11900 if (TARGET_SIMD && STRICT_ALIGNMENT)
11902 /* Return false if the movmisalign pattern is not supported for this mode. */
11903 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11904 return false;
11906 /* Misalignment factor is unknown at compile time. */
11907 if (misalignment == -1)
11908 return false;
11910 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11911 is_packed);
11914 /* If VALS is a vector constant that can be loaded into a register
11915 using DUP, generate instructions to do so and return an RTX to
11916 assign to the register. Otherwise return NULL_RTX. */
11917 static rtx
11918 aarch64_simd_dup_constant (rtx vals)
11920 machine_mode mode = GET_MODE (vals);
11921 machine_mode inner_mode = GET_MODE_INNER (mode);
11922 rtx x;
11924 if (!const_vec_duplicate_p (vals, &x))
11925 return NULL_RTX;
11927 /* We can load this constant by using DUP and a constant in a
11928 single general-purpose register. This will be cheaper than a vector
11929 load. */
11930 x = copy_to_mode_reg (inner_mode, x);
11931 return gen_vec_duplicate (mode, x);
11935 /* Generate code to load VALS, which is a PARALLEL containing only
11936 constants (for vec_init) or CONST_VECTOR, efficiently into a
11937 register. Returns an RTX to copy into the register, or NULL_RTX
11938 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11939 static rtx
11940 aarch64_simd_make_constant (rtx vals)
11942 machine_mode mode = GET_MODE (vals);
11943 rtx const_dup;
11944 rtx const_vec = NULL_RTX;
11945 int n_elts = GET_MODE_NUNITS (mode);
11946 int n_const = 0;
11947 int i;
11949 if (GET_CODE (vals) == CONST_VECTOR)
11950 const_vec = vals;
11951 else if (GET_CODE (vals) == PARALLEL)
11953 /* A CONST_VECTOR must contain only CONST_INTs and
11954 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11955 Only store valid constants in a CONST_VECTOR. */
11956 for (i = 0; i < n_elts; ++i)
11958 rtx x = XVECEXP (vals, 0, i);
11959 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11960 n_const++;
11962 if (n_const == n_elts)
11963 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11965 else
11966 gcc_unreachable ();
11968 if (const_vec != NULL_RTX
11969 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11970 /* Load using MOVI/MVNI. */
11971 return const_vec;
11972 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11973 /* Loaded using DUP. */
11974 return const_dup;
11975 else if (const_vec != NULL_RTX)
11976 /* Load from constant pool. We cannot take advantage of single-cycle
11977 LD1 because we need a PC-relative addressing mode. */
11978 return const_vec;
11979 else
11980 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11981 We cannot construct an initializer. */
11982 return NULL_RTX;
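/* Illustrative decision tree: a V4SImode vector of {1, 1, 1, 1} is a
   valid AdvSIMD immediate and is returned as-is (MOVI); a duplicate of a
   value that is not a valid immediate (say 0x12345678 in every lane) is
   expected to take the DUP path; {1, 2, 3, 4} ends up as a constant-pool
   load; and a PARALLEL containing, e.g., a SYMBOL_REF yields NULL_RTX.  */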
11985 /* Expand a vector initialisation sequence, such that TARGET is
11986 initialised to contain VALS. */
11988 void
11989 aarch64_expand_vector_init (rtx target, rtx vals)
11991 machine_mode mode = GET_MODE (target);
11992 scalar_mode inner_mode = GET_MODE_INNER (mode);
11993 /* The number of vector elements. */
11994 int n_elts = GET_MODE_NUNITS (mode);
11995 /* The number of vector elements which are not constant. */
11996 int n_var = 0;
11997 rtx any_const = NULL_RTX;
11998 /* The first element of vals. */
11999 rtx v0 = XVECEXP (vals, 0, 0);
12000 bool all_same = true;
12002 /* Count the number of variable elements to initialise. */
12003 for (int i = 0; i < n_elts; ++i)
12005 rtx x = XVECEXP (vals, 0, i);
12006 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12007 ++n_var;
12008 else
12009 any_const = x;
12011 all_same &= rtx_equal_p (x, v0);
12014 /* No variable elements, hand off to aarch64_simd_make_constant which knows
12015 how best to handle this. */
12016 if (n_var == 0)
12018 rtx constant = aarch64_simd_make_constant (vals);
12019 if (constant != NULL_RTX)
12021 emit_move_insn (target, constant);
12022 return;
12026 /* Splat a single non-constant element if we can. */
12027 if (all_same)
12029 rtx x = copy_to_mode_reg (inner_mode, v0);
12030 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
12031 return;
12034 enum insn_code icode = optab_handler (vec_set_optab, mode);
12035 gcc_assert (icode != CODE_FOR_nothing);
12037 /* If there are only variable elements, try to optimize
12038 the insertion using dup for the most common element
12039 followed by insertions. */
12041 /* The algorithm will fill matches[*][0] with the earliest matching element,
12042 and matches[X][1] with the count of duplicate elements (if X is the
12043 earliest element which has duplicates). */
12045 if (n_var == n_elts && n_elts <= 16)
12047 int matches[16][2] = {0};
12048 for (int i = 0; i < n_elts; i++)
12050 for (int j = 0; j <= i; j++)
12052 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12054 matches[i][0] = j;
12055 matches[j][1]++;
12056 break;
12060 int maxelement = 0;
12061 int maxv = 0;
12062 for (int i = 0; i < n_elts; i++)
12063 if (matches[i][1] > maxv)
12065 maxelement = i;
12066 maxv = matches[i][1];
12069 /* Create a duplicate of the most common element. */
12070 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12071 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
12073 /* Insert the rest. */
12074 for (int i = 0; i < n_elts; i++)
12076 rtx x = XVECEXP (vals, 0, i);
12077 if (matches[i][0] == maxelement)
12078 continue;
12079 x = copy_to_mode_reg (inner_mode, x);
12080 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12082 return;
12085 /* Initialise a vector which is part-variable. We want to first try
12086 to build those lanes which are constant in the most efficient way we
12087 can. */
12088 if (n_var != n_elts)
12090 rtx copy = copy_rtx (vals);
12092 /* Load constant part of vector. We really don't care what goes into the
12093 parts we will overwrite, but we're more likely to be able to load the
12094 constant efficiently if it has fewer, larger, repeating parts
12095 (see aarch64_simd_valid_immediate). */
12096 for (int i = 0; i < n_elts; i++)
12098 rtx x = XVECEXP (vals, 0, i);
12099 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12100 continue;
12101 rtx subst = any_const;
12102 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12104 /* Look in the copied vector, as more elements are const. */
12105 rtx test = XVECEXP (copy, 0, i ^ bit);
12106 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12108 subst = test;
12109 break;
12112 XVECEXP (copy, 0, i) = subst;
12114 aarch64_expand_vector_init (target, copy);
12117 /* Insert the variable lanes directly. */
12118 for (int i = 0; i < n_elts; i++)
12120 rtx x = XVECEXP (vals, 0, i);
12121 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12122 continue;
12123 x = copy_to_mode_reg (inner_mode, x);
12124 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
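/* Illustrative examples of the paths above: initialising a V4SImode
   vector from registers {a, b, a, a} (all variable) duplicates the most
   common element A across the vector and then inserts B into lane 1 with
   a single vec_set; a part-constant vector such as {a, 0, 0, 0} first
   builds the constant part (here a zero vector) recursively and then
   inserts A into lane 0.  */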
12128 static unsigned HOST_WIDE_INT
12129 aarch64_shift_truncation_mask (machine_mode mode)
12131 return
12132 (!SHIFT_COUNT_TRUNCATED
12133 || aarch64_vector_mode_supported_p (mode)
12134 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
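/* Reading of the expression above: if shift counts are not truncated, or
   MODE is a vector or vector-struct mode, the mask is 0 (no truncation is
   promised to the middle end); otherwise a scalar DImode shift gets the
   mask 63 and SImode gets 31.  Whether SHIFT_COUNT_TRUNCATED holds is a
   separate target-level setting.  */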
12137 /* Select a format to encode pointers in exception handling data. */
12139 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12141 int type;
12142 switch (aarch64_cmodel)
12144 case AARCH64_CMODEL_TINY:
12145 case AARCH64_CMODEL_TINY_PIC:
12146 case AARCH64_CMODEL_SMALL:
12147 case AARCH64_CMODEL_SMALL_PIC:
12148 case AARCH64_CMODEL_SMALL_SPIC:
12149 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12150 for everything. */
12151 type = DW_EH_PE_sdata4;
12152 break;
12153 default:
12154 /* No assumptions here. 8-byte relocs required. */
12155 type = DW_EH_PE_sdata8;
12156 break;
12158 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12161 /* The last .arch and .tune assembly strings that we printed. */
12162 static std::string aarch64_last_printed_arch_string;
12163 static std::string aarch64_last_printed_tune_string;
12165 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12166 by the function fndecl. */
12168 void
12169 aarch64_declare_function_name (FILE *stream, const char* name,
12170 tree fndecl)
12172 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12174 struct cl_target_option *targ_options;
12175 if (target_parts)
12176 targ_options = TREE_TARGET_OPTION (target_parts);
12177 else
12178 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12179 gcc_assert (targ_options);
12181 const struct processor *this_arch
12182 = aarch64_get_arch (targ_options->x_explicit_arch);
12184 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12185 std::string extension
12186 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12187 this_arch->flags);
12188 /* Only update the assembler .arch string if it is distinct from the last
12189 such string we printed. */
12190 std::string to_print = this_arch->name + extension;
12191 if (to_print != aarch64_last_printed_arch_string)
12193 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12194 aarch64_last_printed_arch_string = to_print;
12197 /* Print the cpu name we're tuning for in the comments; it might be
12198 useful to readers of the generated asm. Do it only when it changes
12199 from function to function and verbose assembly is requested. */
12200 const struct processor *this_tune
12201 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12203 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12205 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12206 this_tune->name);
12207 aarch64_last_printed_tune_string = this_tune->name;
12210 /* Don't forget the type directive for ELF. */
12211 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12212 ASM_OUTPUT_LABEL (stream, name);
12215 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12217 static void
12218 aarch64_start_file (void)
12220 struct cl_target_option *default_options
12221 = TREE_TARGET_OPTION (target_option_default_node);
12223 const struct processor *default_arch
12224 = aarch64_get_arch (default_options->x_explicit_arch);
12225 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12226 std::string extension
12227 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12228 default_arch->flags);
12230 aarch64_last_printed_arch_string = default_arch->name + extension;
12231 aarch64_last_printed_tune_string = "";
12232 asm_fprintf (asm_out_file, "\t.arch %s\n",
12233 aarch64_last_printed_arch_string.c_str ());
12235 default_file_start ();
12238 /* Emit load exclusive. */
12240 static void
12241 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12242 rtx mem, rtx model_rtx)
12244 rtx (*gen) (rtx, rtx, rtx);
12246 switch (mode)
12248 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12249 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12250 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12251 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12252 default:
12253 gcc_unreachable ();
12256 emit_insn (gen (rval, mem, model_rtx));
12259 /* Emit store exclusive. */
12261 static void
12262 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12263 rtx rval, rtx mem, rtx model_rtx)
12265 rtx (*gen) (rtx, rtx, rtx, rtx);
12267 switch (mode)
12269 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12270 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12271 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12272 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12273 default:
12274 gcc_unreachable ();
12277 emit_insn (gen (bval, rval, mem, model_rtx));
12280 /* Emit the jump INSN and mark it as unlikely to be taken. */
12282 static void
12283 aarch64_emit_unlikely_jump (rtx insn)
12285 rtx_insn *jump = emit_jump_insn (insn);
12286 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12289 /* Expand a compare and swap pattern. */
12291 void
12292 aarch64_expand_compare_and_swap (rtx operands[])
12294 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12295 machine_mode mode, cmp_mode;
12296 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12297 int idx;
12298 gen_cas_fn gen;
12299 const gen_cas_fn split_cas[] =
12301 gen_aarch64_compare_and_swapqi,
12302 gen_aarch64_compare_and_swaphi,
12303 gen_aarch64_compare_and_swapsi,
12304 gen_aarch64_compare_and_swapdi
12306 const gen_cas_fn atomic_cas[] =
12308 gen_aarch64_compare_and_swapqi_lse,
12309 gen_aarch64_compare_and_swaphi_lse,
12310 gen_aarch64_compare_and_swapsi_lse,
12311 gen_aarch64_compare_and_swapdi_lse
12314 bval = operands[0];
12315 rval = operands[1];
12316 mem = operands[2];
12317 oldval = operands[3];
12318 newval = operands[4];
12319 is_weak = operands[5];
12320 mod_s = operands[6];
12321 mod_f = operands[7];
12322 mode = GET_MODE (mem);
12323 cmp_mode = mode;
12325 /* Normally the succ memory model must be stronger than fail, but in the
12326 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12327 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12329 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12330 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12331 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12333 switch (mode)
12335 case E_QImode:
12336 case E_HImode:
12337 /* For short modes, we're going to perform the comparison in SImode,
12338 so do the zero-extension now. */
12339 cmp_mode = SImode;
12340 rval = gen_reg_rtx (SImode);
12341 oldval = convert_modes (SImode, mode, oldval, true);
12342 /* Fall through. */
12344 case E_SImode:
12345 case E_DImode:
12346 /* Force the value into a register if needed. */
12347 if (!aarch64_plus_operand (oldval, mode))
12348 oldval = force_reg (cmp_mode, oldval);
12349 break;
12351 default:
12352 gcc_unreachable ();
12355 switch (mode)
12357 case E_QImode: idx = 0; break;
12358 case E_HImode: idx = 1; break;
12359 case E_SImode: idx = 2; break;
12360 case E_DImode: idx = 3; break;
12361 default:
12362 gcc_unreachable ();
12364 if (TARGET_LSE)
12365 gen = atomic_cas[idx];
12366 else
12367 gen = split_cas[idx];
12369 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12371 if (mode == QImode || mode == HImode)
12372 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12374 x = gen_rtx_REG (CCmode, CC_REGNUM);
12375 x = gen_rtx_EQ (SImode, x, const0_rtx);
12376 emit_insn (gen_rtx_SET (bval, x));
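/* Illustrative expansion: a 32-bit __atomic_compare_exchange with
   seq-cst ordering selects idx == 2; with TARGET_LSE the
   aarch64_compare_and_swapsi_lse pattern is used (a single CAS-family
   instruction), otherwise aarch64_compare_and_swapsi, which is later
   split into an LDXR/STXR loop by aarch64_split_compare_and_swap below.
   QImode/HImode operands are zero-extended first so the comparison
   itself is done in SImode.  */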
12379 /* Test whether the target supports using an atomic load-operate instruction
12380    for operation CODE.  Returns FALSE if the operation isn't supported by the
12381    architecture. */
12385 bool
12386 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12388 if (!TARGET_LSE)
12389 return false;
12391 switch (code)
12393 case SET:
12394 case AND:
12395 case IOR:
12396 case XOR:
12397 case MINUS:
12398 case PLUS:
12399 return true;
12400 default:
12401 return false;
12405 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12406 sequence implementing an atomic operation. */
12408 static void
12409 aarch64_emit_post_barrier (enum memmodel model)
12411 const enum memmodel base_model = memmodel_base (model);
12413 if (is_mm_sync (model)
12414 && (base_model == MEMMODEL_ACQUIRE
12415 || base_model == MEMMODEL_ACQ_REL
12416 || base_model == MEMMODEL_SEQ_CST))
12418 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12422 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12423 for the data in memory. EXPECTED is the value expected to be in memory.
12424 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12425 is the memory ordering to use. */
12427 void
12428 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12429 rtx expected, rtx desired,
12430 rtx model)
12432 rtx (*gen) (rtx, rtx, rtx, rtx);
12433 machine_mode mode;
12435 mode = GET_MODE (mem);
12437 switch (mode)
12439 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12440 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12441 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12442 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12443 default:
12444 gcc_unreachable ();
12447 /* Move the expected value into the CAS destination register. */
12448 emit_insn (gen_rtx_SET (rval, expected));
12450 /* Emit the CAS. */
12451 emit_insn (gen (rval, mem, desired, model));
12453 /* Compare the expected value with the value loaded by the CAS, to establish
12454 whether the swap was made. */
12455 aarch64_gen_compare_reg (EQ, rval, expected);
12458 /* Split a compare and swap pattern. */
12460 void
12461 aarch64_split_compare_and_swap (rtx operands[])
12463 rtx rval, mem, oldval, newval, scratch;
12464 machine_mode mode;
12465 bool is_weak;
12466 rtx_code_label *label1, *label2;
12467 rtx x, cond;
12468 enum memmodel model;
12469 rtx model_rtx;
12471 rval = operands[0];
12472 mem = operands[1];
12473 oldval = operands[2];
12474 newval = operands[3];
12475 is_weak = (operands[4] != const0_rtx);
12476 model_rtx = operands[5];
12477 scratch = operands[7];
12478 mode = GET_MODE (mem);
12479 model = memmodel_from_int (INTVAL (model_rtx));
12481 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12482 loop:
12483 .label1:
12484 LD[A]XR rval, [mem]
12485 CBNZ rval, .label2
12486 ST[L]XR scratch, newval, [mem]
12487 CBNZ scratch, .label1
12488 .label2:
12489 CMP rval, 0. */
12490 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12492 label1 = NULL;
12493 if (!is_weak)
12495 label1 = gen_label_rtx ();
12496 emit_label (label1);
12498 label2 = gen_label_rtx ();
12500 /* The initial load can be relaxed for a __sync operation since a final
12501 barrier will be emitted to stop code hoisting. */
12502 if (is_mm_sync (model))
12503 aarch64_emit_load_exclusive (mode, rval, mem,
12504 GEN_INT (MEMMODEL_RELAXED));
12505 else
12506 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12508 if (strong_zero_p)
12510 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12511 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12512 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12513 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12515 else
12517 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12518 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12519 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12520 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12521 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12524 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12526 if (!is_weak)
12528 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12529 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12530 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12531 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12533 else
12535 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12536 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12537 emit_insn (gen_rtx_SET (cond, x));
12540 emit_label (label2);
12541 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12542 to set the condition flags. If this is not used it will be removed by
12543 later passes. */
12544 if (strong_zero_p)
12546 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12547 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12548 emit_insn (gen_rtx_SET (cond, x));
12550 /* Emit any final barrier needed for a __sync operation. */
12551 if (is_mm_sync (model))
12552 aarch64_emit_post_barrier (model);
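/* For the general strong case (OLDVAL not known to be zero) the sequence
   emitted above is roughly:
   .label1:
   LD[A]XR rval, [mem]
   CMP rval, oldval
   B.NE .label2
   ST[L]XR scratch, newval, [mem]
   CBNZ scratch, .label1
   .label2:
   with a final CMP against RVAL only added in the strong-zero variant
   shown in the comment above.  */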
12555 /* Emit a BIC instruction. */
12557 static void
12558 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12560 rtx shift_rtx = GEN_INT (shift);
12561 rtx (*gen) (rtx, rtx, rtx, rtx);
12563 switch (mode)
12565 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12566 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12567 default:
12568 gcc_unreachable ();
12571 emit_insn (gen (dst, s2, shift_rtx, s1));
12574 /* Emit an atomic swap. */
12576 static void
12577 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12578 rtx mem, rtx model)
12580 rtx (*gen) (rtx, rtx, rtx, rtx);
12582 switch (mode)
12584 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12585 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12586 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12587 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12588 default:
12589 gcc_unreachable ();
12592 emit_insn (gen (dst, mem, value, model));
12595 /* Operations supported by aarch64_emit_atomic_load_op. */
12597 enum aarch64_atomic_load_op_code
12599 AARCH64_LDOP_PLUS, /* A + B */
12600 AARCH64_LDOP_XOR, /* A ^ B */
12601 AARCH64_LDOP_OR, /* A | B */
12602 AARCH64_LDOP_BIC /* A & ~B */
12605 /* Emit an atomic load-operate. */
12607 static void
12608 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12609 machine_mode mode, rtx dst, rtx src,
12610 rtx mem, rtx model)
12612 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12613 const aarch64_atomic_load_op_fn plus[] =
12615 gen_aarch64_atomic_loadaddqi,
12616 gen_aarch64_atomic_loadaddhi,
12617 gen_aarch64_atomic_loadaddsi,
12618 gen_aarch64_atomic_loadadddi
12620 const aarch64_atomic_load_op_fn eor[] =
12622 gen_aarch64_atomic_loadeorqi,
12623 gen_aarch64_atomic_loadeorhi,
12624 gen_aarch64_atomic_loadeorsi,
12625 gen_aarch64_atomic_loadeordi
12627 const aarch64_atomic_load_op_fn ior[] =
12629 gen_aarch64_atomic_loadsetqi,
12630 gen_aarch64_atomic_loadsethi,
12631 gen_aarch64_atomic_loadsetsi,
12632 gen_aarch64_atomic_loadsetdi
12634 const aarch64_atomic_load_op_fn bic[] =
12636 gen_aarch64_atomic_loadclrqi,
12637 gen_aarch64_atomic_loadclrhi,
12638 gen_aarch64_atomic_loadclrsi,
12639 gen_aarch64_atomic_loadclrdi
12641 aarch64_atomic_load_op_fn gen;
12642 int idx = 0;
12644 switch (mode)
12646 case E_QImode: idx = 0; break;
12647 case E_HImode: idx = 1; break;
12648 case E_SImode: idx = 2; break;
12649 case E_DImode: idx = 3; break;
12650 default:
12651 gcc_unreachable ();
12654 switch (code)
12656 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12657 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12658 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12659 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12660 default:
12661 gcc_unreachable ();
12664 emit_insn (gen (dst, mem, src, model));
12667 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12668 location to store the data read from memory. OUT_RESULT is the location to
12669 store the result of the operation. MEM is the memory location to read and
12670 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12671 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12672 be NULL. */
12674 void
12675 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12676 rtx mem, rtx value, rtx model_rtx)
12678 machine_mode mode = GET_MODE (mem);
12679 machine_mode wmode = (mode == DImode ? DImode : SImode);
12680 const bool short_mode = (mode < SImode);
12681 aarch64_atomic_load_op_code ldop_code;
12682 rtx src;
12683 rtx x;
12685 if (out_data)
12686 out_data = gen_lowpart (mode, out_data);
12688 if (out_result)
12689 out_result = gen_lowpart (mode, out_result);
12691 /* Make sure the value is in a register, putting it into a destination
12692 register if it needs to be manipulated. */
12693 if (!register_operand (value, mode)
12694 || code == AND || code == MINUS)
12696 src = out_result ? out_result : out_data;
12697 emit_move_insn (src, gen_lowpart (mode, value));
12699 else
12700 src = value;
12701 gcc_assert (register_operand (src, mode));
12703 /* Preprocess the data for the operation as necessary. If the operation is
12704 a SET then emit a swap instruction and finish. */
12705 switch (code)
12707 case SET:
12708 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12709 return;
12711 case MINUS:
12712 /* Negate the value and treat it as a PLUS. */
12714 rtx neg_src;
12716 /* Resize the value if necessary. */
12717 if (short_mode)
12718 src = gen_lowpart (wmode, src);
12720 neg_src = gen_rtx_NEG (wmode, src);
12721 emit_insn (gen_rtx_SET (src, neg_src));
12723 if (short_mode)
12724 src = gen_lowpart (mode, src);
12726 /* Fall-through. */
12727 case PLUS:
12728 ldop_code = AARCH64_LDOP_PLUS;
12729 break;
12731 case IOR:
12732 ldop_code = AARCH64_LDOP_OR;
12733 break;
12735 case XOR:
12736 ldop_code = AARCH64_LDOP_XOR;
12737 break;
12739 case AND:
12741 rtx not_src;
12743 /* Resize the value if necessary. */
12744 if (short_mode)
12745 src = gen_lowpart (wmode, src);
12747 not_src = gen_rtx_NOT (wmode, src);
12748 emit_insn (gen_rtx_SET (src, not_src));
12750 if (short_mode)
12751 src = gen_lowpart (mode, src);
12753 ldop_code = AARCH64_LDOP_BIC;
12754 break;
12756 default:
12757 /* The operation can't be done with atomic instructions. */
12758 gcc_unreachable ();
12761 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12763 /* If necessary, calculate the data in memory after the update by redoing the
12764 operation from values in registers. */
12765 if (!out_result)
12766 return;
12768 if (short_mode)
12770 src = gen_lowpart (wmode, src);
12771 out_data = gen_lowpart (wmode, out_data);
12772 out_result = gen_lowpart (wmode, out_result);
12775 x = NULL_RTX;
12777 switch (code)
12779 case MINUS:
12780 case PLUS:
12781 x = gen_rtx_PLUS (wmode, out_data, src);
12782 break;
12783 case IOR:
12784 x = gen_rtx_IOR (wmode, out_data, src);
12785 break;
12786 case XOR:
12787 x = gen_rtx_XOR (wmode, out_data, src);
12788 break;
12789 case AND:
12790 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12791 return;
12792 default:
12793 gcc_unreachable ();
12796 emit_set_insn (out_result, x);
12798 return;
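/* Illustrative example: __atomic_fetch_and on an LSE target complements
   VALUE in a register and emits an atomic load-clear (the
   AARCH64_LDOP_BIC entry, LDCLR family), since A & B == A & ~(~B); if
   the caller also wants the post-operation value it is rebuilt afterwards
   as OUT_DATA & ~SRC via aarch64_emit_bic.  A fetch_sub is handled the
   same way by negating VALUE and using the LDADD family.  */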
12801 /* Split an atomic operation. */
12803 void
12804 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12805 rtx value, rtx model_rtx, rtx cond)
12807 machine_mode mode = GET_MODE (mem);
12808 machine_mode wmode = (mode == DImode ? DImode : SImode);
12809 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12810 const bool is_sync = is_mm_sync (model);
12811 rtx_code_label *label;
12812 rtx x;
12814 /* Split the atomic operation into a sequence. */
12815 label = gen_label_rtx ();
12816 emit_label (label);
12818 if (new_out)
12819 new_out = gen_lowpart (wmode, new_out);
12820 if (old_out)
12821 old_out = gen_lowpart (wmode, old_out);
12822 else
12823 old_out = new_out;
12824 value = simplify_gen_subreg (wmode, value, mode, 0);
12826 /* The initial load can be relaxed for a __sync operation since a final
12827 barrier will be emitted to stop code hoisting. */
12828 if (is_sync)
12829 aarch64_emit_load_exclusive (mode, old_out, mem,
12830 GEN_INT (MEMMODEL_RELAXED));
12831 else
12832 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12834 switch (code)
12836 case SET:
12837 new_out = value;
12838 break;
12840 case NOT:
12841 x = gen_rtx_AND (wmode, old_out, value);
12842 emit_insn (gen_rtx_SET (new_out, x));
12843 x = gen_rtx_NOT (wmode, new_out);
12844 emit_insn (gen_rtx_SET (new_out, x));
12845 break;
12847 case MINUS:
12848 if (CONST_INT_P (value))
12850 value = GEN_INT (-INTVAL (value));
12851 code = PLUS;
12853 /* Fall through. */
12855 default:
12856 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12857 emit_insn (gen_rtx_SET (new_out, x));
12858 break;
12861 aarch64_emit_store_exclusive (mode, cond, mem,
12862 gen_lowpart (mode, new_out), model_rtx);
12864 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12865 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12866 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12867 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12869 /* Emit any final barrier needed for a __sync operation. */
12870 if (is_sync)
12871 aarch64_emit_post_barrier (model);
12874 static void
12875 aarch64_init_libfuncs (void)
12877 /* Half-precision float operations. The compiler handles all operations
12878 with NULL libfuncs by converting to SFmode. */
12880 /* Conversions. */
12881 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12882 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12884 /* Arithmetic. */
12885 set_optab_libfunc (add_optab, HFmode, NULL);
12886 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12887 set_optab_libfunc (smul_optab, HFmode, NULL);
12888 set_optab_libfunc (neg_optab, HFmode, NULL);
12889 set_optab_libfunc (sub_optab, HFmode, NULL);
12891 /* Comparisons. */
12892 set_optab_libfunc (eq_optab, HFmode, NULL);
12893 set_optab_libfunc (ne_optab, HFmode, NULL);
12894 set_optab_libfunc (lt_optab, HFmode, NULL);
12895 set_optab_libfunc (le_optab, HFmode, NULL);
12896 set_optab_libfunc (ge_optab, HFmode, NULL);
12897 set_optab_libfunc (gt_optab, HFmode, NULL);
12898 set_optab_libfunc (unord_optab, HFmode, NULL);
12901 /* Target hook for c_mode_for_suffix. */
12902 static machine_mode
12903 aarch64_c_mode_for_suffix (char suffix)
12905 if (suffix == 'q')
12906 return TFmode;
12908 return VOIDmode;
12911 /* We can only represent floating point constants which will fit in
12912 "quarter-precision" values. These values are characterised by
12913 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given the form:
12916 (-1)^s * (n/16) * 2^r
12918 Where:
12919 's' is the sign bit.
12920 'n' is an integer in the range 16 <= n <= 31.
12921 'r' is an integer in the range -3 <= r <= 4. */
12923 /* Return true iff X can be represented by a quarter-precision
12924 floating point immediate operand.  Note, we cannot represent 0.0. */
12925 bool
12926 aarch64_float_const_representable_p (rtx x)
12928 /* This represents our current view of how many bits
12929 make up the mantissa. */
12930 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12931 int exponent;
12932 unsigned HOST_WIDE_INT mantissa, mask;
12933 REAL_VALUE_TYPE r, m;
12934 bool fail;
12936 if (!CONST_DOUBLE_P (x))
12937 return false;
12939 /* We don't support HFmode constants yet. */
12940 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12941 return false;
12943 r = *CONST_DOUBLE_REAL_VALUE (x);
12945 /* We cannot represent infinities, NaNs or +/-zero. We won't
12946 know if we have +zero until we analyse the mantissa, but we
12947 can reject the other invalid values. */
12948 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12949 || REAL_VALUE_MINUS_ZERO (r))
12950 return false;
12952 /* Extract exponent. */
12953 r = real_value_abs (&r);
12954 exponent = REAL_EXP (&r);
12956 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12957 highest (sign) bit, with a fixed binary point at bit point_pos.
12958 The low half of the result holds the low part of the mantissa, the high half the high part.
12959 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12960 bits for the mantissa, this can fail (low bits will be lost). */
12961 real_ldexp (&m, &r, point_pos - exponent);
12962 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12964 /* If the low part of the mantissa has bits set we cannot represent
12965 the value. */
12966 if (w.ulow () != 0)
12967 return false;
12968 /* We have rejected the lower HOST_WIDE_INT, so update our
12969 understanding of how many bits lie in the mantissa and
12970 look only at the high HOST_WIDE_INT. */
12971 mantissa = w.elt (1);
12972 point_pos -= HOST_BITS_PER_WIDE_INT;
12974 /* We can only represent values with a mantissa of the form 1.xxxx. */
12975 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12976 if ((mantissa & mask) != 0)
12977 return false;
12979 /* Having filtered unrepresentable values, we may now remove all
12980 but the highest 5 bits. */
12981 mantissa >>= point_pos - 5;
12983 /* We cannot represent the value 0.0, so reject it. This is handled
12984 elsewhere. */
12985 if (mantissa == 0)
12986 return false;
12988 /* Then, as bit 4 is always set, we can mask it off, leaving
12989 the mantissa in the range [0, 15]. */
12990 mantissa &= ~(1 << 4);
12991 gcc_assert (mantissa <= 15);
12993 /* GCC internally does not use IEEE754-like encoding (where normalized
12994 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12995 Our mantissa values are shifted 4 places to the left relative to
12996 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12997 by 5 places to correct for GCC's representation. */
12998 exponent = 5 - exponent;
13000 return (exponent >= 0 && exponent <= 7);
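/* Worked examples of the format above: 0.25 == (-1)^0 * (16/16) * 2^-2
   and 31.0 == (-1)^0 * (31/16) * 2^4 are both representable (and hence
   usable as FMOV-style immediates), while 0.1 (not a dyadic fraction with
   a 4-bit mantissa) and 129.0 (outside the [0.125, 31.0] range) are not.  */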
13003 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
13004 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
13005 output MOVI/MVNI, ORR or BIC immediate. */
13006 char*
13007 aarch64_output_simd_mov_immediate (rtx const_vector,
13008 machine_mode mode,
13009 unsigned width,
13010 enum simd_immediate_check which)
13012 bool is_valid;
13013 static char templ[40];
13014 const char *mnemonic;
13015 const char *shift_op;
13016 unsigned int lane_count = 0;
13017 char element_char;
13019 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13021 /* This will return true to show const_vector is legal for use as either
13022 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
13023 It will also update INFO to show how the immediate should be generated.
13024 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
13025 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false,
13026 &info, which);
13027 gcc_assert (is_valid);
13029 element_char = sizetochar (info.element_width);
13030 lane_count = width / info.element_width;
13032 mode = GET_MODE_INNER (mode);
13033 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13035 gcc_assert (info.shift == 0 && ! info.mvn);
13036 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13037 move immediate path. */
13038 if (aarch64_float_const_zero_rtx_p (info.value))
13039 info.value = GEN_INT (0);
13040 else
13042 const unsigned int buf_size = 20;
13043 char float_buf[buf_size] = {'\0'};
13044 real_to_decimal_for_mode (float_buf,
13045 CONST_DOUBLE_REAL_VALUE (info.value),
13046 buf_size, buf_size, 1, mode);
13048 if (lane_count == 1)
13049 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13050 else
13051 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13052 lane_count, element_char, float_buf);
13053 return templ;
13057 gcc_assert (CONST_INT_P (info.value));
13059 if (which == AARCH64_CHECK_MOV)
13061 mnemonic = info.mvn ? "mvni" : "movi";
13062 shift_op = info.msl ? "msl" : "lsl";
13063 if (lane_count == 1)
13064 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13065 mnemonic, UINTVAL (info.value));
13066 else if (info.shift)
13067 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
13068 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
13069 element_char, UINTVAL (info.value), shift_op, info.shift);
13070 else
13071 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
13072 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
13073 element_char, UINTVAL (info.value));
13075 else
13077 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
13078 mnemonic = info.mvn ? "bic" : "orr";
13079 if (info.shift)
13080 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
13081 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
13082 element_char, UINTVAL (info.value), "lsl", info.shift);
13083 else
13084 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
13085 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
13086 element_char, UINTVAL (info.value));
13088 return templ;
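/* Illustrative outputs (format only, not taken from a real compile): a
   V8QImode vector with every byte 0x55 should produce a template along
   the lines of "movi\t%0.8b, 0x55"; a V4HImode vector of 0xab00 in each
   lane would use the shifted form, e.g. "movi\t%0.4h, 0xab, lsl 8"; for
   the ORR/BIC checks the immediate is printed in decimal with a '#'
   prefix instead.  */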
13091 char*
13092 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13095 /* If a floating point number was passed and we desire to use it in an
13096 integer mode, do the conversion to integer. */
13097 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13099 unsigned HOST_WIDE_INT ival;
13100 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13101 gcc_unreachable ();
13102 immediate = gen_int_mode (ival, mode);
13105 machine_mode vmode;
13106 /* Use a 64-bit mode for everything except DImode/DFmode, where we use
13107    a 128-bit vector mode. */
13108 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13110 vmode = aarch64_simd_container_mode (mode, width);
13111 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13112 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13115 /* Split operands into moves from op[1] + op[2] into op[0]. */
13117 void
13118 aarch64_split_combinev16qi (rtx operands[3])
13120 unsigned int dest = REGNO (operands[0]);
13121 unsigned int src1 = REGNO (operands[1]);
13122 unsigned int src2 = REGNO (operands[2]);
13123 machine_mode halfmode = GET_MODE (operands[1]);
13124 unsigned int halfregs = REG_NREGS (operands[1]);
13125 rtx destlo, desthi;
13127 gcc_assert (halfmode == V16QImode);
13129 if (src1 == dest && src2 == dest + halfregs)
13131 /* No-op move. Can't split to nothing; emit something. */
13132 emit_note (NOTE_INSN_DELETED);
13133 return;
13136 /* Preserve register attributes for variable tracking. */
13137 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13138 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13139 GET_MODE_SIZE (halfmode));
13141 /* Special case of reversed high/low parts. */
13142 if (reg_overlap_mentioned_p (operands[2], destlo)
13143 && reg_overlap_mentioned_p (operands[1], desthi))
13145 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13146 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13147 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13149 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13151 /* Try to avoid unnecessary moves if part of the result
13152 is in the right place already. */
13153 if (src1 != dest)
13154 emit_move_insn (destlo, operands[1]);
13155 if (src2 != dest + halfregs)
13156 emit_move_insn (desthi, operands[2]);
13158 else
13160 if (src2 != dest + halfregs)
13161 emit_move_insn (desthi, operands[2]);
13162 if (src1 != dest)
13163 emit_move_insn (destlo, operands[1]);
13167 /* vec_perm support. */
13169 #define MAX_VECT_LEN 16
13171 struct expand_vec_perm_d
13173 rtx target, op0, op1;
13174 auto_vec_perm_indices perm;
13175 machine_mode vmode;
13176 bool one_vector_p;
13177 bool testing_p;
13180 /* Generate a variable permutation. */
13182 static void
13183 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13185 machine_mode vmode = GET_MODE (target);
13186 bool one_vector_p = rtx_equal_p (op0, op1);
13188 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13189 gcc_checking_assert (GET_MODE (op0) == vmode);
13190 gcc_checking_assert (GET_MODE (op1) == vmode);
13191 gcc_checking_assert (GET_MODE (sel) == vmode);
13192 gcc_checking_assert (TARGET_SIMD);
13194 if (one_vector_p)
13196 if (vmode == V8QImode)
13198 /* Expand the argument to a V16QI mode by duplicating it. */
13199 rtx pair = gen_reg_rtx (V16QImode);
13200 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13201 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13203 else
13205 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13208 else
13210 rtx pair;
13212 if (vmode == V8QImode)
13214 pair = gen_reg_rtx (V16QImode);
13215 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13216 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13218 else
13220 pair = gen_reg_rtx (OImode);
13221 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13222 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13227 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
13228 NELT is the number of elements in the vector. */
13230 void
13231 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
13232 unsigned int nelt)
13234 machine_mode vmode = GET_MODE (target);
13235 bool one_vector_p = rtx_equal_p (op0, op1);
13236 rtx mask;
13238 /* The TBL instruction does not use a modulo index, so we must take care
13239 of that ourselves. */
13240 mask = aarch64_simd_gen_const_vector_dup (vmode,
13241 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13242 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13244 /* For big-endian, we also need to reverse the index within the vector
13245 (but not which vector). */
13246 if (BYTES_BIG_ENDIAN)
13248 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13249 if (!one_vector_p)
13250 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13251 sel = expand_simple_binop (vmode, XOR, sel, mask,
13252 NULL, 0, OPTAB_LIB_WIDEN);
13254 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
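/* Illustrative example: with two V16QImode inputs (NELT == 16), a
   selector element of 35 is first reduced to 35 & 31 == 3 by the AND,
   because TBL does not wrap out-of-range indices the way vec_perm
   semantics require; on big-endian the index is additionally XORed with
   15 so that it still names the same architectural byte within its
   input vector.  */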
13257 /* Recognize patterns suitable for the TRN instructions. */
13258 static bool
13259 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13261 unsigned int i, odd, mask, nelt = d->perm.length ();
13262 rtx out, in0, in1, x;
13263 machine_mode vmode = d->vmode;
13265 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13266 return false;
13268 /* Note that these are little-endian tests.
13269 We correct for big-endian later. */
13270 if (d->perm[0] == 0)
13271 odd = 0;
13272 else if (d->perm[0] == 1)
13273 odd = 1;
13274 else
13275 return false;
13276 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13278 for (i = 0; i < nelt; i += 2)
13280 if (d->perm[i] != i + odd)
13281 return false;
13282 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13283 return false;
13286 /* Success! */
13287 if (d->testing_p)
13288 return true;
13290 in0 = d->op0;
13291 in1 = d->op1;
13292 if (BYTES_BIG_ENDIAN)
13294 x = in0, in0 = in1, in1 = x;
13295 odd = !odd;
13297 out = d->target;
13299 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
13300 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
13301 return true;
13304 /* Recognize patterns suitable for the UZP instructions. */
13305 static bool
13306 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13308 unsigned int i, odd, mask, nelt = d->perm.length ();
13309 rtx out, in0, in1, x;
13310 machine_mode vmode = d->vmode;
13312 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13313 return false;
13315 /* Note that these are little-endian tests.
13316 We correct for big-endian later. */
13317 if (d->perm[0] == 0)
13318 odd = 0;
13319 else if (d->perm[0] == 1)
13320 odd = 1;
13321 else
13322 return false;
13323 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13325 for (i = 0; i < nelt; i++)
13327 unsigned elt = (i * 2 + odd) & mask;
13328 if (d->perm[i] != elt)
13329 return false;
13332 /* Success! */
13333 if (d->testing_p)
13334 return true;
13336 in0 = d->op0;
13337 in1 = d->op1;
13338 if (BYTES_BIG_ENDIAN)
13340 x = in0, in0 = in1, in1 = x;
13341 odd = !odd;
13343 out = d->target;
13345 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
13346 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
13347 return true;
13350 /* Recognize patterns suitable for the ZIP instructions. */
13351 static bool
13352 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13354 unsigned int i, high, mask, nelt = d->perm.length ();
13355 rtx out, in0, in1, x;
13356 machine_mode vmode = d->vmode;
13358 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13359 return false;
13361 /* Note that these are little-endian tests.
13362 We correct for big-endian later. */
13363 high = nelt / 2;
13364 if (d->perm[0] == high)
13365 /* Do Nothing. */
13367 else if (d->perm[0] == 0)
13368 high = 0;
13369 else
13370 return false;
13371 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13373 for (i = 0; i < nelt / 2; i++)
13375 unsigned elt = (i + high) & mask;
13376 if (d->perm[i * 2] != elt)
13377 return false;
13378 elt = (elt + nelt) & mask;
13379 if (d->perm[i * 2 + 1] != elt)
13380 return false;
13383 /* Success! */
13384 if (d->testing_p)
13385 return true;
13387 in0 = d->op0;
13388 in1 = d->op1;
13389 if (BYTES_BIG_ENDIAN)
13391 x = in0, in0 = in1, in1 = x;
13392 high = !high;
13394 out = d->target;
13396 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
13397 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
13398 return true;
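/* Illustrative example: for V4SImode the permutation {0, 4, 1, 5}
   interleaves the low halves of the two inputs and is matched as ZIP1,
   while {2, 6, 3, 7} is matched as ZIP2; on big-endian the two inputs
   are swapped and the UNSPEC flipped so that the same architectural
   instruction is used.  */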
13401 /* Recognize patterns for the EXT insn. */
13403 static bool
13404 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13406 unsigned int i, nelt = d->perm.length ();
13407 rtx offset;
13409 unsigned int location = d->perm[0]; /* Always < nelt. */
13411 /* Check if the extracted indices are increasing by one. */
13412 for (i = 1; i < nelt; i++)
13414 unsigned int required = location + i;
13415 if (d->one_vector_p)
13417 /* We'll pass the same vector in twice, so allow indices to wrap. */
13418 required &= (nelt - 1);
13420 if (d->perm[i] != required)
13421 return false;
13424 /* Success! */
13425 if (d->testing_p)
13426 return true;
13428 /* The case where (location == 0) is a no-op for both big- and little-endian,
13429 and is removed by the mid-end at optimization levels -O1 and higher. */
13431 if (BYTES_BIG_ENDIAN && (location != 0))
13433 /* After setup, we want the high elements of the first vector (stored
13434 at the LSB end of the register), and the low elements of the second
13435 vector (stored at the MSB end of the register). So swap. */
13436 std::swap (d->op0, d->op1);
13437 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13438 location = nelt - location;
13441 offset = GEN_INT (location);
13442 emit_set_insn (d->target,
13443 gen_rtx_UNSPEC (d->vmode,
13444 gen_rtvec (3, d->op0, d->op1, offset),
13445 UNSPEC_EXT));
13446 return true;
13449 /* Recognize patterns for the REV insns. */
13451 static bool
13452 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13454 unsigned int i, j, diff, size, unspec, nelt = d->perm.length ();
13456 if (!d->one_vector_p)
13457 return false;
13459 diff = d->perm[0];
13460 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
13461 if (size == 8)
13462 unspec = UNSPEC_REV64;
13463 else if (size == 4)
13464 unspec = UNSPEC_REV32;
13465 else if (size == 2)
13466 unspec = UNSPEC_REV16;
13467 else
13468 return false;
13470 for (i = 0; i < nelt ; i += diff + 1)
13471 for (j = 0; j <= diff; j += 1)
13473 /* This is guaranteed to be true as the value of diff
12474 is 7, 3 or 1 and we should have enough elements in the
13475 queue to generate this. Getting a vector mask with a
13476 value of diff other than these values implies that
13477 something is wrong by the time we get here. */
13478 gcc_assert (i + j < nelt);
13479 if (d->perm[i + j] != i + diff - j)
13480 return false;
13483 /* Success! */
13484 if (d->testing_p)
13485 return true;
13487 emit_set_insn (d->target, gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0),
13488 unspec));
13489 return true;
13492 static bool
13493 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13495 rtx out = d->target;
13496 rtx in0;
13497 machine_mode vmode = d->vmode;
13498 unsigned int i, elt, nelt = d->perm.length ();
13499 rtx lane;
13501 elt = d->perm[0];
13502 for (i = 1; i < nelt; i++)
13504 if (elt != d->perm[i])
13505 return false;
13508 /* The generic preparation in aarch64_expand_vec_perm_const_1
13509 swaps the operand order and the permute indices if it finds
13510 d->perm[0] to be in the second operand. Thus, we can always
13511 use d->op0 and need not do any extra arithmetic to get the
13512 correct lane number. */
13513 in0 = d->op0;
13514 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13516 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
13517 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
13518 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
13519 return true;
13522 static bool
13523 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13525 rtx rperm[MAX_VECT_LEN], sel;
13526 machine_mode vmode = d->vmode;
13527 unsigned int i, nelt = d->perm.length ();
13529 if (d->testing_p)
13530 return true;
13532 /* Generic code will try constant permutation twice: once with the
13533 original mode and again with the elements lowered to QImode.
13534 So wait and don't do the selector expansion ourselves. */
13535 if (vmode != V8QImode && vmode != V16QImode)
13536 return false;
13538 for (i = 0; i < nelt; ++i)
13540 int nunits = GET_MODE_NUNITS (vmode);
13542 /* If big-endian and two vectors we end up with a weird mixed-endian
13543 mode on NEON. Reverse the index within each word but not the word
13544 itself. */
13545 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13546 : d->perm[i]);
13548 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13549 sel = force_reg (vmode, sel);
13551 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13552 return true;
13555 static bool
13556 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13558 /* The pattern matching functions above are written to look for a small
13559 number to begin the sequence (0, 1, N/2). If we begin with an index
13560 from the second operand, we can swap the operands. */
13561 unsigned int nelt = d->perm.length ();
13562 if (d->perm[0] >= nelt)
13564 gcc_assert (nelt == (nelt & -nelt));
13565 for (unsigned int i = 0; i < nelt; ++i)
13566 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13568 std::swap (d->op0, d->op1);
13571 if (TARGET_SIMD && nelt > 1)
13573 if (aarch64_evpc_rev (d))
13574 return true;
13575 else if (aarch64_evpc_ext (d))
13576 return true;
13577 else if (aarch64_evpc_dup (d))
13578 return true;
13579 else if (aarch64_evpc_zip (d))
13580 return true;
13581 else if (aarch64_evpc_uzp (d))
13582 return true;
13583 else if (aarch64_evpc_trn (d))
13584 return true;
13585 return aarch64_evpc_tbl (d);
13587 return false;
13590 /* Expand a vec_perm_const pattern with the operands given by TARGET,
13591 OP0, OP1 and SEL. NELT is the number of elements in the vector. */
13593 bool
13594 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel,
13595 unsigned int nelt)
13597 struct expand_vec_perm_d d;
13598 unsigned int i, which;
13600 d.target = target;
13601 d.op0 = op0;
13602 d.op1 = op1;
13604 d.vmode = GET_MODE (target);
13605 gcc_assert (VECTOR_MODE_P (d.vmode));
13606 d.testing_p = false;
13608 d.perm.reserve (nelt);
13609 for (i = which = 0; i < nelt; ++i)
13611 rtx e = XVECEXP (sel, 0, i);
13612 unsigned int ei = INTVAL (e) & (2 * nelt - 1);
13613 which |= (ei < nelt ? 1 : 2);
13614 d.perm.quick_push (ei);
13617 switch (which)
13619 default:
13620 gcc_unreachable ();
13622 case 3:
13623 d.one_vector_p = false;
13624 if (!rtx_equal_p (op0, op1))
13625 break;
13627 /* The elements of PERM do not suggest that only the first operand
13628 is used, but both operands are identical. Allow easier matching
13629 of the permutation by folding the permutation into the single
13630 input vector. */
13631 /* Fall Through. */
13632 case 2:
13633 for (i = 0; i < nelt; ++i)
13634 d.perm[i] &= nelt - 1;
13635 d.op0 = op1;
13636 d.one_vector_p = true;
13637 break;
13639 case 1:
13640 d.op1 = op0;
13641 d.one_vector_p = true;
13642 break;
13645 return aarch64_expand_vec_perm_const_1 (&d);
13648 static bool
13649 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
13651 struct expand_vec_perm_d d;
13652 unsigned int i, nelt, which;
13653 bool ret;
13655 d.vmode = vmode;
13656 d.testing_p = true;
13657 d.perm.safe_splice (sel);
13659 /* Calculate whether all elements are in one vector. */
13660 nelt = sel.length ();
13661 for (i = which = 0; i < nelt; ++i)
13663 unsigned int e = d.perm[i];
13664 gcc_assert (e < 2 * nelt);
13665 which |= (e < nelt ? 1 : 2);
13668 /* If all elements are from the second vector, reindex as if from the
13669 first vector. */
13670 if (which == 2)
13671 for (i = 0; i < nelt; ++i)
13672 d.perm[i] -= nelt;
13674 /* Check whether the mask can be applied to a single vector. */
13675 d.one_vector_p = (which != 3);
13677 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13678 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13679 if (!d.one_vector_p)
13680 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13682 start_sequence ();
13683 ret = aarch64_expand_vec_perm_const_1 (&d);
13684 end_sequence ();
13686 return ret;
13689 /* Generate a byte permute mask for a register of mode MODE,
13690 which has NUNITS units. */
13693 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
13695 /* We have to reverse each vector because we don't have
13696 a permuted load that can reverse-load according to ABI rules. */
13697 rtx mask;
13698 rtvec v = rtvec_alloc (16);
13699 unsigned int i, j;
13700 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
13702 gcc_assert (BYTES_BIG_ENDIAN);
13703 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13705 for (i = 0; i < nunits; i++)
13706 for (j = 0; j < usize; j++)
13707 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13708 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13709 return force_reg (V16QImode, mask);
13712 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
13713 true. However due to issues with register allocation it is preferable
13714 to avoid tying integer scalar and FP scalar modes. Executing integer
13715 operations in general registers is better than treating them as scalar
13716 vector operations. This reduces latency and avoids redundant int<->FP
13717 moves. So tie modes if they are either the same class, or vector modes
13718 with other vector modes, vector structs or any scalar mode. */
13720 static bool
13721 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13723 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13724 return true;
13726 /* We specifically want to allow elements of "structure" modes to
13727 be tieable to the structure. This more general condition allows
13728 other rarer situations too. */
13729 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13730 return true;
13732 /* Also allow any scalar modes with vectors. */
13733 if (aarch64_vector_mode_supported_p (mode1)
13734 || aarch64_vector_mode_supported_p (mode2))
13735 return true;
13737 return false;
13740 /* Return a new RTX holding the result of moving POINTER forward by
13741 AMOUNT bytes. */
13743 static rtx
13744 aarch64_move_pointer (rtx pointer, int amount)
13746 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13748 return adjust_automodify_address (pointer, GET_MODE (pointer),
13749 next, amount);
13752 /* Return a new RTX holding the result of moving POINTER forward by the
13753 size of the mode it points to. */
13755 static rtx
13756 aarch64_progress_pointer (rtx pointer)
13758 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13760 return aarch64_move_pointer (pointer, amount);
13763 /* Copy one MODE-sized block from SRC to DST, then progress SRC and DST by
13764    the size of MODE in bytes. */
13766 static void
13767 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13768 machine_mode mode)
13770 rtx reg = gen_reg_rtx (mode);
13772 /* "Cast" the pointers to the correct mode. */
13773 *src = adjust_address (*src, mode, 0);
13774 *dst = adjust_address (*dst, mode, 0);
13775 /* Emit the memcpy. */
13776 emit_move_insn (reg, *src);
13777 emit_move_insn (*dst, reg);
13778 /* Move the pointers forward. */
13779 *src = aarch64_progress_pointer (*src);
13780 *dst = aarch64_progress_pointer (*dst);
13783 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13784 we succeed, otherwise return false. */
13786 bool
13787 aarch64_expand_movmem (rtx *operands)
13789 unsigned int n;
13790 rtx dst = operands[0];
13791 rtx src = operands[1];
13792 rtx base;
13793 bool speed_p = !optimize_function_for_size_p (cfun);
13795 /* When optimizing for size, give a better estimate of the length of a
13796 memcpy call, but use the default otherwise. */
13797 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13799 /* We can't do anything smart if the amount to copy is not constant. */
13800 if (!CONST_INT_P (operands[2]))
13801 return false;
13803 n = UINTVAL (operands[2]);
13805 /* Try to keep the number of instructions low. For cases below 16 bytes we
13806 need to make at most two moves. For cases above 16 bytes it will be one
13807 move for each 16 byte chunk, then at most two additional moves. */
13808 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13809 return false;
13811 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13812 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13814 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13815 src = adjust_automodify_address (src, VOIDmode, base, 0);
13817 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13818 1-byte chunk. */
13819 if (n < 4)
13821 if (n >= 2)
13823 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13824 n -= 2;
13827 if (n == 1)
13828 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13830 return true;
13833 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13834 4-byte chunk, partially overlapping with the previously copied chunk. */
13835 if (n < 8)
13837 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13838 n -= 4;
13839 if (n > 0)
13841 int move = n - 4;
13843 src = aarch64_move_pointer (src, move);
13844 dst = aarch64_move_pointer (dst, move);
13845 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13847 return true;
13850 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13851 them, then (if applicable) an 8-byte chunk. */
13852 while (n >= 8)
13854 if (n / 16)
13856 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13857 n -= 16;
13859 else
13861 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13862 n -= 8;
13866 /* Finish the final bytes of the copy. We can always do this in one
13867 instruction. We either copy the exact amount we need, or partially
13868 overlap with the previous chunk we copied and copy 8 bytes. */
13869 if (n == 0)
13870 return true;
13871 else if (n == 1)
13872 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13873 else if (n == 2)
13874 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13875 else if (n == 4)
13876 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13877 else
13879 if (n == 3)
13881 src = aarch64_move_pointer (src, -1);
13882 dst = aarch64_move_pointer (dst, -1);
13883 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13885 else
13887 int move = n - 8;
13889 src = aarch64_move_pointer (src, move);
13890 dst = aarch64_move_pointer (dst, move);
13891 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13895 return true;
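/* As a concrete illustration of the strategy above: a 14-byte copy is
   expanded as one 8-byte block for bytes 0-7 followed by a second,
   overlapping 8-byte block for bytes 6-13, i.e. roughly two LDR/STR
   pairs and no byte-by-byte tail.  */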
13898 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13899 SImode stores. Handle the case when the constant has identical
13900 bottom and top halves. This is beneficial when the two stores can be
13901 merged into an STP and we avoid synthesising potentially expensive
13902 immediates twice. Return true if such a split is possible. */
13904 bool
13905 aarch64_split_dimode_const_store (rtx dst, rtx src)
13907 rtx lo = gen_lowpart (SImode, src);
13908 rtx hi = gen_highpart_mode (SImode, DImode, src);
13910 bool size_p = optimize_function_for_size_p (cfun);
13912 if (!rtx_equal_p (lo, hi))
13913 return false;
13915 unsigned int orig_cost
13916 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13917 unsigned int lo_cost
13918 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13920 /* We want to transform:
13921 MOV x1, 49370
13922 MOVK x1, 0x140, lsl 16
13923 MOVK x1, 0xc0da, lsl 32
13924 MOVK x1, 0x140, lsl 48
13925 STR x1, [x0]
13926 into:
13927 MOV w1, 49370
13928 MOVK w1, 0x140, lsl 16
13929 STP w1, w1, [x0]
13930 So we want to perform this only when we save two instructions
13931 or more. When optimizing for size, however, accept any code size
13932 savings we can. */
13933 if (size_p && orig_cost <= lo_cost)
13934 return false;
13936 if (!size_p
13937 && (orig_cost <= lo_cost + 1))
13938 return false;
13940 rtx mem_lo = adjust_address (dst, SImode, 0);
13941 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13942 return false;
13944 rtx tmp_reg = gen_reg_rtx (SImode);
13945 aarch64_expand_mov_immediate (tmp_reg, lo);
13946 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13947 /* Don't emit an explicit store pair as this may not be always profitable.
13948 Let the sched-fusion logic decide whether to merge them. */
13949 emit_move_insn (mem_lo, tmp_reg);
13950 emit_move_insn (mem_hi, tmp_reg);
13952 return true;
13955 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13957 static unsigned HOST_WIDE_INT
13958 aarch64_asan_shadow_offset (void)
13960 return (HOST_WIDE_INT_1 << 36);
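/* That is, with the default shadow scale of 3, the shadow byte for an
   address A lives at roughly (A >> 3) + (1 << 36).  */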
13963 static bool
13964 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13965 unsigned int align,
13966 enum by_pieces_operation op,
13967 bool speed_p)
13969 /* STORE_BY_PIECES can be used when copying a constant string, but
13970 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13971 For now we always fail this and let the move_by_pieces code copy
13972 the string from read-only memory. */
13973 if (op == STORE_BY_PIECES)
13974 return false;
13976 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
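/* Implement TARGET_GEN_CCMP_FIRST.  Emit the RTL for the first compare of
   a conditional-compare chain into *PREP_SEQ/*GEN_SEQ and return the
   comparison of the CC register against zero, or NULL_RTX if the
   comparison cannot be handled.  */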
13979 static rtx
13980 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13981 int code, tree treeop0, tree treeop1)
13983 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13984 rtx op0, op1;
13985 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13986 insn_code icode;
13987 struct expand_operand ops[4];
13989 start_sequence ();
13990 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13992 op_mode = GET_MODE (op0);
13993 if (op_mode == VOIDmode)
13994 op_mode = GET_MODE (op1);
13996 switch (op_mode)
13998 case E_QImode:
13999 case E_HImode:
14000 case E_SImode:
14001 cmp_mode = SImode;
14002 icode = CODE_FOR_cmpsi;
14003 break;
14005 case E_DImode:
14006 cmp_mode = DImode;
14007 icode = CODE_FOR_cmpdi;
14008 break;
14010 case E_SFmode:
14011 cmp_mode = SFmode;
14012 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14013 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14014 break;
14016 case E_DFmode:
14017 cmp_mode = DFmode;
14018 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14019 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14020 break;
14022 default:
14023 end_sequence ();
14024 return NULL_RTX;
14027 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14028 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14029 if (!op0 || !op1)
14031 end_sequence ();
14032 return NULL_RTX;
14034 *prep_seq = get_insns ();
14035 end_sequence ();
14037 create_fixed_operand (&ops[0], op0);
14038 create_fixed_operand (&ops[1], op1);
14040 start_sequence ();
14041 if (!maybe_expand_insn (icode, 2, ops))
14043 end_sequence ();
14044 return NULL_RTX;
14046 *gen_seq = get_insns ();
14047 end_sequence ();
14049 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14050 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
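/* Implement TARGET_GEN_CCMP_NEXT.  Emit the next, conditional compare of
   the chain as a CCMP/FCCMP.  For a combined condition such as
   (a == b && c == d) the resulting code is roughly:
       cmp   w0, w1
       ccmp  w2, w3, #0, eq
       b.eq  <target>  */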
14053 static rtx
14054 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14055 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14057 rtx op0, op1, target;
14058 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14059 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14060 insn_code icode;
14061 struct expand_operand ops[6];
14062 int aarch64_cond;
14064 push_to_sequence (*prep_seq);
14065 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14067 op_mode = GET_MODE (op0);
14068 if (op_mode == VOIDmode)
14069 op_mode = GET_MODE (op1);
14071 switch (op_mode)
14073 case E_QImode:
14074 case E_HImode:
14075 case E_SImode:
14076 cmp_mode = SImode;
14077 icode = CODE_FOR_ccmpsi;
14078 break;
14080 case E_DImode:
14081 cmp_mode = DImode;
14082 icode = CODE_FOR_ccmpdi;
14083 break;
14085 case E_SFmode:
14086 cmp_mode = SFmode;
14087 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14088 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14089 break;
14091 case E_DFmode:
14092 cmp_mode = DFmode;
14093 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14094 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14095 break;
14097 default:
14098 end_sequence ();
14099 return NULL_RTX;
14102 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14103 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14104 if (!op0 || !op1)
14106 end_sequence ();
14107 return NULL_RTX;
14109 *prep_seq = get_insns ();
14110 end_sequence ();
14112 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14113 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14115 if (bit_code != AND)
14117 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14118 GET_MODE (XEXP (prev, 0))),
14119 VOIDmode, XEXP (prev, 0), const0_rtx);
14120 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14123 create_fixed_operand (&ops[0], XEXP (prev, 0));
14124 create_fixed_operand (&ops[1], target);
14125 create_fixed_operand (&ops[2], op0);
14126 create_fixed_operand (&ops[3], op1);
14127 create_fixed_operand (&ops[4], prev);
14128 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14130 push_to_sequence (*gen_seq);
14131 if (!maybe_expand_insn (icode, 6, ops))
14133 end_sequence ();
14134 return NULL_RTX;
14137 *gen_seq = get_insns ();
14138 end_sequence ();
14140 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14143 #undef TARGET_GEN_CCMP_FIRST
14144 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14146 #undef TARGET_GEN_CCMP_NEXT
14147 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14149 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14150 instruction fusion of some sort. */
14152 static bool
14153 aarch64_macro_fusion_p (void)
14155 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14159 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14160 should be kept together during scheduling. */
14162 static bool
14163 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14165 rtx set_dest;
14166 rtx prev_set = single_set (prev);
14167 rtx curr_set = single_set (curr);
14168 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14169 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14171 if (!aarch64_macro_fusion_p ())
14172 return false;
14174 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14176 /* We are trying to match:
14177 prev (mov) == (set (reg r0) (const_int imm16))
14178 curr (movk) == (set (zero_extract (reg r0)
14179 (const_int 16)
14180 (const_int 16))
14181 (const_int imm16_1)) */
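/* In assembly terms this is, e.g., "mov w0, #0x1234" immediately
   followed by "movk w0, #0x5678, lsl 16".  */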
14183 set_dest = SET_DEST (curr_set);
14185 if (GET_CODE (set_dest) == ZERO_EXTRACT
14186 && CONST_INT_P (SET_SRC (curr_set))
14187 && CONST_INT_P (SET_SRC (prev_set))
14188 && CONST_INT_P (XEXP (set_dest, 2))
14189 && INTVAL (XEXP (set_dest, 2)) == 16
14190 && REG_P (XEXP (set_dest, 0))
14191 && REG_P (SET_DEST (prev_set))
14192 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14194 return true;
14198 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14201 /* We're trying to match:
14202 prev (adrp) == (set (reg r1)
14203 (high (symbol_ref ("SYM"))))
14204 curr (add) == (set (reg r0)
14205 (lo_sum (reg r1)
14206 (symbol_ref ("SYM"))))
14207 Note that r0 need not necessarily be the same as r1, especially
14208 during pre-regalloc scheduling. */
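/* E.g. "adrp x1, sym" immediately followed by "add x0, x1, :lo12:sym".  */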
14210 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14211 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14213 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14214 && REG_P (XEXP (SET_SRC (curr_set), 0))
14215 && REGNO (XEXP (SET_SRC (curr_set), 0))
14216 == REGNO (SET_DEST (prev_set))
14217 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14218 XEXP (SET_SRC (curr_set), 1)))
14219 return true;
14223 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14226 /* We're trying to match:
14227 prev (movk) == (set (zero_extract (reg r0)
14228 (const_int 16)
14229 (const_int 32))
14230 (const_int imm16_1))
14231 curr (movk) == (set (zero_extract (reg r0)
14232 (const_int 16)
14233 (const_int 48))
14234 (const_int imm16_2)) */
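/* E.g. "movk x0, #0x1234, lsl 32" immediately followed by
   "movk x0, #0x5678, lsl 48".  */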
14236 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14237 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14238 && REG_P (XEXP (SET_DEST (prev_set), 0))
14239 && REG_P (XEXP (SET_DEST (curr_set), 0))
14240 && REGNO (XEXP (SET_DEST (prev_set), 0))
14241 == REGNO (XEXP (SET_DEST (curr_set), 0))
14242 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14243 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14244 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14245 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14246 && CONST_INT_P (SET_SRC (prev_set))
14247 && CONST_INT_P (SET_SRC (curr_set)))
14248 return true;
14251 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14253 /* We're trying to match:
14254 prev (adrp) == (set (reg r0)
14255 (high (symbol_ref ("SYM"))))
14256 curr (ldr) == (set (reg r1)
14257 (mem (lo_sum (reg r0)
14258 (symbol_ref ("SYM")))))
14260 curr (ldr) == (set (reg r1)
14261 (zero_extend (mem
14262 (lo_sum (reg r0)
14263 (symbol_ref ("SYM")))))) */
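/* E.g. "adrp x0, sym" immediately followed by "ldr x1, [x0, :lo12:sym]".  */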
14264 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14265 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14267 rtx curr_src = SET_SRC (curr_set);
14269 if (GET_CODE (curr_src) == ZERO_EXTEND)
14270 curr_src = XEXP (curr_src, 0);
14272 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14273 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14274 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14275 == REGNO (SET_DEST (prev_set))
14276 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14277 XEXP (SET_SRC (prev_set), 0)))
14278 return true;
14282 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14283 && aarch_crypto_can_dual_issue (prev, curr))
14284 return true;
14286 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14287 && any_condjump_p (curr))
14289 enum attr_type prev_type = get_attr_type (prev);
14291 unsigned int condreg1, condreg2;
14292 rtx cc_reg_1;
14293 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14294 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14296 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14297 && prev
14298 && modified_in_p (cc_reg_1, prev))
14300 /* FIXME: this misses some cases that are considered simple arithmetic
14301 instructions for ThunderX. Simple shifts are missed here. */
14302 if (prev_type == TYPE_ALUS_SREG
14303 || prev_type == TYPE_ALUS_IMM
14304 || prev_type == TYPE_LOGICS_REG
14305 || prev_type == TYPE_LOGICS_IMM)
14306 return true;
14310 if (prev_set
14311 && curr_set
14312 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14313 && any_condjump_p (curr))
14315 /* We're trying to match:
14316 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14317 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14318 (const_int 0))
14319 (label_ref ("SYM"))
14320 (pc)) */
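/* E.g. "add w0, w0, #1" immediately followed by "cbnz w0, label".  */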
14321 if (SET_DEST (curr_set) == (pc_rtx)
14322 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14323 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14324 && REG_P (SET_DEST (prev_set))
14325 && REGNO (SET_DEST (prev_set))
14326 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14328 /* Fuse ALU operations followed by conditional branch instruction. */
14329 switch (get_attr_type (prev))
14331 case TYPE_ALU_IMM:
14332 case TYPE_ALU_SREG:
14333 case TYPE_ADC_REG:
14334 case TYPE_ADC_IMM:
14335 case TYPE_ADCS_REG:
14336 case TYPE_ADCS_IMM:
14337 case TYPE_LOGIC_REG:
14338 case TYPE_LOGIC_IMM:
14339 case TYPE_CSEL:
14340 case TYPE_ADR:
14341 case TYPE_MOV_IMM:
14342 case TYPE_SHIFT_REG:
14343 case TYPE_SHIFT_IMM:
14344 case TYPE_BFM:
14345 case TYPE_RBIT:
14346 case TYPE_REV:
14347 case TYPE_EXTEND:
14348 return true;
14350 default:;
14355 return false;
14358 /* Return true iff the instruction fusion described by OP is enabled. */
14360 bool
14361 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14363 return (aarch64_tune_params.fusible_ops & op) != 0;
14366 /* If MEM is in the form of [base+offset], extract the two parts
14367 of the address and store them in BASE and OFFSET, otherwise return false
14368 after clearing BASE and OFFSET. */
14370 bool
14371 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14373 rtx addr;
14375 gcc_assert (MEM_P (mem));
14377 addr = XEXP (mem, 0);
14379 if (REG_P (addr))
14381 *base = addr;
14382 *offset = const0_rtx;
14383 return true;
14386 if (GET_CODE (addr) == PLUS
14387 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14389 *base = XEXP (addr, 0);
14390 *offset = XEXP (addr, 1);
14391 return true;
14394 *base = NULL_RTX;
14395 *offset = NULL_RTX;
14397 return false;
14400 /* Types for scheduling fusion. */
14401 enum sched_fusion_type
14403 SCHED_FUSION_NONE = 0,
14404 SCHED_FUSION_LD_SIGN_EXTEND,
14405 SCHED_FUSION_LD_ZERO_EXTEND,
14406 SCHED_FUSION_LD,
14407 SCHED_FUSION_ST,
14408 SCHED_FUSION_NUM
14411 /* If INSN is a load or store whose address is in the form of [base+offset],
14412 extract the two parts and store them in BASE and OFFSET. Return the
14413 scheduling fusion type of this INSN. */
14415 static enum sched_fusion_type
14416 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14418 rtx x, dest, src;
14419 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14421 gcc_assert (INSN_P (insn));
14422 x = PATTERN (insn);
14423 if (GET_CODE (x) != SET)
14424 return SCHED_FUSION_NONE;
14426 src = SET_SRC (x);
14427 dest = SET_DEST (x);
14429 machine_mode dest_mode = GET_MODE (dest);
14431 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14432 return SCHED_FUSION_NONE;
14434 if (GET_CODE (src) == SIGN_EXTEND)
14436 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14437 src = XEXP (src, 0);
14438 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14439 return SCHED_FUSION_NONE;
14441 else if (GET_CODE (src) == ZERO_EXTEND)
14443 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14444 src = XEXP (src, 0);
14445 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14446 return SCHED_FUSION_NONE;
14449 if (GET_CODE (src) == MEM && REG_P (dest))
14450 extract_base_offset_in_addr (src, base, offset);
14451 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14453 fusion = SCHED_FUSION_ST;
14454 extract_base_offset_in_addr (dest, base, offset);
14456 else
14457 return SCHED_FUSION_NONE;
14459 if (*base == NULL_RTX || *offset == NULL_RTX)
14460 fusion = SCHED_FUSION_NONE;
14462 return fusion;
14465 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14467 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14468 and PRI are only calculated for these instructions. For other instructions,
14469 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
14470 other instruction types can be added by returning different priorities.
14472 It's important that irrelevant instructions get the largest FUSION_PRI. */
14474 static void
14475 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14476 int *fusion_pri, int *pri)
14478 int tmp, off_val;
14479 rtx base, offset;
14480 enum sched_fusion_type fusion;
14482 gcc_assert (INSN_P (insn));
14484 tmp = max_pri - 1;
14485 fusion = fusion_load_store (insn, &base, &offset);
14486 if (fusion == SCHED_FUSION_NONE)
14488 *pri = tmp;
14489 *fusion_pri = tmp;
14490 return;
14493 /* Set FUSION_PRI according to fusion type and base register. */
14494 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14496 /* Calculate PRI. */
14497 tmp /= 2;
14499 /* INSN with smaller offset goes first. */
14500 off_val = (int)(INTVAL (offset));
14501 if (off_val >= 0)
14502 tmp -= (off_val & 0xfffff);
14503 else
14504 tmp += ((- off_val) & 0xfffff);
14506 *pri = tmp;
14507 return;
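/* For example, for the loads "ldr w0, [x1, 4]" and "ldr w1, [x1, 8]"
   both insns get the same FUSION_PRI (same fusion type and base
   register), while the load at offset 4 gets the larger PRI and is
   therefore scheduled first.  */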
14510 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14511 Adjust priority of sha1h instructions so they are scheduled before
14512 other SHA1 instructions. */
14514 static int
14515 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14517 rtx x = PATTERN (insn);
14519 if (GET_CODE (x) == SET)
14521 x = SET_SRC (x);
14523 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14524 return priority + 10;
14527 return priority;
14530 /* Given OPERANDS of consecutive load/store, check if we can merge
14531 them into ldp/stp. LOAD is true if they are load instructions.
14532 MODE is the mode of memory operands. */
14534 bool
14535 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14536 machine_mode mode)
14538 HOST_WIDE_INT offval_1, offval_2, msize;
14539 enum reg_class rclass_1, rclass_2;
14540 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14542 if (load)
14544 mem_1 = operands[1];
14545 mem_2 = operands[3];
14546 reg_1 = operands[0];
14547 reg_2 = operands[2];
14548 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14549 if (REGNO (reg_1) == REGNO (reg_2))
14550 return false;
14552 else
14554 mem_1 = operands[0];
14555 mem_2 = operands[2];
14556 reg_1 = operands[1];
14557 reg_2 = operands[3];
14560 /* The mems cannot be volatile. */
14561 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14562 return false;
14564 /* If we have SImode and slow unaligned ldp,
14565 check that the alignment is at least 8 bytes. */
14566 if (mode == SImode
14567 && (aarch64_tune_params.extra_tuning_flags
14568 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14569 && !optimize_size
14570 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14571 return false;
14573 /* Check if the addresses are in the form of [base+offset]. */
14574 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14575 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14576 return false;
14577 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14578 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14579 return false;
14581 /* Check if the bases are the same. */
14582 if (!rtx_equal_p (base_1, base_2))
14583 return false;
14585 offval_1 = INTVAL (offset_1);
14586 offval_2 = INTVAL (offset_2);
14587 msize = GET_MODE_SIZE (mode);
14588 /* Check if the offsets are consecutive. */
14589 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14590 return false;
14592 /* Check if the addresses are clobbered by load. */
14593 if (load)
14595 if (reg_mentioned_p (reg_1, mem_1))
14596 return false;
14598 /* In increasing order, the last load can clobber the address. */
14599 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14600 return false;
14603 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14604 rclass_1 = FP_REGS;
14605 else
14606 rclass_1 = GENERAL_REGS;
14608 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14609 rclass_2 = FP_REGS;
14610 else
14611 rclass_2 = GENERAL_REGS;
14613 /* Check if the registers are of the same class. */
14614 if (rclass_1 != rclass_2)
14615 return false;
14617 return true;
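/* For example, the loads "ldr w0, [x3]" and "ldr w1, [x3, 4]" satisfy
   all of the checks above, so the ldp/stp peepholes can merge them
   into "ldp w0, w1, [x3]".  */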
14620 /* Given OPERANDS of consecutive load/store, check if we can merge
14621 them into ldp/stp by adjusting the offset. LOAD is true if they
14622 are load instructions. MODE is the mode of memory operands.
14624 Given the following consecutive stores:
14626 str w1, [xb, 0x100]
14627 str w1, [xb, 0x104]
14628 str w1, [xb, 0x108]
14629 str w1, [xb, 0x10c]
14631 Though the offsets are out of the range supported by stp, we can
14632 still pair them after adjusting the offset, like:
14634 add scratch, xb, 0x100
14635 stp w1, w1, [scratch]
14636 stp w1, w1, [scratch, 0x8]
14638 The peephole patterns detecting this opportunity should guarantee
14639 the scratch register is available. */
14641 bool
14642 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14643 scalar_mode mode)
14645 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14646 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14647 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14648 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14650 if (load)
14652 reg_1 = operands[0];
14653 mem_1 = operands[1];
14654 reg_2 = operands[2];
14655 mem_2 = operands[3];
14656 reg_3 = operands[4];
14657 mem_3 = operands[5];
14658 reg_4 = operands[6];
14659 mem_4 = operands[7];
14660 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14661 && REG_P (reg_3) && REG_P (reg_4));
14662 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14663 return false;
14665 else
14667 mem_1 = operands[0];
14668 reg_1 = operands[1];
14669 mem_2 = operands[2];
14670 reg_2 = operands[3];
14671 mem_3 = operands[4];
14672 reg_3 = operands[5];
14673 mem_4 = operands[6];
14674 reg_4 = operands[7];
14676 /* Skip if the memory operand is by itself valid for ldp/stp. */
14677 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14678 return false;
14680 /* The mems cannot be volatile. */
14681 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14682 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14683 return false;
14685 /* Check if the addresses are in the form of [base+offset]. */
14686 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14687 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14688 return false;
14689 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14690 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14691 return false;
14692 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14693 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14694 return false;
14695 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14696 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14697 return false;
14699 /* Check if the bases are the same. */
14700 if (!rtx_equal_p (base_1, base_2)
14701 || !rtx_equal_p (base_2, base_3)
14702 || !rtx_equal_p (base_3, base_4))
14703 return false;
14705 offval_1 = INTVAL (offset_1);
14706 offval_2 = INTVAL (offset_2);
14707 offval_3 = INTVAL (offset_3);
14708 offval_4 = INTVAL (offset_4);
14709 msize = GET_MODE_SIZE (mode);
14710 /* Check if the offsets are consecutive. */
14711 if ((offval_1 != (offval_2 + msize)
14712 || offval_1 != (offval_3 + msize * 2)
14713 || offval_1 != (offval_4 + msize * 3))
14714 && (offval_4 != (offval_3 + msize)
14715 || offval_4 != (offval_2 + msize * 2)
14716 || offval_4 != (offval_1 + msize * 3)))
14717 return false;
14719 /* Check if the addresses are clobbered by load. */
14720 if (load)
14722 if (reg_mentioned_p (reg_1, mem_1)
14723 || reg_mentioned_p (reg_2, mem_2)
14724 || reg_mentioned_p (reg_3, mem_3))
14725 return false;
14727 /* In increasing order, the last load can clobber the address. */
14728 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14729 return false;
14732 /* If we have SImode and slow unaligned ldp,
14733 check that the alignment is at least 8 bytes. */
14734 if (mode == SImode
14735 && (aarch64_tune_params.extra_tuning_flags
14736 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14737 && !optimize_size
14738 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14739 return false;
14741 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14742 rclass_1 = FP_REGS;
14743 else
14744 rclass_1 = GENERAL_REGS;
14746 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14747 rclass_2 = FP_REGS;
14748 else
14749 rclass_2 = GENERAL_REGS;
14751 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14752 rclass_3 = FP_REGS;
14753 else
14754 rclass_3 = GENERAL_REGS;
14756 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14757 rclass_4 = FP_REGS;
14758 else
14759 rclass_4 = GENERAL_REGS;
14761 /* Check if the registers are of the same class. */
14762 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14763 return false;
14765 return true;
14768 /* Given OPERANDS of consecutive load/store, this function pairs them
14769 into ldp/stp after adjusting the offset. It depends on the fact
14770 that addresses of load/store instructions are in increasing order.
14771 MODE is the mode of memory operands. CODE is the rtl operator
14772 which should be applied to all memory operands; it is SIGN_EXTEND,
14773 ZERO_EXTEND or UNKNOWN. */
14775 bool
14776 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14777 scalar_mode mode, RTX_CODE code)
14779 rtx base, offset, t1, t2;
14780 rtx mem_1, mem_2, mem_3, mem_4;
14781 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14783 if (load)
14785 mem_1 = operands[1];
14786 mem_2 = operands[3];
14787 mem_3 = operands[5];
14788 mem_4 = operands[7];
14790 else
14792 mem_1 = operands[0];
14793 mem_2 = operands[2];
14794 mem_3 = operands[4];
14795 mem_4 = operands[6];
14796 gcc_assert (code == UNKNOWN);
14799 extract_base_offset_in_addr (mem_1, &base, &offset);
14800 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14802 /* Adjust the offset so it can fit in an ldp/stp instruction. */
14803 msize = GET_MODE_SIZE (mode);
14804 stp_off_limit = msize * 0x40;
14805 off_val = INTVAL (offset);
14806 abs_off = (off_val < 0) ? -off_val : off_val;
14807 new_off = abs_off % stp_off_limit;
14808 adj_off = abs_off - new_off;
14810 /* Further adjust to make sure all offsets are OK. */
14811 if ((new_off + msize * 2) >= stp_off_limit)
14813 adj_off += stp_off_limit;
14814 new_off -= stp_off_limit;
14817 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14818 if (adj_off >= 0x1000)
14819 return false;
14821 if (off_val < 0)
14823 adj_off = -adj_off;
14824 new_off = -new_off;
14827 /* Create new memory references. */
14828 mem_1 = change_address (mem_1, VOIDmode,
14829 plus_constant (DImode, operands[8], new_off));
14831 /* Check if the adjusted address is OK for ldp/stp. */
14832 if (!aarch64_mem_pair_operand (mem_1, mode))
14833 return false;
14835 msize = GET_MODE_SIZE (mode);
14836 mem_2 = change_address (mem_2, VOIDmode,
14837 plus_constant (DImode,
14838 operands[8],
14839 new_off + msize));
14840 mem_3 = change_address (mem_3, VOIDmode,
14841 plus_constant (DImode,
14842 operands[8],
14843 new_off + msize * 2));
14844 mem_4 = change_address (mem_4, VOIDmode,
14845 plus_constant (DImode,
14846 operands[8],
14847 new_off + msize * 3));
14849 if (code == ZERO_EXTEND)
14851 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14852 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14853 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14854 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14856 else if (code == SIGN_EXTEND)
14858 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14859 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14860 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14861 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14864 if (load)
14866 operands[1] = mem_1;
14867 operands[3] = mem_2;
14868 operands[5] = mem_3;
14869 operands[7] = mem_4;
14871 else
14873 operands[0] = mem_1;
14874 operands[2] = mem_2;
14875 operands[4] = mem_3;
14876 operands[6] = mem_4;
14879 /* Emit adjusting instruction. */
14880 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14881 /* Emit ldp/stp instructions. */
14882 t1 = gen_rtx_SET (operands[0], operands[1]);
14883 t2 = gen_rtx_SET (operands[2], operands[3]);
14884 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14885 t1 = gen_rtx_SET (operands[4], operands[5]);
14886 t2 = gen_rtx_SET (operands[6], operands[7]);
14887 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14888 return true;
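/* Working through the stp example quoted before
   aarch64_operands_adjust_ok_for_ldpstp: for SImode, msize is 4 and
   stp_off_limit is 0x100, so an original offset of 0x100 gives
   abs_off == 0x100, new_off == 0 and adj_off == 0x100.  The code above
   then emits "add scratch, xb, 0x100" followed by two stp instructions
   at [scratch] and [scratch, 8].  */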
14891 /* Return true if a pseudo register should be created and used to hold
14892 the GOT address for PIC code. */
14894 bool
14895 aarch64_use_pseudo_pic_reg (void)
14897 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14900 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14902 static int
14903 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14905 switch (XINT (x, 1))
14907 case UNSPEC_GOTSMALLPIC:
14908 case UNSPEC_GOTSMALLPIC28K:
14909 case UNSPEC_GOTTINYPIC:
14910 return 0;
14911 default:
14912 break;
14915 return default_unspec_may_trap_p (x, flags);
14919 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14920 return the log2 of that value. Otherwise return -1. */
14922 int
14923 aarch64_fpconst_pow_of_2 (rtx x)
14925 const REAL_VALUE_TYPE *r;
14927 if (!CONST_DOUBLE_P (x))
14928 return -1;
14930 r = CONST_DOUBLE_REAL_VALUE (x);
14932 if (REAL_VALUE_NEGATIVE (*r)
14933 || REAL_VALUE_ISNAN (*r)
14934 || REAL_VALUE_ISINF (*r)
14935 || !real_isinteger (r, DFmode))
14936 return -1;
14938 return exact_log2 (real_to_integer (r));
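/* E.g. 8.0 yields 3 and 1.0 yields 0, while 0.75, -4.0 and NaN all
   yield -1.  */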
14941 /* If X is a vector of equal CONST_DOUBLE values and that value is
14942 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14944 int
14945 aarch64_vec_fpconst_pow_of_2 (rtx x)
14947 if (GET_CODE (x) != CONST_VECTOR)
14948 return -1;
14950 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14951 return -1;
14953 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14954 if (firstval <= 0)
14955 return -1;
14957 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14958 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14959 return -1;
14961 return firstval;
14964 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14965 to float.
14967 __fp16 always promotes through this hook.
14968 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14969 through the generic excess precision logic rather than here. */
14971 static tree
14972 aarch64_promoted_type (const_tree t)
14974 if (SCALAR_FLOAT_TYPE_P (t)
14975 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14976 return float_type_node;
14978 return NULL_TREE;
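/* For example, given "__fp16 a, b;" the expression "a + b" is evaluated
   in float and only converted back to __fp16 if the result is stored in
   an __fp16 object.  */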
14981 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14983 static bool
14984 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14985 optimization_type opt_type)
14987 switch (op)
14989 case rsqrt_optab:
14990 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14992 default:
14993 return true;
14997 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
14998 if MODE is HFmode, and punt to the generic implementation otherwise. */
15000 static bool
15001 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15003 return (mode == HFmode
15004 ? true
15005 : default_libgcc_floating_mode_supported_p (mode));
15008 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15009 if MODE is HFmode, and punt to the generic implementation otherwise. */
15011 static bool
15012 aarch64_scalar_mode_supported_p (scalar_mode mode)
15014 return (mode == HFmode
15015 ? true
15016 : default_scalar_mode_supported_p (mode));
15019 /* Set the value of FLT_EVAL_METHOD.
15020 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15022 0: evaluate all operations and constants, whose semantic type has at
15023 most the range and precision of type float, to the range and
15024 precision of float; evaluate all other operations and constants to
15025 the range and precision of the semantic type;
15027 N, where _FloatN is a supported interchange floating type
15028 evaluate all operations and constants, whose semantic type has at
15029 most the range and precision of _FloatN type, to the range and
15030 precision of the _FloatN type; evaluate all other operations and
15031 constants to the range and precision of the semantic type;
15033 If we have the ARMv8.2-A extensions then we support _Float16 in native
15034 precision, so we should set this to 16. Otherwise, we support the type,
15035 but want to evaluate expressions in float precision, so set this to
15036 0. */
15038 static enum flt_eval_method
15039 aarch64_excess_precision (enum excess_precision_type type)
15041 switch (type)
15043 case EXCESS_PRECISION_TYPE_FAST:
15044 case EXCESS_PRECISION_TYPE_STANDARD:
15045 /* We can calculate either in 16-bit range and precision or
15046 32-bit range and precision. Make that decision based on whether
15047 we have native support for the ARMv8.2-A 16-bit floating-point
15048 instructions or not. */
15049 return (TARGET_FP_F16INST
15050 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15051 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15052 case EXCESS_PRECISION_TYPE_IMPLICIT:
15053 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15054 default:
15055 gcc_unreachable ();
15057 return FLT_EVAL_METHOD_UNPREDICTABLE;
15060 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15061 scheduled for speculative execution. Reject the long-running division
15062 and square-root instructions. */
15064 static bool
15065 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15067 switch (get_attr_type (insn))
15069 case TYPE_SDIV:
15070 case TYPE_UDIV:
15071 case TYPE_FDIVS:
15072 case TYPE_FDIVD:
15073 case TYPE_FSQRTS:
15074 case TYPE_FSQRTD:
15075 case TYPE_NEON_FP_SQRT_S:
15076 case TYPE_NEON_FP_SQRT_D:
15077 case TYPE_NEON_FP_SQRT_S_Q:
15078 case TYPE_NEON_FP_SQRT_D_Q:
15079 case TYPE_NEON_FP_DIV_S:
15080 case TYPE_NEON_FP_DIV_D:
15081 case TYPE_NEON_FP_DIV_S_Q:
15082 case TYPE_NEON_FP_DIV_D_Q:
15083 return false;
15084 default:
15085 return true;
15089 /* Target-specific selftests. */
15091 #if CHECKING_P
15093 namespace selftest {
15095 /* Selftest for the RTL loader.
15096 Verify that the RTL loader copes with a dump from
15097 print_rtx_function. This is essentially just a test that class
15098 function_reader can handle a real dump, but it also verifies
15099 that lookup_reg_by_dump_name correctly handles hard regs.
15100 The presence of hard reg names in the dump means that the test is
15101 target-specific, hence it is in this file. */
15103 static void
15104 aarch64_test_loading_full_dump ()
15106 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15108 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15110 rtx_insn *insn_1 = get_insn_by_uid (1);
15111 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15113 rtx_insn *insn_15 = get_insn_by_uid (15);
15114 ASSERT_EQ (INSN, GET_CODE (insn_15));
15115 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15117 /* Verify crtl->return_rtx. */
15118 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15119 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15120 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15123 /* Run all target-specific selftests. */
15125 static void
15126 aarch64_run_selftests (void)
15128 aarch64_test_loading_full_dump ();
15131 } // namespace selftest
15133 #endif /* #if CHECKING_P */
15135 #undef TARGET_ADDRESS_COST
15136 #define TARGET_ADDRESS_COST aarch64_address_cost
15138 /* This hook determines whether unnamed bitfields affect the alignment
15139 of the containing structure. The hook returns true if the structure
15140 should inherit the alignment requirements of an unnamed bitfield's
15141 type. */
15142 #undef TARGET_ALIGN_ANON_BITFIELD
15143 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15145 #undef TARGET_ASM_ALIGNED_DI_OP
15146 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15148 #undef TARGET_ASM_ALIGNED_HI_OP
15149 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15151 #undef TARGET_ASM_ALIGNED_SI_OP
15152 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15154 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15155 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15156 hook_bool_const_tree_hwi_hwi_const_tree_true
15158 #undef TARGET_ASM_FILE_START
15159 #define TARGET_ASM_FILE_START aarch64_start_file
15161 #undef TARGET_ASM_OUTPUT_MI_THUNK
15162 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15164 #undef TARGET_ASM_SELECT_RTX_SECTION
15165 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15167 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15168 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15170 #undef TARGET_BUILD_BUILTIN_VA_LIST
15171 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15173 #undef TARGET_CALLEE_COPIES
15174 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15176 #undef TARGET_CAN_ELIMINATE
15177 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15179 #undef TARGET_CAN_INLINE_P
15180 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15182 #undef TARGET_CANNOT_FORCE_CONST_MEM
15183 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15185 #undef TARGET_CASE_VALUES_THRESHOLD
15186 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15188 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15189 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15191 /* Only the least significant bit is used for initialization guard
15192 variables. */
15193 #undef TARGET_CXX_GUARD_MASK_BIT
15194 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15196 #undef TARGET_C_MODE_FOR_SUFFIX
15197 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15199 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15200 #undef TARGET_DEFAULT_TARGET_FLAGS
15201 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15202 #endif
15204 #undef TARGET_CLASS_MAX_NREGS
15205 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15207 #undef TARGET_BUILTIN_DECL
15208 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15210 #undef TARGET_BUILTIN_RECIPROCAL
15211 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15213 #undef TARGET_C_EXCESS_PRECISION
15214 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15216 #undef TARGET_EXPAND_BUILTIN
15217 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15219 #undef TARGET_EXPAND_BUILTIN_VA_START
15220 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15222 #undef TARGET_FOLD_BUILTIN
15223 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15225 #undef TARGET_FUNCTION_ARG
15226 #define TARGET_FUNCTION_ARG aarch64_function_arg
15228 #undef TARGET_FUNCTION_ARG_ADVANCE
15229 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15231 #undef TARGET_FUNCTION_ARG_BOUNDARY
15232 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15234 #undef TARGET_FUNCTION_ARG_PADDING
15235 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15237 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15238 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15240 #undef TARGET_FUNCTION_VALUE
15241 #define TARGET_FUNCTION_VALUE aarch64_function_value
15243 #undef TARGET_FUNCTION_VALUE_REGNO_P
15244 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15246 #undef TARGET_GIMPLE_FOLD_BUILTIN
15247 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15249 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15250 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15252 #undef TARGET_INIT_BUILTINS
15253 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15255 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15256 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15257 aarch64_ira_change_pseudo_allocno_class
15259 #undef TARGET_LEGITIMATE_ADDRESS_P
15260 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15262 #undef TARGET_LEGITIMATE_CONSTANT_P
15263 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15265 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15266 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15267 aarch64_legitimize_address_displacement
15269 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15270 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15272 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15273 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15274 aarch64_libgcc_floating_mode_supported_p
15276 #undef TARGET_MANGLE_TYPE
15277 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15279 #undef TARGET_MEMORY_MOVE_COST
15280 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15282 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15283 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15285 #undef TARGET_MUST_PASS_IN_STACK
15286 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15288 /* This target hook should return true if accesses to volatile bitfields
15289 should use the narrowest mode possible. It should return false if these
15290 accesses should use the bitfield container type. */
15291 #undef TARGET_NARROW_VOLATILE_BITFIELD
15292 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15294 #undef TARGET_OPTION_OVERRIDE
15295 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15297 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15298 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15299 aarch64_override_options_after_change
15301 #undef TARGET_OPTION_SAVE
15302 #define TARGET_OPTION_SAVE aarch64_option_save
15304 #undef TARGET_OPTION_RESTORE
15305 #define TARGET_OPTION_RESTORE aarch64_option_restore
15307 #undef TARGET_OPTION_PRINT
15308 #define TARGET_OPTION_PRINT aarch64_option_print
15310 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15311 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15313 #undef TARGET_SET_CURRENT_FUNCTION
15314 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15316 #undef TARGET_PASS_BY_REFERENCE
15317 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15319 #undef TARGET_PREFERRED_RELOAD_CLASS
15320 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15322 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15323 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15325 #undef TARGET_PROMOTED_TYPE
15326 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15328 #undef TARGET_SECONDARY_RELOAD
15329 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15331 #undef TARGET_SHIFT_TRUNCATION_MASK
15332 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15334 #undef TARGET_SETUP_INCOMING_VARARGS
15335 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15337 #undef TARGET_STRUCT_VALUE_RTX
15338 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15340 #undef TARGET_REGISTER_MOVE_COST
15341 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15343 #undef TARGET_RETURN_IN_MEMORY
15344 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15346 #undef TARGET_RETURN_IN_MSB
15347 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15349 #undef TARGET_RTX_COSTS
15350 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15352 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15353 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15355 #undef TARGET_SCHED_ISSUE_RATE
15356 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15358 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15359 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15360 aarch64_sched_first_cycle_multipass_dfa_lookahead
15362 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15363 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15364 aarch64_first_cycle_multipass_dfa_lookahead_guard
15366 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15367 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15368 aarch64_get_separate_components
15370 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15371 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15372 aarch64_components_for_bb
15374 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15375 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15376 aarch64_disqualify_components
15378 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15379 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15380 aarch64_emit_prologue_components
15382 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15383 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15384 aarch64_emit_epilogue_components
15386 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15387 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15388 aarch64_set_handled_components
15390 #undef TARGET_TRAMPOLINE_INIT
15391 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15393 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15394 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15396 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15397 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15399 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15400 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15401 aarch64_builtin_support_vector_misalignment
15403 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15404 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15406 #undef TARGET_VECTORIZE_ADD_STMT_COST
15407 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15409 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15410 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15411 aarch64_builtin_vectorization_cost
15413 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15414 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15416 #undef TARGET_VECTORIZE_BUILTINS
15417 #define TARGET_VECTORIZE_BUILTINS
15419 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15420 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15421 aarch64_builtin_vectorized_function
15423 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15424 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15425 aarch64_autovectorize_vector_sizes
15427 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15428 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15429 aarch64_atomic_assign_expand_fenv
15431 /* Section anchor support. */
15433 #undef TARGET_MIN_ANCHOR_OFFSET
15434 #define TARGET_MIN_ANCHOR_OFFSET -256
15436 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15437 byte offset; we can do much more for larger data types, but have no way
15438 to determine the size of the access. We assume accesses are aligned. */
15439 #undef TARGET_MAX_ANCHOR_OFFSET
15440 #define TARGET_MAX_ANCHOR_OFFSET 4095
15442 #undef TARGET_VECTOR_ALIGNMENT
15443 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15445 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15446 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15447 aarch64_simd_vector_alignment_reachable
15449 /* vec_perm support. */
15451 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15452 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15453 aarch64_vectorize_vec_perm_const_ok
15455 #undef TARGET_INIT_LIBFUNCS
15456 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15458 #undef TARGET_FIXED_CONDITION_CODE_REGS
15459 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15461 #undef TARGET_FLAGS_REGNUM
15462 #define TARGET_FLAGS_REGNUM CC_REGNUM
15464 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15465 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15467 #undef TARGET_ASAN_SHADOW_OFFSET
15468 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15470 #undef TARGET_LEGITIMIZE_ADDRESS
15471 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15473 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15474 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15475 aarch64_use_by_pieces_infrastructure_p
15477 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15478 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15480 #undef TARGET_CAN_USE_DOLOOP_P
15481 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15483 #undef TARGET_SCHED_ADJUST_PRIORITY
15484 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15486 #undef TARGET_SCHED_MACRO_FUSION_P
15487 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15489 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15490 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15492 #undef TARGET_SCHED_FUSION_PRIORITY
15493 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15495 #undef TARGET_UNSPEC_MAY_TRAP_P
15496 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15498 #undef TARGET_USE_PSEUDO_PIC_REG
15499 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15501 #undef TARGET_PRINT_OPERAND
15502 #define TARGET_PRINT_OPERAND aarch64_print_operand
15504 #undef TARGET_PRINT_OPERAND_ADDRESS
15505 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15507 #undef TARGET_OPTAB_SUPPORTED_P
15508 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15510 #undef TARGET_OMIT_STRUCT_RETURN_REG
15511 #define TARGET_OMIT_STRUCT_RETURN_REG true
15513 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15514 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15515 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15517 #undef TARGET_HARD_REGNO_NREGS
15518 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
15519 #undef TARGET_HARD_REGNO_MODE_OK
15520 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15522 #undef TARGET_MODES_TIEABLE_P
15523 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15525 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15526 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15527 aarch64_hard_regno_call_part_clobbered
15529 #undef TARGET_CONSTANT_ALIGNMENT
15530 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
15532 #if CHECKING_P
15533 #undef TARGET_RUN_TARGET_SELFTESTS
15534 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15535 #endif /* #if CHECKING_P */
15537 struct gcc_target targetm = TARGET_INITIALIZER;
15539 #include "gt-aarch64.h"