gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2017 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #define INCLUDE_STRING
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "memmodel.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "cfgloop.h"
  33 #include "df.h"
  34 #include "tm_p.h"
  35 #include "stringpool.h"
  36 #include "optabs.h"
  37 #include "regs.h"
  38 #include "emit-rtl.h"
  39 #include "recog.h"
  40 #include "diagnostic.h"
  41 #include "insn-attr.h"
  42 #include "alias.h"
  43 #include "fold-const.h"
  44 #include "stor-layout.h"
  45 #include "calls.h"
  46 #include "varasm.h"
  47 #include "output.h"
  48 #include "flags.h"
  49 #include "explow.h"
  50 #include "expr.h"
  51 #include "reload.h"
  52 #include "langhooks.h"
  53 #include "opts.h"
  54 #include "params.h"
  55 #include "gimplify.h"
  56 #include "dwarf2.h"
  57 #include "gimple-iterator.h"
  58 #include "tree-vectorizer.h"
  59 #include "aarch64-cost-tables.h"
  60 #include "dumpfile.h"
  61 #include "builtins.h"
  62 #include "rtl-iter.h"
  63 #include "tm-constrs.h"
  64 #include "sched-int.h"
  65 #include "target-globals.h"
  66 #include "common/common-target.h"
  67 #include "selftest.h"
  68 #include "selftest-rtl.h"
  69
  70 /* This file should be included last.  */
  71 #include "target-def.h"
  72
  73 /* Defined for convenience.  */
  74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  75
  76 /* Classifies an address.
  77
  78    ADDRESS_REG_IMM
  79        A simple base register plus immediate offset.
  80
  81    ADDRESS_REG_WB
  82        A base register indexed by immediate offset with writeback.
  83
  84    ADDRESS_REG_REG
  85        A base register indexed by (optionally scaled) register.
  86
  87    ADDRESS_REG_UXTW
  88        A base register indexed by (optionally scaled) zero-extended register.
  89
  90    ADDRESS_REG_SXTW
  91        A base register indexed by (optionally scaled) sign-extended register.
  92
  93    ADDRESS_LO_SUM
  94        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  95
  96    ADDRESS_SYMBOLIC:
  97        A constant symbolic address, in pc-relative literal pool.  */
  98
  99 enum aarch64_address_type {
 100   ADDRESS_REG_IMM,
 101   ADDRESS_REG_WB,
 102   ADDRESS_REG_REG,
 103   ADDRESS_REG_UXTW,
 104   ADDRESS_REG_SXTW,
 105   ADDRESS_LO_SUM,
 106   ADDRESS_SYMBOLIC
 107 };
 108
 109 struct aarch64_address_info {
 110   enum aarch64_address_type type;
 111   rtx base;
 112   rtx offset;
 113   int shift;
 114   enum aarch64_symbol_type symbol_type;
 115 };
 116
 117 struct simd_immediate_info
 118 {
 119   rtx value;
 120   int shift;
 121   int element_width;
 122   bool mvn;
 123   bool msl;
 124 };
 125
 126 /* The current code model.  */
 127 enum aarch64_code_model aarch64_cmodel;
 128
 129 #ifdef HAVE_AS_TLS
 130 #undef TARGET_HAVE_TLS
 131 #define TARGET_HAVE_TLS 1
 132 #endif
 133
 134 static bool aarch64_composite_type_p (const_tree, machine_mode);
 135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 136                                                      const_tree,
 137                                                      machine_mode *, int *,
 138                                                      bool *);
 139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 141 static void aarch64_override_options_after_change (void);
 142 static bool aarch64_vector_mode_supported_p (machine_mode);
 143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 144                                                  const unsigned char *sel);
 145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 147                                                          const_tree type,
 148                                                          int misalignment,
 149                                                          bool is_packed);
 150
 151 /* Major revision number of the ARM Architecture implemented by the target.  */
 152 unsigned aarch64_architecture_version;
 153
 154 /* The processor for which instructions should be scheduled.  */
 155 enum aarch64_processor aarch64_tune = cortexa53;
 156
 157 /* Mask to specify which instruction scheduling options should be used.  */
 158 unsigned long aarch64_tune_flags = 0;
 159
 160 /* Global flag for PC relative loads.  */
 161 bool aarch64_pcrelative_literal_loads;
 162
 163 /* Support for command line parsing of boolean flags in the tuning
 164    structures.  */
 165 struct aarch64_flag_desc
 166 {
 167   const char* name;
 168   unsigned int flag;
 169 };
 170
 171 #define AARCH64_FUSION_PAIR(name, internal_name) \
 172   { name, AARCH64_FUSE_##internal_name },
 173 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 174 {
 175   { "none", AARCH64_FUSE_NOTHING },
 176 #include "aarch64-fusion-pairs.def"
 177   { "all", AARCH64_FUSE_ALL },
 178   { NULL, AARCH64_FUSE_NOTHING }
 179 };
 180
 181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 182   { name, AARCH64_EXTRA_TUNE_##internal_name },
 183 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 184 {
 185   { "none", AARCH64_EXTRA_TUNE_NONE },
 186 #include "aarch64-tuning-flags.def"
 187   { "all", AARCH64_EXTRA_TUNE_ALL },
 188   { NULL, AARCH64_EXTRA_TUNE_NONE }
 189 };
 190
 191 /* Tuning parameters.  */
 192
 193 static const struct cpu_addrcost_table generic_addrcost_table =
 194 {
 195     {
 196       0, /* hi  */
 197       0, /* si  */
 198       0, /* di  */
 199       0, /* ti  */
 200     },
 201   0, /* pre_modify  */
 202   0, /* post_modify  */
 203   0, /* register_offset  */
 204   0, /* register_sextend  */
 205   0, /* register_zextend  */
 206   0 /* imm_offset  */
 207 };
 208
 209 static const struct cpu_addrcost_table cortexa57_addrcost_table =
 210 {
 211     {
 212       1, /* hi  */
 213       0, /* si  */
 214       0, /* di  */
 215       1, /* ti  */
 216     },
 217   0, /* pre_modify  */
 218   0, /* post_modify  */
 219   0, /* register_offset  */
 220   0, /* register_sextend  */
 221   0, /* register_zextend  */
 222   0, /* imm_offset  */
 223 };
 224
 225 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 226 {
 227     {
 228       0, /* hi  */
 229       0, /* si  */
 230       0, /* di  */
 231       2, /* ti  */
 232     },
 233   0, /* pre_modify  */
 234   0, /* post_modify  */
 235   1, /* register_offset  */
 236   1, /* register_sextend  */
 237   2, /* register_zextend  */
 238   0, /* imm_offset  */
 239 };
 240
 241 static const struct cpu_addrcost_table xgene1_addrcost_table =
 242 {
 243     {
 244       1, /* hi  */
 245       0, /* si  */
 246       0, /* di  */
 247       1, /* ti  */
 248     },
 249   1, /* pre_modify  */
 250   0, /* post_modify  */
 251   0, /* register_offset  */
 252   1, /* register_sextend  */
 253   1, /* register_zextend  */
 254   0, /* imm_offset  */
 255 };
 256
 257 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
 258 {
 259     {
 260       1, /* hi  */
 261       0, /* si  */
 262       0, /* di  */
 263       1, /* ti  */
 264     },
 265   0, /* pre_modify  */
 266   0, /* post_modify  */
 267   0, /* register_offset  */
 268   0, /* register_sextend  */
 269   0, /* register_zextend  */
 270   0 /* imm_offset  */
 271 };
 272
 273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 274 {
 275     {
 276       1, /* hi  */
 277       1, /* si  */
 278       1, /* di  */
 279       2, /* ti  */
 280     },
 281   0, /* pre_modify  */
 282   0, /* post_modify  */
 283   2, /* register_offset  */
 284   3, /* register_sextend  */
 285   3, /* register_zextend  */
 286   0, /* imm_offset  */
 287 };
 288
 289 static const struct cpu_regmove_cost generic_regmove_cost =
 290 {
 291   1, /* GP2GP  */
 292   /* Avoid the use of slow int<->fp moves for spilling by setting
 293      their cost higher than memmov_cost.  */
 294   5, /* GP2FP  */
 295   5, /* FP2GP  */
 296   2 /* FP2FP  */
 297 };
 298
 299 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 300 {
 301   1, /* GP2GP  */
 302   /* Avoid the use of slow int<->fp moves for spilling by setting
 303      their cost higher than memmov_cost.  */
 304   5, /* GP2FP  */
 305   5, /* FP2GP  */
 306   2 /* FP2FP  */
 307 };
 308
 309 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 310 {
 311   1, /* GP2GP  */
 312   /* Avoid the use of slow int<->fp moves for spilling by setting
 313      their cost higher than memmov_cost.  */
 314   5, /* GP2FP  */
 315   5, /* FP2GP  */
 316   2 /* FP2FP  */
 317 };
 318
 319 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 320 {
 321   1, /* GP2GP  */
 322   /* Avoid the use of slow int<->fp moves for spilling by setting
 323      their cost higher than memmov_cost (actual, 4 and 9).  */
 324   9, /* GP2FP  */
 325   9, /* FP2GP  */
 326   1 /* FP2FP  */
 327 };
 328
 329 static const struct cpu_regmove_cost thunderx_regmove_cost =
 330 {
 331   2, /* GP2GP  */
 332   2, /* GP2FP  */
 333   6, /* FP2GP  */
 334   4 /* FP2FP  */
 335 };
 336
 337 static const struct cpu_regmove_cost xgene1_regmove_cost =
 338 {
 339   1, /* GP2GP  */
 340   /* Avoid the use of slow int<->fp moves for spilling by setting
 341      their cost higher than memmov_cost.  */
 342   8, /* GP2FP  */
 343   8, /* FP2GP  */
 344   2 /* FP2FP  */
 345 };
 346
 347 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 348 {
 349   2, /* GP2GP  */
 350   /* Avoid the use of int<->fp moves for spilling.  */
 351   6, /* GP2FP  */
 352   6, /* FP2GP  */
 353   4 /* FP2FP  */
 354 };
 355
 356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 357 {
 358   1, /* GP2GP  */
 359   /* Avoid the use of int<->fp moves for spilling.  */
 360   8, /* GP2FP  */
 361   8, /* FP2GP  */
 362   4  /* FP2FP  */
 363 };
 364
 365 /* Generic costs for vector insn classes.  */
 366 static const struct cpu_vector_cost generic_vector_cost =
 367 {
 368   1, /* scalar_int_stmt_cost  */
 369   1, /* scalar_fp_stmt_cost  */
 370   1, /* scalar_load_cost  */
 371   1, /* scalar_store_cost  */
 372   1, /* vec_int_stmt_cost  */
 373   1, /* vec_fp_stmt_cost  */
 374   2, /* vec_permute_cost  */
 375   1, /* vec_to_scalar_cost  */
 376   1, /* scalar_to_vec_cost  */
 377   1, /* vec_align_load_cost  */
 378   1, /* vec_unalign_load_cost  */
 379   1, /* vec_unalign_store_cost  */
 380   1, /* vec_store_cost  */
 381   3, /* cond_taken_branch_cost  */
 382   1 /* cond_not_taken_branch_cost  */
 383 };
 384
 385 /* ThunderX costs for vector insn classes.  */
 386 static const struct cpu_vector_cost thunderx_vector_cost =
 387 {
 388   1, /* scalar_int_stmt_cost  */
 389   1, /* scalar_fp_stmt_cost  */
 390   3, /* scalar_load_cost  */
 391   1, /* scalar_store_cost  */
 392   4, /* vec_int_stmt_cost  */
 393   4, /* vec_fp_stmt_cost  */
 394   4, /* vec_permute_cost  */
 395   2, /* vec_to_scalar_cost  */
 396   2, /* scalar_to_vec_cost  */
 397   3, /* vec_align_load_cost  */
 398   10, /* vec_unalign_load_cost  */
 399   10, /* vec_unalign_store_cost  */
 400   1, /* vec_store_cost  */
 401   3, /* cond_taken_branch_cost  */
 402   3 /* cond_not_taken_branch_cost  */
 403 };
 404
 405 /* Generic costs for vector insn classes.  */
 406 static const struct cpu_vector_cost cortexa57_vector_cost =
 407 {
 408   1, /* scalar_int_stmt_cost  */
 409   1, /* scalar_fp_stmt_cost  */
 410   4, /* scalar_load_cost  */
 411   1, /* scalar_store_cost  */
 412   2, /* vec_int_stmt_cost  */
 413   2, /* vec_fp_stmt_cost  */
 414   3, /* vec_permute_cost  */
 415   8, /* vec_to_scalar_cost  */
 416   8, /* scalar_to_vec_cost  */
 417   4, /* vec_align_load_cost  */
 418   4, /* vec_unalign_load_cost  */
 419   1, /* vec_unalign_store_cost  */
 420   1, /* vec_store_cost  */
 421   1, /* cond_taken_branch_cost  */
 422   1 /* cond_not_taken_branch_cost  */
 423 };
 424
 425 static const struct cpu_vector_cost exynosm1_vector_cost =
 426 {
 427   1, /* scalar_int_stmt_cost  */
 428   1, /* scalar_fp_stmt_cost  */
 429   5, /* scalar_load_cost  */
 430   1, /* scalar_store_cost  */
 431   3, /* vec_int_stmt_cost  */
 432   3, /* vec_fp_stmt_cost  */
 433   3, /* vec_permute_cost  */
 434   3, /* vec_to_scalar_cost  */
 435   3, /* scalar_to_vec_cost  */
 436   5, /* vec_align_load_cost  */
 437   5, /* vec_unalign_load_cost  */
 438   1, /* vec_unalign_store_cost  */
 439   1, /* vec_store_cost  */
 440   1, /* cond_taken_branch_cost  */
 441   1 /* cond_not_taken_branch_cost  */
 442 };
 443
 444 /* Generic costs for vector insn classes.  */
 445 static const struct cpu_vector_cost xgene1_vector_cost =
 446 {
 447   1, /* scalar_int_stmt_cost  */
 448   1, /* scalar_fp_stmt_cost  */
 449   5, /* scalar_load_cost  */
 450   1, /* scalar_store_cost  */
 451   2, /* vec_int_stmt_cost  */
 452   2, /* vec_fp_stmt_cost  */
 453   2, /* vec_permute_cost  */
 454   4, /* vec_to_scalar_cost  */
 455   4, /* scalar_to_vec_cost  */
 456   10, /* vec_align_load_cost  */
 457   10, /* vec_unalign_load_cost  */
 458   2, /* vec_unalign_store_cost  */
 459   2, /* vec_store_cost  */
 460   2, /* cond_taken_branch_cost  */
 461   1 /* cond_not_taken_branch_cost  */
 462 };
 463
 464 /* Costs for vector insn classes for Vulcan.  */
 465 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 466 {
 467   1, /* scalar_int_stmt_cost  */
 468   6, /* scalar_fp_stmt_cost  */
 469   4, /* scalar_load_cost  */
 470   1, /* scalar_store_cost  */
 471   5, /* vec_int_stmt_cost  */
 472   6, /* vec_fp_stmt_cost  */
 473   3, /* vec_permute_cost  */
 474   6, /* vec_to_scalar_cost  */
 475   5, /* scalar_to_vec_cost  */
 476   8, /* vec_align_load_cost  */
 477   8, /* vec_unalign_load_cost  */
 478   4, /* vec_unalign_store_cost  */
 479   4, /* vec_store_cost  */
 480   2, /* cond_taken_branch_cost  */
 481   1  /* cond_not_taken_branch_cost  */
 482 };
 483
 484 /* Generic costs for branch instructions.  */
 485 static const struct cpu_branch_cost generic_branch_cost =
 486 {
 487   2,  /* Predictable.  */
 488   2   /* Unpredictable.  */
 489 };
 490
 491 /* Branch costs for Cortex-A57.  */
 492 static const struct cpu_branch_cost cortexa57_branch_cost =
 493 {
 494   1,  /* Predictable.  */
 495   3   /* Unpredictable.  */
 496 };
 497
 498 /* Branch costs for Vulcan.  */
 499 static const struct cpu_branch_cost thunderx2t99_branch_cost =
 500 {
 501   1,  /* Predictable.  */
 502   3   /* Unpredictable.  */
 503 };
 504
 505 /* Generic approximation modes.  */
 506 static const cpu_approx_modes generic_approx_modes =
 507 {
 508   AARCH64_APPROX_NONE,  /* division  */
 509   AARCH64_APPROX_NONE,  /* sqrt  */
 510   AARCH64_APPROX_NONE   /* recip_sqrt  */
 511 };
 512
 513 /* Approximation modes for Exynos M1.  */
 514 static const cpu_approx_modes exynosm1_approx_modes =
 515 {
 516   AARCH64_APPROX_NONE,  /* division  */
 517   AARCH64_APPROX_ALL,   /* sqrt  */
 518   AARCH64_APPROX_ALL    /* recip_sqrt  */
 519 };
 520
 521 /* Approximation modes for X-Gene 1.  */
 522 static const cpu_approx_modes xgene1_approx_modes =
 523 {
 524   AARCH64_APPROX_NONE,  /* division  */
 525   AARCH64_APPROX_NONE,  /* sqrt  */
 526   AARCH64_APPROX_ALL    /* recip_sqrt  */
 527 };
 528
 529 static const struct tune_params generic_tunings =
 530 {
 531   &cortexa57_extra_costs,
 532   &generic_addrcost_table,
 533   &generic_regmove_cost,
 534   &generic_vector_cost,
 535   &generic_branch_cost,
 536   &generic_approx_modes,
 537   4, /* memmov_cost  */
 538   2, /* issue_rate  */
 539   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 540   8,    /* function_align.  */
 541   8,    /* jump_align.  */
 542   4,    /* loop_align.  */
 543   2,    /* int_reassoc_width.  */
 544   4,    /* fp_reassoc_width.  */
 545   1,    /* vec_reassoc_width.  */
 546   2,    /* min_div_recip_mul_sf.  */
 547   2,    /* min_div_recip_mul_df.  */
 548   0,    /* max_case_values.  */
 549   0,    /* cache_line_size.  */
 550   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 551   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 552 };
 553
 554 static const struct tune_params cortexa35_tunings =
 555 {
 556   &cortexa53_extra_costs,
 557   &generic_addrcost_table,
 558   &cortexa53_regmove_cost,
 559   &generic_vector_cost,
 560   &cortexa57_branch_cost,
 561   &generic_approx_modes,
 562   4, /* memmov_cost  */
 563   1, /* issue_rate  */
 564   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 565    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 566   16,   /* function_align.  */
 567   8,    /* jump_align.  */
 568   8,    /* loop_align.  */
 569   2,    /* int_reassoc_width.  */
 570   4,    /* fp_reassoc_width.  */
 571   1,    /* vec_reassoc_width.  */
 572   2,    /* min_div_recip_mul_sf.  */
 573   2,    /* min_div_recip_mul_df.  */
 574   0,    /* max_case_values.  */
 575   0,    /* cache_line_size.  */
 576   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 577   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 578 };
 579
 580 static const struct tune_params cortexa53_tunings =
 581 {
 582   &cortexa53_extra_costs,
 583   &generic_addrcost_table,
 584   &cortexa53_regmove_cost,
 585   &generic_vector_cost,
 586   &cortexa57_branch_cost,
 587   &generic_approx_modes,
 588   4, /* memmov_cost  */
 589   2, /* issue_rate  */
 590   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 591    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 592   16,   /* function_align.  */
 593   8,    /* jump_align.  */
 594   8,    /* loop_align.  */
 595   2,    /* int_reassoc_width.  */
 596   4,    /* fp_reassoc_width.  */
 597   1,    /* vec_reassoc_width.  */
 598   2,    /* min_div_recip_mul_sf.  */
 599   2,    /* min_div_recip_mul_df.  */
 600   0,    /* max_case_values.  */
 601   0,    /* cache_line_size.  */
 602   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 603   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 604 };
 605
 606 static const struct tune_params cortexa57_tunings =
 607 {
 608   &cortexa57_extra_costs,
 609   &cortexa57_addrcost_table,
 610   &cortexa57_regmove_cost,
 611   &cortexa57_vector_cost,
 612   &cortexa57_branch_cost,
 613   &generic_approx_modes,
 614   4, /* memmov_cost  */
 615   3, /* issue_rate  */
 616   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 617    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 618   16,   /* function_align.  */
 619   8,    /* jump_align.  */
 620   8,    /* loop_align.  */
 621   2,    /* int_reassoc_width.  */
 622   4,    /* fp_reassoc_width.  */
 623   1,    /* vec_reassoc_width.  */
 624   2,    /* min_div_recip_mul_sf.  */
 625   2,    /* min_div_recip_mul_df.  */
 626   0,    /* max_case_values.  */
 627   0,    /* cache_line_size.  */
 628   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 629   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)  /* tune_flags.  */
 630 };
 631
 632 static const struct tune_params cortexa72_tunings =
 633 {
 634   &cortexa57_extra_costs,
 635   &cortexa57_addrcost_table,
 636   &cortexa57_regmove_cost,
 637   &cortexa57_vector_cost,
 638   &cortexa57_branch_cost,
 639   &generic_approx_modes,
 640   4, /* memmov_cost  */
 641   3, /* issue_rate  */
 642   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 643    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 644   16,   /* function_align.  */
 645   8,    /* jump_align.  */
 646   8,    /* loop_align.  */
 647   2,    /* int_reassoc_width.  */
 648   4,    /* fp_reassoc_width.  */
 649   1,    /* vec_reassoc_width.  */
 650   2,    /* min_div_recip_mul_sf.  */
 651   2,    /* min_div_recip_mul_df.  */
 652   0,    /* max_case_values.  */
 653   0,    /* cache_line_size.  */
 654   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 655   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 656 };
 657
 658 static const struct tune_params cortexa73_tunings =
 659 {
 660   &cortexa57_extra_costs,
 661   &cortexa57_addrcost_table,
 662   &cortexa57_regmove_cost,
 663   &cortexa57_vector_cost,
 664   &cortexa57_branch_cost,
 665   &generic_approx_modes,
 666   4, /* memmov_cost.  */
 667   2, /* issue_rate.  */
 668   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 669    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 670   16,   /* function_align.  */
 671   8,    /* jump_align.  */
 672   8,    /* loop_align.  */
 673   2,    /* int_reassoc_width.  */
 674   4,    /* fp_reassoc_width.  */
 675   1,    /* vec_reassoc_width.  */
 676   2,    /* min_div_recip_mul_sf.  */
 677   2,    /* min_div_recip_mul_df.  */
 678   0,    /* max_case_values.  */
 679   0,    /* cache_line_size.  */
 680   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 681   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 682 };
 683
 684 static const struct tune_params exynosm1_tunings =
 685 {
 686   &exynosm1_extra_costs,
 687   &exynosm1_addrcost_table,
 688   &exynosm1_regmove_cost,
 689   &exynosm1_vector_cost,
 690   &generic_branch_cost,
 691   &exynosm1_approx_modes,
 692   4,    /* memmov_cost  */
 693   3,    /* issue_rate  */
 694   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 695   4,    /* function_align.  */
 696   4,    /* jump_align.  */
 697   4,    /* loop_align.  */
 698   2,    /* int_reassoc_width.  */
 699   4,    /* fp_reassoc_width.  */
 700   1,    /* vec_reassoc_width.  */
 701   2,    /* min_div_recip_mul_sf.  */
 702   2,    /* min_div_recip_mul_df.  */
 703   48,   /* max_case_values.  */
 704   64,   /* cache_line_size.  */
 705   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 706   (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
 707 };
 708
 709 static const struct tune_params thunderx_tunings =
 710 {
 711   &thunderx_extra_costs,
 712   &generic_addrcost_table,
 713   &thunderx_regmove_cost,
 714   &thunderx_vector_cost,
 715   &generic_branch_cost,
 716   &generic_approx_modes,
 717   6, /* memmov_cost  */
 718   2, /* issue_rate  */
 719   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 720   8,    /* function_align.  */
 721   8,    /* jump_align.  */
 722   8,    /* loop_align.  */
 723   2,    /* int_reassoc_width.  */
 724   4,    /* fp_reassoc_width.  */
 725   1,    /* vec_reassoc_width.  */
 726   2,    /* min_div_recip_mul_sf.  */
 727   2,    /* min_div_recip_mul_df.  */
 728   0,    /* max_case_values.  */
 729   0,    /* cache_line_size.  */
 730   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 731   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)      /* tune_flags.  */
 732 };
 733
 734 static const struct tune_params xgene1_tunings =
 735 {
 736   &xgene1_extra_costs,
 737   &xgene1_addrcost_table,
 738   &xgene1_regmove_cost,
 739   &xgene1_vector_cost,
 740   &generic_branch_cost,
 741   &xgene1_approx_modes,
 742   6, /* memmov_cost  */
 743   4, /* issue_rate  */
 744   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 745   16,   /* function_align.  */
 746   8,    /* jump_align.  */
 747   16,   /* loop_align.  */
 748   2,    /* int_reassoc_width.  */
 749   4,    /* fp_reassoc_width.  */
 750   1,    /* vec_reassoc_width.  */
 751   2,    /* min_div_recip_mul_sf.  */
 752   2,    /* min_div_recip_mul_df.  */
 753   0,    /* max_case_values.  */
 754   0,    /* cache_line_size.  */
 755   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 756   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 757 };
 758
 759 static const struct tune_params qdf24xx_tunings =
 760 {
 761   &qdf24xx_extra_costs,
 762   &qdf24xx_addrcost_table,
 763   &qdf24xx_regmove_cost,
 764   &generic_vector_cost,
 765   &generic_branch_cost,
 766   &generic_approx_modes,
 767   4, /* memmov_cost  */
 768   4, /* issue_rate  */
 769   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 770    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 771   16,   /* function_align.  */
 772   8,    /* jump_align.  */
 773   16,   /* loop_align.  */
 774   2,    /* int_reassoc_width.  */
 775   4,    /* fp_reassoc_width.  */
 776   1,    /* vec_reassoc_width.  */
 777   2,    /* min_div_recip_mul_sf.  */
 778   2,    /* min_div_recip_mul_df.  */
 779   0,    /* max_case_values.  */
 780   64,   /* cache_line_size.  */
 781   tune_params::AUTOPREFETCHER_STRONG,   /* autoprefetcher_model.  */
 782   (AARCH64_EXTRA_TUNE_NONE)             /* tune_flags.  */
 783 };
 784
 785 static const struct tune_params thunderx2t99_tunings =
 786 {
 787   &thunderx2t99_extra_costs,
 788   &thunderx2t99_addrcost_table,
 789   &thunderx2t99_regmove_cost,
 790   &thunderx2t99_vector_cost,
 791   &thunderx2t99_branch_cost,
 792   &generic_approx_modes,
 793   4, /* memmov_cost.  */
 794   4, /* issue_rate.  */
 795   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 796   16,   /* function_align.  */
 797   8,    /* jump_align.  */
 798   16,   /* loop_align.  */
 799   3,    /* int_reassoc_width.  */
 800   2,    /* fp_reassoc_width.  */
 801   2,    /* vec_reassoc_width.  */
 802   2,    /* min_div_recip_mul_sf.  */
 803   2,    /* min_div_recip_mul_df.  */
 804   0,    /* max_case_values.  */
 805   64,   /* cache_line_size.  */
 806   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 807   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 808 };
 809
 810 /* Support for fine-grained override of the tuning structures.  */
 811 struct aarch64_tuning_override_function
 812 {
 813   const char* name;
 814   void (*parse_override)(const char*, struct tune_params*);
 815 };
 816
 817 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 818 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 819
 820 static const struct aarch64_tuning_override_function
 821 aarch64_tuning_override_functions[] =
 822 {
 823   { "fuse", aarch64_parse_fuse_string },
 824   { "tune", aarch64_parse_tune_string },
 825   { NULL, NULL }
 826 };
 827
 828 /* A processor implementing AArch64.  */
 829 struct processor
 830 {
 831   const char *const name;
 832   enum aarch64_processor ident;
 833   enum aarch64_processor sched_core;
 834   enum aarch64_arch arch;
 835   unsigned architecture_version;
 836   const unsigned long flags;
 837   const struct tune_params *const tune;
 838 };
 839
 840 /* Architectures implementing AArch64.  */
 841 static const struct processor all_architectures[] =
 842 {
 843 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 844   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 845 #include "aarch64-arches.def"
 846   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 847 };
 848
 849 /* Processor cores implementing AArch64.  */
 850 static const struct processor all_cores[] =
 851 {
 852 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
 853   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 854   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 855   FLAGS, &COSTS##_tunings},
 856 #include "aarch64-cores.def"
 857   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 858     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 859   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 860 };
 861
 862
 863 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 864    handling code or by target attributes.  */
 865 static const struct processor *selected_arch;
 866 static const struct processor *selected_cpu;
 867 static const struct processor *selected_tune;
 868
 869 /* The current tuning set.  */
 870 struct tune_params aarch64_tune_params = generic_tunings;
 871
 872 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 873
 874 /* An ISA extension in the co-processor and main instruction set space.  */
 875 struct aarch64_option_extension
 876 {
 877   const char *const name;
 878   const unsigned long flags_on;
 879   const unsigned long flags_off;
 880 };
 881
 882 typedef enum aarch64_cond_code
 883 {
 884   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 885   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 886   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 887 }
 888 aarch64_cc;
 889
 890 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 891
 892 /* The condition codes of the processor, and the inverse function.  */
 893 static const char * const aarch64_condition_codes[] =
 894 {
 895   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 896   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 897 };
 898
 899 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 900 const char *
 901 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 902                         const char * branch_format)
 903 {
 904     rtx_code_label * tmp_label = gen_label_rtx ();
 905     char label_buf[256];
 906     char buffer[128];
 907     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 908                                  CODE_LABEL_NUMBER (tmp_label));
 909     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 910     rtx dest_label = operands[pos_label];
 911     operands[pos_label] = tmp_label;
 912
 913     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 914     output_asm_insn (buffer, operands);
 915
 916     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 917     operands[pos_label] = dest_label;
 918     output_asm_insn (buffer, operands);
 919     return "";
 920 }
 921
 922 void
 923 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 924 {
 925   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 926   if (TARGET_GENERAL_REGS_ONLY)
 927     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 928   else
 929     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 930 }
 931
 932 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
 933    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
 934    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
 935    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
 936    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
 937    irrespectively of its cost results in bad allocations with many redundant
 938    int<->FP moves which are expensive on various cores.
 939    To avoid this we don't allow ALL_REGS as the allocno class, but force a
 940    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
 941    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
 942    Otherwise set the allocno class depending on the mode.
 943    The result of this is that it is no longer inefficient to have a higher
 944    memory move cost than the register move cost.
 945 */
 946
 947 static reg_class_t
 948 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 949                                          reg_class_t best_class)
 950 {
 951   enum machine_mode mode;
 952
 953   if (allocno_class != ALL_REGS)
 954     return allocno_class;
 955
 956   if (best_class != ALL_REGS)
 957     return best_class;
 958
 959   mode = PSEUDO_REGNO_MODE (regno);
 960   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
 961 }
 962
 963 static unsigned int
 964 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
 965 {
 966   if (GET_MODE_UNIT_SIZE (mode) == 4)
 967     return aarch64_tune_params.min_div_recip_mul_sf;
 968   return aarch64_tune_params.min_div_recip_mul_df;
 969 }
 970
 971 static int
 972 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
 973                              enum machine_mode mode)
 974 {
 975   if (VECTOR_MODE_P (mode))
 976     return aarch64_tune_params.vec_reassoc_width;
 977   if (INTEGRAL_MODE_P (mode))
 978     return aarch64_tune_params.int_reassoc_width;
 979   if (FLOAT_MODE_P (mode))
 980     return aarch64_tune_params.fp_reassoc_width;
 981   return 1;
 982 }
 983
 984 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
 985 unsigned
 986 aarch64_dbx_register_number (unsigned regno)
 987 {
 988    if (GP_REGNUM_P (regno))
 989      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
 990    else if (regno == SP_REGNUM)
 991      return AARCH64_DWARF_SP;
 992    else if (FP_REGNUM_P (regno))
 993      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
 994
 995    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
 996       equivalent DWARF register.  */
 997    return DWARF_FRAME_REGISTERS;
 998 }
 999
1000 /* Return TRUE if MODE is any of the large INT modes.  */
1001 static bool
1002 aarch64_vect_struct_mode_p (machine_mode mode)
1003 {
1004   return mode == OImode || mode == CImode || mode == XImode;
1005 }
1006
1007 /* Return TRUE if MODE is any of the vector modes.  */
1008 static bool
1009 aarch64_vector_mode_p (machine_mode mode)
1010 {
1011   return aarch64_vector_mode_supported_p (mode)
1012          || aarch64_vect_struct_mode_p (mode);
1013 }
1014
1015 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1016 static bool
1017 aarch64_array_mode_supported_p (machine_mode mode,
1018                                 unsigned HOST_WIDE_INT nelems)
1019 {
1020   if (TARGET_SIMD
1021       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1022           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1023       && (nelems >= 2 && nelems <= 4))
1024     return true;
1025
1026   return false;
1027 }
1028
1029 /* Implement HARD_REGNO_NREGS.  */
1030
1031 int
1032 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1033 {
1034   switch (aarch64_regno_regclass (regno))
1035     {
1036     case FP_REGS:
1037     case FP_LO_REGS:
1038       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1039     default:
1040       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1041     }
1042   gcc_unreachable ();
1043 }
1044
1045 /* Implement HARD_REGNO_MODE_OK.  */
1046
1047 int
1048 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1049 {
1050   if (GET_MODE_CLASS (mode) == MODE_CC)
1051     return regno == CC_REGNUM;
1052
1053   if (regno == SP_REGNUM)
1054     /* The purpose of comparing with ptr_mode is to support the
1055        global register variable associated with the stack pointer
1056        register via the syntax of asm ("wsp") in ILP32.  */
1057     return mode == Pmode || mode == ptr_mode;
1058
1059   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1060     return mode == Pmode;
1061
1062   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1063     return 1;
1064
1065   if (FP_REGNUM_P (regno))
1066     {
1067       if (aarch64_vect_struct_mode_p (mode))
1068         return
1069           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1070       else
1071         return 1;
1072     }
1073
1074   return 0;
1075 }
1076
1077 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1078 machine_mode
1079 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1080                                      machine_mode mode)
1081 {
1082   /* Handle modes that fit within single registers.  */
1083   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1084     {
1085       if (GET_MODE_SIZE (mode) >= 4)
1086         return mode;
1087       else
1088         return SImode;
1089     }
1090   /* Fall back to generic for multi-reg and very large modes.  */
1091   else
1092     return choose_hard_reg_mode (regno, nregs, false);
1093 }
1094
1095 /* Return true if calls to DECL should be treated as
1096    long-calls (ie called via a register).  */
1097 static bool
1098 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1099 {
1100   return false;
1101 }
1102
1103 /* Return true if calls to symbol-ref SYM should be treated as
1104    long-calls (ie called via a register).  */
1105 bool
1106 aarch64_is_long_call_p (rtx sym)
1107 {
1108   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1109 }
1110
1111 /* Return true if calls to symbol-ref SYM should not go through
1112    plt stubs.  */
1113
1114 bool
1115 aarch64_is_noplt_call_p (rtx sym)
1116 {
1117   const_tree decl = SYMBOL_REF_DECL (sym);
1118
1119   if (flag_pic
1120       && decl
1121       && (!flag_plt
1122           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1123       && !targetm.binds_local_p (decl))
1124     return true;
1125
1126   return false;
1127 }
1128
1129 /* Return true if the offsets to a zero/sign-extract operation
1130    represent an expression that matches an extend operation.  The
1131    operands represent the paramters from
1132
1133    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1134 bool
1135 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1136                                 rtx extract_imm)
1137 {
1138   HOST_WIDE_INT mult_val, extract_val;
1139
1140   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1141     return false;
1142
1143   mult_val = INTVAL (mult_imm);
1144   extract_val = INTVAL (extract_imm);
1145
1146   if (extract_val > 8
1147       && extract_val < GET_MODE_BITSIZE (mode)
1148       && exact_log2 (extract_val & ~7) > 0
1149       && (extract_val & 7) <= 4
1150       && mult_val == (1 << (extract_val & 7)))
1151     return true;
1152
1153   return false;
1154 }
1155
1156 /* Emit an insn that's a simple single-set.  Both the operands must be
1157    known to be valid.  */
1158 inline static rtx_insn *
1159 emit_set_insn (rtx x, rtx y)
1160 {
1161   return emit_insn (gen_rtx_SET (x, y));
1162 }
1163
1164 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1165    return the rtx for register 0 in the proper mode.  */
1166 rtx
1167 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1168 {
1169   machine_mode mode = SELECT_CC_MODE (code, x, y);
1170   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1171
1172   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1173   return cc_reg;
1174 }
1175
1176 /* Build the SYMBOL_REF for __tls_get_addr.  */
1177
1178 static GTY(()) rtx tls_get_addr_libfunc;
1179
1180 rtx
1181 aarch64_tls_get_addr (void)
1182 {
1183   if (!tls_get_addr_libfunc)
1184     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1185   return tls_get_addr_libfunc;
1186 }
1187
1188 /* Return the TLS model to use for ADDR.  */
1189
1190 static enum tls_model
1191 tls_symbolic_operand_type (rtx addr)
1192 {
1193   enum tls_model tls_kind = TLS_MODEL_NONE;
1194   rtx sym, addend;
1195
1196   if (GET_CODE (addr) == CONST)
1197     {
1198       split_const (addr, &sym, &addend);
1199       if (GET_CODE (sym) == SYMBOL_REF)
1200         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1201     }
1202   else if (GET_CODE (addr) == SYMBOL_REF)
1203     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1204
1205   return tls_kind;
1206 }
1207
1208 /* We'll allow lo_sum's in addresses in our legitimate addresses
1209    so that combine would take care of combining addresses where
1210    necessary, but for generation purposes, we'll generate the address
1211    as :
1212    RTL                               Absolute
1213    tmp = hi (symbol_ref);            adrp  x1, foo
1214    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1215                                      nop
1216
1217    PIC                               TLS
1218    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1219    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1220                                      bl   __tls_get_addr
1221                                      nop
1222
1223    Load TLS symbol, depending on TLS mechanism and TLS access model.
1224
1225    Global Dynamic - Traditional TLS:
1226    adrp tmp, :tlsgd:imm
1227    add  dest, tmp, #:tlsgd_lo12:imm
1228    bl   __tls_get_addr
1229
1230    Global Dynamic - TLS Descriptors:
1231    adrp dest, :tlsdesc:imm
1232    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1233    add  dest, dest, #:tlsdesc_lo12:imm
1234    blr  tmp
1235    mrs  tp, tpidr_el0
1236    add  dest, dest, tp
1237
1238    Initial Exec:
1239    mrs  tp, tpidr_el0
1240    adrp tmp, :gottprel:imm
1241    ldr  dest, [tmp, #:gottprel_lo12:imm]
1242    add  dest, dest, tp
1243
1244    Local Exec:
1245    mrs  tp, tpidr_el0
1246    add  t0, tp, #:tprel_hi12:imm, lsl #12
1247    add  t0, t0, #:tprel_lo12_nc:imm
1248 */
1249
1250 static void
1251 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1252                                    enum aarch64_symbol_type type)
1253 {
1254   switch (type)
1255     {
1256     case SYMBOL_SMALL_ABSOLUTE:
1257       {
1258         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1259         rtx tmp_reg = dest;
1260         machine_mode mode = GET_MODE (dest);
1261
1262         gcc_assert (mode == Pmode || mode == ptr_mode);
1263
1264         if (can_create_pseudo_p ())
1265           tmp_reg = gen_reg_rtx (mode);
1266
1267         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1268         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1269         return;
1270       }
1271
1272     case SYMBOL_TINY_ABSOLUTE:
1273       emit_insn (gen_rtx_SET (dest, imm));
1274       return;
1275
1276     case SYMBOL_SMALL_GOT_28K:
1277       {
1278         machine_mode mode = GET_MODE (dest);
1279         rtx gp_rtx = pic_offset_table_rtx;
1280         rtx insn;
1281         rtx mem;
1282
1283         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1284            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1285            decide rtx costs, in which case pic_offset_table_rtx is not
1286            initialized.  For that case no need to generate the first adrp
1287            instruction as the final cost for global variable access is
1288            one instruction.  */
1289         if (gp_rtx != NULL)
1290           {
1291             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1292                using the page base as GOT base, the first page may be wasted,
1293                in the worst scenario, there is only 28K space for GOT).
1294
1295                The generate instruction sequence for accessing global variable
1296                is:
1297
1298                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1299
1300                Only one instruction needed. But we must initialize
1301                pic_offset_table_rtx properly.  We generate initialize insn for
1302                every global access, and allow CSE to remove all redundant.
1303
1304                The final instruction sequences will look like the following
1305                for multiply global variables access.
1306
1307                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1308
1309                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1310                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1311                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1312                  ...  */
1313
1314             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1315             crtl->uses_pic_offset_table = 1;
1316             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1317
1318             if (mode != GET_MODE (gp_rtx))
1319              gp_rtx = gen_lowpart (mode, gp_rtx);
1320
1321           }
1322
1323         if (mode == ptr_mode)
1324           {
1325             if (mode == DImode)
1326               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1327             else
1328               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1329
1330             mem = XVECEXP (SET_SRC (insn), 0, 0);
1331           }
1332         else
1333           {
1334             gcc_assert (mode == Pmode);
1335
1336             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1337             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1338           }
1339
1340         /* The operand is expected to be MEM.  Whenever the related insn
1341            pattern changed, above code which calculate mem should be
1342            updated.  */
1343         gcc_assert (GET_CODE (mem) == MEM);
1344         MEM_READONLY_P (mem) = 1;
1345         MEM_NOTRAP_P (mem) = 1;
1346         emit_insn (insn);
1347         return;
1348       }
1349
1350     case SYMBOL_SMALL_GOT_4G:
1351       {
1352         /* In ILP32, the mode of dest can be either SImode or DImode,
1353            while the got entry is always of SImode size.  The mode of
1354            dest depends on how dest is used: if dest is assigned to a
1355            pointer (e.g. in the memory), it has SImode; it may have
1356            DImode if dest is dereferenced to access the memeory.
1357            This is why we have to handle three different ldr_got_small
1358            patterns here (two patterns for ILP32).  */
1359
1360         rtx insn;
1361         rtx mem;
1362         rtx tmp_reg = dest;
1363         machine_mode mode = GET_MODE (dest);
1364
1365         if (can_create_pseudo_p ())
1366           tmp_reg = gen_reg_rtx (mode);
1367
1368         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1369         if (mode == ptr_mode)
1370           {
1371             if (mode == DImode)
1372               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1373             else
1374               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1375
1376             mem = XVECEXP (SET_SRC (insn), 0, 0);
1377           }
1378         else
1379           {
1380             gcc_assert (mode == Pmode);
1381
1382             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1383             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1384           }
1385
1386         gcc_assert (GET_CODE (mem) == MEM);
1387         MEM_READONLY_P (mem) = 1;
1388         MEM_NOTRAP_P (mem) = 1;
1389         emit_insn (insn);
1390         return;
1391       }
1392
1393     case SYMBOL_SMALL_TLSGD:
1394       {
1395         rtx_insn *insns;
1396         machine_mode mode = GET_MODE (dest);
1397         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1398
1399         start_sequence ();
1400         if (TARGET_ILP32)
1401           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1402         else
1403           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1404         insns = get_insns ();
1405         end_sequence ();
1406
1407         RTL_CONST_CALL_P (insns) = 1;
1408         emit_libcall_block (insns, dest, result, imm);
1409         return;
1410       }
1411
1412     case SYMBOL_SMALL_TLSDESC:
1413       {
1414         machine_mode mode = GET_MODE (dest);
1415         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1416         rtx tp;
1417
1418         gcc_assert (mode == Pmode || mode == ptr_mode);
1419
1420         /* In ILP32, the got entry is always of SImode size.  Unlike
1421            small GOT, the dest is fixed at reg 0.  */
1422         if (TARGET_ILP32)
1423           emit_insn (gen_tlsdesc_small_si (imm));
1424         else
1425           emit_insn (gen_tlsdesc_small_di (imm));
1426         tp = aarch64_load_tp (NULL);
1427
1428         if (mode != Pmode)
1429           tp = gen_lowpart (mode, tp);
1430
1431         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1432         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1433         return;
1434       }
1435
1436     case SYMBOL_SMALL_TLSIE:
1437       {
1438         /* In ILP32, the mode of dest can be either SImode or DImode,
1439            while the got entry is always of SImode size.  The mode of
1440            dest depends on how dest is used: if dest is assigned to a
1441            pointer (e.g. in the memory), it has SImode; it may have
1442            DImode if dest is dereferenced to access the memeory.
1443            This is why we have to handle three different tlsie_small
1444            patterns here (two patterns for ILP32).  */
1445         machine_mode mode = GET_MODE (dest);
1446         rtx tmp_reg = gen_reg_rtx (mode);
1447         rtx tp = aarch64_load_tp (NULL);
1448
1449         if (mode == ptr_mode)
1450           {
1451             if (mode == DImode)
1452               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1453             else
1454               {
1455                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1456                 tp = gen_lowpart (mode, tp);
1457               }
1458           }
1459         else
1460           {
1461             gcc_assert (mode == Pmode);
1462             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1463           }
1464
1465         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1466         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1467         return;
1468       }
1469
1470     case SYMBOL_TLSLE12:
1471     case SYMBOL_TLSLE24:
1472     case SYMBOL_TLSLE32:
1473     case SYMBOL_TLSLE48:
1474       {
1475         machine_mode mode = GET_MODE (dest);
1476         rtx tp = aarch64_load_tp (NULL);
1477
1478         if (mode != Pmode)
1479           tp = gen_lowpart (mode, tp);
1480
1481         switch (type)
1482           {
1483           case SYMBOL_TLSLE12:
1484             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1485                         (dest, tp, imm));
1486             break;
1487           case SYMBOL_TLSLE24:
1488             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1489                         (dest, tp, imm));
1490           break;
1491           case SYMBOL_TLSLE32:
1492             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1493                         (dest, imm));
1494             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1495                         (dest, dest, tp));
1496           break;
1497           case SYMBOL_TLSLE48:
1498             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1499                         (dest, imm));
1500             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1501                         (dest, dest, tp));
1502             break;
1503           default:
1504             gcc_unreachable ();
1505           }
1506
1507         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1508         return;
1509       }
1510
1511     case SYMBOL_TINY_GOT:
1512       emit_insn (gen_ldr_got_tiny (dest, imm));
1513       return;
1514
1515     case SYMBOL_TINY_TLSIE:
1516       {
1517         machine_mode mode = GET_MODE (dest);
1518         rtx tp = aarch64_load_tp (NULL);
1519
1520         if (mode == ptr_mode)
1521           {
1522             if (mode == DImode)
1523               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1524             else
1525               {
1526                 tp = gen_lowpart (mode, tp);
1527                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1528               }
1529           }
1530         else
1531           {
1532             gcc_assert (mode == Pmode);
1533             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1534           }
1535
1536         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1537         return;
1538       }
1539
1540     default:
1541       gcc_unreachable ();
1542     }
1543 }
1544
1545 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1546    handle all moves if !can_create_pseudo_p ().  The distinction is
1547    important because, unlike emit_move_insn, the move expanders know
1548    how to force Pmode objects into the constant pool even when the
1549    constant pool address is not itself legitimate.  */
1550 static rtx
1551 aarch64_emit_move (rtx dest, rtx src)
1552 {
1553   return (can_create_pseudo_p ()
1554           ? emit_move_insn (dest, src)
1555           : emit_move_insn_1 (dest, src));
1556 }
1557
1558 /* Split a 128-bit move operation into two 64-bit move operations,
1559    taking care to handle partial overlap of register to register
1560    copies.  Special cases are needed when moving between GP regs and
1561    FP regs.  SRC can be a register, constant or memory; DST a register
1562    or memory.  If either operand is memory it must not have any side
1563    effects.  */
1564 void
1565 aarch64_split_128bit_move (rtx dst, rtx src)
1566 {
1567   rtx dst_lo, dst_hi;
1568   rtx src_lo, src_hi;
1569
1570   machine_mode mode = GET_MODE (dst);
1571
1572   gcc_assert (mode == TImode || mode == TFmode);
1573   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1574   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1575
1576   if (REG_P (dst) && REG_P (src))
1577     {
1578       int src_regno = REGNO (src);
1579       int dst_regno = REGNO (dst);
1580
1581       /* Handle FP <-> GP regs.  */
1582       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1583         {
1584           src_lo = gen_lowpart (word_mode, src);
1585           src_hi = gen_highpart (word_mode, src);
1586
1587           if (mode == TImode)
1588             {
1589               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1590               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1591             }
1592           else
1593             {
1594               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1595               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1596             }
1597           return;
1598         }
1599       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1600         {
1601           dst_lo = gen_lowpart (word_mode, dst);
1602           dst_hi = gen_highpart (word_mode, dst);
1603
1604           if (mode == TImode)
1605             {
1606               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1607               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1608             }
1609           else
1610             {
1611               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1612               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1613             }
1614           return;
1615         }
1616     }
1617
1618   dst_lo = gen_lowpart (word_mode, dst);
1619   dst_hi = gen_highpart (word_mode, dst);
1620   src_lo = gen_lowpart (word_mode, src);
1621   src_hi = gen_highpart_mode (word_mode, mode, src);
1622
1623   /* At most one pairing may overlap.  */
1624   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1625     {
1626       aarch64_emit_move (dst_hi, src_hi);
1627       aarch64_emit_move (dst_lo, src_lo);
1628     }
1629   else
1630     {
1631       aarch64_emit_move (dst_lo, src_lo);
1632       aarch64_emit_move (dst_hi, src_hi);
1633     }
1634 }
1635
1636 bool
1637 aarch64_split_128bit_move_p (rtx dst, rtx src)
1638 {
1639   return (! REG_P (src)
1640           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1641 }
1642
1643 /* Split a complex SIMD combine.  */
1644
1645 void
1646 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1647 {
1648   machine_mode src_mode = GET_MODE (src1);
1649   machine_mode dst_mode = GET_MODE (dst);
1650
1651   gcc_assert (VECTOR_MODE_P (dst_mode));
1652
1653   if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1654     {
1655       rtx (*gen) (rtx, rtx, rtx);
1656
1657       switch (src_mode)
1658         {
1659         case V8QImode:
1660           gen = gen_aarch64_simd_combinev8qi;
1661           break;
1662         case V4HImode:
1663           gen = gen_aarch64_simd_combinev4hi;
1664           break;
1665         case V2SImode:
1666           gen = gen_aarch64_simd_combinev2si;
1667           break;
1668         case V4HFmode:
1669           gen = gen_aarch64_simd_combinev4hf;
1670           break;
1671         case V2SFmode:
1672           gen = gen_aarch64_simd_combinev2sf;
1673           break;
1674         case DImode:
1675           gen = gen_aarch64_simd_combinedi;
1676           break;
1677         case DFmode:
1678           gen = gen_aarch64_simd_combinedf;
1679           break;
1680         default:
1681           gcc_unreachable ();
1682         }
1683
1684       emit_insn (gen (dst, src1, src2));
1685       return;
1686     }
1687 }
1688
1689 /* Split a complex SIMD move.  */
1690
1691 void
1692 aarch64_split_simd_move (rtx dst, rtx src)
1693 {
1694   machine_mode src_mode = GET_MODE (src);
1695   machine_mode dst_mode = GET_MODE (dst);
1696
1697   gcc_assert (VECTOR_MODE_P (dst_mode));
1698
1699   if (REG_P (dst) && REG_P (src))
1700     {
1701       rtx (*gen) (rtx, rtx);
1702
1703       gcc_assert (VECTOR_MODE_P (src_mode));
1704
1705       switch (src_mode)
1706         {
1707         case V16QImode:
1708           gen = gen_aarch64_split_simd_movv16qi;
1709           break;
1710         case V8HImode:
1711           gen = gen_aarch64_split_simd_movv8hi;
1712           break;
1713         case V4SImode:
1714           gen = gen_aarch64_split_simd_movv4si;
1715           break;
1716         case V2DImode:
1717           gen = gen_aarch64_split_simd_movv2di;
1718           break;
1719         case V8HFmode:
1720           gen = gen_aarch64_split_simd_movv8hf;
1721           break;
1722         case V4SFmode:
1723           gen = gen_aarch64_split_simd_movv4sf;
1724           break;
1725         case V2DFmode:
1726           gen = gen_aarch64_split_simd_movv2df;
1727           break;
1728         default:
1729           gcc_unreachable ();
1730         }
1731
1732       emit_insn (gen (dst, src));
1733       return;
1734     }
1735 }
1736
1737 bool
1738 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1739                               machine_mode ymode, rtx y)
1740 {
1741   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1742   gcc_assert (r != NULL);
1743   return rtx_equal_p (x, r);
1744 }
1745
1746
1747 static rtx
1748 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1749 {
1750   if (can_create_pseudo_p ())
1751     return force_reg (mode, value);
1752   else
1753     {
1754       x = aarch64_emit_move (x, value);
1755       return x;
1756     }
1757 }
1758
1759
1760 static rtx
1761 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1762 {
1763   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1764     {
1765       rtx high;
1766       /* Load the full offset into a register.  This
1767          might be improvable in the future.  */
1768       high = GEN_INT (offset);
1769       offset = 0;
1770       high = aarch64_force_temporary (mode, temp, high);
1771       reg = aarch64_force_temporary (mode, temp,
1772                                      gen_rtx_PLUS (mode, high, reg));
1773     }
1774   return plus_constant (mode, reg, offset);
1775 }
1776
1777 static int
1778 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1779                                 machine_mode mode)
1780 {
1781   int i;
1782   unsigned HOST_WIDE_INT val, val2, mask;
1783   int one_match, zero_match;
1784   int num_insns;
1785
1786   val = INTVAL (imm);
1787
1788   if (aarch64_move_imm (val, mode))
1789     {
1790       if (generate)
1791         emit_insn (gen_rtx_SET (dest, imm));
1792       return 1;
1793     }
1794
1795   if ((val >> 32) == 0 || mode == SImode)
1796     {
1797       if (generate)
1798         {
1799           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1800           if (mode == SImode)
1801             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1802                                        GEN_INT ((val >> 16) & 0xffff)));
1803           else
1804             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1805                                        GEN_INT ((val >> 16) & 0xffff)));
1806         }
1807       return 2;
1808     }
1809
1810   /* Remaining cases are all for DImode.  */
1811
1812   mask = 0xffff;
1813   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1814     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1815   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1816     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1817
1818   if (zero_match != 2 && one_match != 2)
1819     {
1820       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1821          For a 64-bit bitmask try whether changing 16 bits to all ones or
1822          zeroes creates a valid bitmask.  To check any repeated bitmask,
1823          try using 16 bits from the other 32-bit half of val.  */
1824
1825       for (i = 0; i < 64; i += 16, mask <<= 16)
1826         {
1827           val2 = val & ~mask;
1828           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1829             break;
1830           val2 = val | mask;
1831           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1832             break;
1833           val2 = val2 & ~mask;
1834           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1835           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1836             break;
1837         }
1838       if (i != 64)
1839         {
1840           if (generate)
1841             {
1842               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1843               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1844                                          GEN_INT ((val >> i) & 0xffff)));
1845             }
1846           return 2;
1847         }
1848     }
1849
1850   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1851      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1852      otherwise skip zero bits.  */
1853
1854   num_insns = 1;
1855   mask = 0xffff;
1856   val2 = one_match > zero_match ? ~val : val;
1857   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1858
1859   if (generate)
1860     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1861                                            ? (val | ~(mask << i))
1862                                            : (val & (mask << i)))));
1863   for (i += 16; i < 64; i += 16)
1864     {
1865       if ((val2 & (mask << i)) == 0)
1866         continue;
1867       if (generate)
1868         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1869                                    GEN_INT ((val >> i) & 0xffff)));
1870       num_insns ++;
1871     }
1872
1873   return num_insns;
1874 }
1875
1876
1877 void
1878 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1879 {
1880   machine_mode mode = GET_MODE (dest);
1881
1882   gcc_assert (mode == SImode || mode == DImode);
1883
1884   /* Check on what type of symbol it is.  */
1885   if (GET_CODE (imm) == SYMBOL_REF
1886       || GET_CODE (imm) == LABEL_REF
1887       || GET_CODE (imm) == CONST)
1888     {
1889       rtx mem, base, offset;
1890       enum aarch64_symbol_type sty;
1891
1892       /* If we have (const (plus symbol offset)), separate out the offset
1893          before we start classifying the symbol.  */
1894       split_const (imm, &base, &offset);
1895
1896       sty = aarch64_classify_symbol (base, offset);
1897       switch (sty)
1898         {
1899         case SYMBOL_FORCE_TO_MEM:
1900           if (offset != const0_rtx
1901               && targetm.cannot_force_const_mem (mode, imm))
1902             {
1903               gcc_assert (can_create_pseudo_p ());
1904               base = aarch64_force_temporary (mode, dest, base);
1905               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1906               aarch64_emit_move (dest, base);
1907               return;
1908             }
1909
1910           mem = force_const_mem (ptr_mode, imm);
1911           gcc_assert (mem);
1912
1913           /* If we aren't generating PC relative literals, then
1914              we need to expand the literal pool access carefully.
1915              This is something that needs to be done in a number
1916              of places, so could well live as a separate function.  */
1917           if (!aarch64_pcrelative_literal_loads)
1918             {
1919               gcc_assert (can_create_pseudo_p ());
1920               base = gen_reg_rtx (ptr_mode);
1921               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1922               mem = gen_rtx_MEM (ptr_mode, base);
1923             }
1924
1925           if (mode != ptr_mode)
1926             mem = gen_rtx_ZERO_EXTEND (mode, mem);
1927
1928           emit_insn (gen_rtx_SET (dest, mem));
1929
1930           return;
1931
1932         case SYMBOL_SMALL_TLSGD:
1933         case SYMBOL_SMALL_TLSDESC:
1934         case SYMBOL_SMALL_TLSIE:
1935         case SYMBOL_SMALL_GOT_28K:
1936         case SYMBOL_SMALL_GOT_4G:
1937         case SYMBOL_TINY_GOT:
1938         case SYMBOL_TINY_TLSIE:
1939           if (offset != const0_rtx)
1940             {
1941               gcc_assert(can_create_pseudo_p ());
1942               base = aarch64_force_temporary (mode, dest, base);
1943               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1944               aarch64_emit_move (dest, base);
1945               return;
1946             }
1947           /* FALLTHRU */
1948
1949         case SYMBOL_SMALL_ABSOLUTE:
1950         case SYMBOL_TINY_ABSOLUTE:
1951         case SYMBOL_TLSLE12:
1952         case SYMBOL_TLSLE24:
1953         case SYMBOL_TLSLE32:
1954         case SYMBOL_TLSLE48:
1955           aarch64_load_symref_appropriately (dest, imm, sty);
1956           return;
1957
1958         default:
1959           gcc_unreachable ();
1960         }
1961     }
1962
1963   if (!CONST_INT_P (imm))
1964     {
1965       if (GET_CODE (imm) == HIGH)
1966         emit_insn (gen_rtx_SET (dest, imm));
1967       else
1968         {
1969           rtx mem = force_const_mem (mode, imm);
1970           gcc_assert (mem);
1971           emit_insn (gen_rtx_SET (dest, mem));
1972         }
1973
1974       return;
1975     }
1976
1977   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1978 }
1979
1980 /* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
1981    temporary value if necessary.  FRAME_RELATED_P should be true if
1982    the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1983    to the generated instructions.  If SCRATCHREG is known to hold
1984    abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1985    immediate again.
1986
1987    Since this function may be used to adjust the stack pointer, we must
1988    ensure that it cannot cause transient stack deallocation (for example
1989    by first incrementing SP and then decrementing when adjusting by a
1990    large immediate).  */
1991
1992 static void
1993 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1994                                HOST_WIDE_INT delta, bool frame_related_p,
1995                                bool emit_move_imm)
1996 {
1997   HOST_WIDE_INT mdelta = abs_hwi (delta);
1998   rtx this_rtx = gen_rtx_REG (mode, regnum);
1999   rtx_insn *insn;
2000
2001   if (!mdelta)
2002     return;
2003
2004   /* Single instruction adjustment.  */
2005   if (aarch64_uimm12_shift (mdelta))
2006     {
2007       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2008       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2009       return;
2010     }
2011
2012   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2013      Only do this if mdelta is not a 16-bit move as adjusting using a move
2014      is better.  */
2015   if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2016     {
2017       HOST_WIDE_INT low_off = mdelta & 0xfff;
2018
2019       low_off = delta < 0 ? -low_off : low_off;
2020       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2021       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2022       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2023       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2024       return;
2025     }
2026
2027   /* Emit a move immediate if required and an addition/subtraction.  */
2028   rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2029   if (emit_move_imm)
2030     aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2031   insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2032                               : gen_add2_insn (this_rtx, scratch_rtx));
2033   if (frame_related_p)
2034     {
2035       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2036       rtx adj = plus_constant (mode, this_rtx, delta);
2037       add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2038     }
2039 }
2040
2041 static inline void
2042 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2043                       HOST_WIDE_INT delta)
2044 {
2045   aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2046 }
2047
2048 static inline void
2049 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2050 {
2051   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2052                                  true, emit_move_imm);
2053 }
2054
2055 static inline void
2056 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2057 {
2058   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2059                                  frame_related_p, true);
2060 }
2061
2062 static bool
2063 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2064                                  tree exp ATTRIBUTE_UNUSED)
2065 {
2066   /* Currently, always true.  */
2067   return true;
2068 }
2069
2070 /* Implement TARGET_PASS_BY_REFERENCE.  */
2071
2072 static bool
2073 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2074                            machine_mode mode,
2075                            const_tree type,
2076                            bool named ATTRIBUTE_UNUSED)
2077 {
2078   HOST_WIDE_INT size;
2079   machine_mode dummymode;
2080   int nregs;
2081
2082   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
2083   size = (mode == BLKmode && type)
2084     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2085
2086   /* Aggregates are passed by reference based on their size.  */
2087   if (type && AGGREGATE_TYPE_P (type))
2088     {
2089       size = int_size_in_bytes (type);
2090     }
2091
2092   /* Variable sized arguments are always returned by reference.  */
2093   if (size < 0)
2094     return true;
2095
2096   /* Can this be a candidate to be passed in fp/simd register(s)?  */
2097   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2098                                                &dummymode, &nregs,
2099                                                NULL))
2100     return false;
2101
2102   /* Arguments which are variable sized or larger than 2 registers are
2103      passed by reference unless they are a homogenous floating point
2104      aggregate.  */
2105   return size > 2 * UNITS_PER_WORD;
2106 }
2107
2108 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
2109 static bool
2110 aarch64_return_in_msb (const_tree valtype)
2111 {
2112   machine_mode dummy_mode;
2113   int dummy_int;
2114
2115   /* Never happens in little-endian mode.  */
2116   if (!BYTES_BIG_ENDIAN)
2117     return false;
2118
2119   /* Only composite types smaller than or equal to 16 bytes can
2120      be potentially returned in registers.  */
2121   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2122       || int_size_in_bytes (valtype) <= 0
2123       || int_size_in_bytes (valtype) > 16)
2124     return false;
2125
2126   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2127      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2128      is always passed/returned in the least significant bits of fp/simd
2129      register(s).  */
2130   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2131                                                &dummy_mode, &dummy_int, NULL))
2132     return false;
2133
2134   return true;
2135 }
2136
2137 /* Implement TARGET_FUNCTION_VALUE.
2138    Define how to find the value returned by a function.  */
2139
2140 static rtx
2141 aarch64_function_value (const_tree type, const_tree func,
2142                         bool outgoing ATTRIBUTE_UNUSED)
2143 {
2144   machine_mode mode;
2145   int unsignedp;
2146   int count;
2147   machine_mode ag_mode;
2148
2149   mode = TYPE_MODE (type);
2150   if (INTEGRAL_TYPE_P (type))
2151     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2152
2153   if (aarch64_return_in_msb (type))
2154     {
2155       HOST_WIDE_INT size = int_size_in_bytes (type);
2156
2157       if (size % UNITS_PER_WORD != 0)
2158         {
2159           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2160           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2161         }
2162     }
2163
2164   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2165                                                &ag_mode, &count, NULL))
2166     {
2167       if (!aarch64_composite_type_p (type, mode))
2168         {
2169           gcc_assert (count == 1 && mode == ag_mode);
2170           return gen_rtx_REG (mode, V0_REGNUM);
2171         }
2172       else
2173         {
2174           int i;
2175           rtx par;
2176
2177           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2178           for (i = 0; i < count; i++)
2179             {
2180               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2181               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2182                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2183               XVECEXP (par, 0, i) = tmp;
2184             }
2185           return par;
2186         }
2187     }
2188   else
2189     return gen_rtx_REG (mode, R0_REGNUM);
2190 }
2191
2192 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2193    Return true if REGNO is the number of a hard register in which the values
2194    of called function may come back.  */
2195
2196 static bool
2197 aarch64_function_value_regno_p (const unsigned int regno)
2198 {
2199   /* Maximum of 16 bytes can be returned in the general registers.  Examples
2200      of 16-byte return values are: 128-bit integers and 16-byte small
2201      structures (excluding homogeneous floating-point aggregates).  */
2202   if (regno == R0_REGNUM || regno == R1_REGNUM)
2203     return true;
2204
2205   /* Up to four fp/simd registers can return a function value, e.g. a
2206      homogeneous floating-point aggregate having four members.  */
2207   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2208     return TARGET_FLOAT;
2209
2210   return false;
2211 }
2212
2213 /* Implement TARGET_RETURN_IN_MEMORY.
2214
2215    If the type T of the result of a function is such that
2216      void func (T arg)
2217    would require that arg be passed as a value in a register (or set of
2218    registers) according to the parameter passing rules, then the result
2219    is returned in the same registers as would be used for such an
2220    argument.  */
2221
2222 static bool
2223 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2224 {
2225   HOST_WIDE_INT size;
2226   machine_mode ag_mode;
2227   int count;
2228
2229   if (!AGGREGATE_TYPE_P (type)
2230       && TREE_CODE (type) != COMPLEX_TYPE
2231       && TREE_CODE (type) != VECTOR_TYPE)
2232     /* Simple scalar types always returned in registers.  */
2233     return false;
2234
2235   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2236                                                type,
2237                                                &ag_mode,
2238                                                &count,
2239                                                NULL))
2240     return false;
2241
2242   /* Types larger than 2 registers returned in memory.  */
2243   size = int_size_in_bytes (type);
2244   return (size < 0 || size > 2 * UNITS_PER_WORD);
2245 }
2246
2247 static bool
2248 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2249                                const_tree type, int *nregs)
2250 {
2251   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2252   return aarch64_vfp_is_call_or_return_candidate (mode,
2253                                                   type,
2254                                                   &pcum->aapcs_vfp_rmode,
2255                                                   nregs,
2256                                                   NULL);
2257 }
2258
2259 /* Given MODE and TYPE of a function argument, return the alignment in
2260    bits.  The idea is to suppress any stronger alignment requested by
2261    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2262    This is a helper function for local use only.  */
2263
2264 static unsigned int
2265 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2266 {
2267   if (!type)
2268     return GET_MODE_ALIGNMENT (mode);
2269   if (integer_zerop (TYPE_SIZE (type)))
2270     return 0;
2271
2272   gcc_assert (TYPE_MODE (type) == mode);
2273
2274   if (!AGGREGATE_TYPE_P (type))
2275     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2276
2277   if (TREE_CODE (type) == ARRAY_TYPE)
2278     return TYPE_ALIGN (TREE_TYPE (type));
2279
2280   unsigned int alignment = 0;
2281
2282   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2283     alignment = std::max (alignment, DECL_ALIGN (field));
2284
2285   return alignment;
2286 }
2287
2288 /* Layout a function argument according to the AAPCS64 rules.  The rule
2289    numbers refer to the rule numbers in the AAPCS64.  */
2290
2291 static void
2292 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2293                     const_tree type,
2294                     bool named ATTRIBUTE_UNUSED)
2295 {
2296   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2297   int ncrn, nvrn, nregs;
2298   bool allocate_ncrn, allocate_nvrn;
2299   HOST_WIDE_INT size;
2300
2301   /* We need to do this once per argument.  */
2302   if (pcum->aapcs_arg_processed)
2303     return;
2304
2305   pcum->aapcs_arg_processed = true;
2306
2307   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
2308   size
2309     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2310                 UNITS_PER_WORD);
2311
2312   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2313   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2314                                                  mode,
2315                                                  type,
2316                                                  &nregs);
2317
2318   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2319      The following code thus handles passing by SIMD/FP registers first.  */
2320
2321   nvrn = pcum->aapcs_nvrn;
2322
2323   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2324      and homogenous short-vector aggregates (HVA).  */
2325   if (allocate_nvrn)
2326     {
2327       if (!TARGET_FLOAT)
2328         aarch64_err_no_fpadvsimd (mode, "argument");
2329
2330       if (nvrn + nregs <= NUM_FP_ARG_REGS)
2331         {
2332           pcum->aapcs_nextnvrn = nvrn + nregs;
2333           if (!aarch64_composite_type_p (type, mode))
2334             {
2335               gcc_assert (nregs == 1);
2336               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2337             }
2338           else
2339             {
2340               rtx par;
2341               int i;
2342               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2343               for (i = 0; i < nregs; i++)
2344                 {
2345                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2346                                          V0_REGNUM + nvrn + i);
2347                   tmp = gen_rtx_EXPR_LIST
2348                     (VOIDmode, tmp,
2349                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2350                   XVECEXP (par, 0, i) = tmp;
2351                 }
2352               pcum->aapcs_reg = par;
2353             }
2354           return;
2355         }
2356       else
2357         {
2358           /* C.3 NSRN is set to 8.  */
2359           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2360           goto on_stack;
2361         }
2362     }
2363
2364   ncrn = pcum->aapcs_ncrn;
2365   nregs = size / UNITS_PER_WORD;
2366
2367   /* C6 - C9.  though the sign and zero extension semantics are
2368      handled elsewhere.  This is the case where the argument fits
2369      entirely general registers.  */
2370   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2371     {
2372       unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2373
2374       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2375
2376       /* C.8 if the argument has an alignment of 16 then the NGRN is
2377          rounded up to the next even number.  */
2378       if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2379         {
2380           ++ncrn;
2381           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2382         }
2383       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2384          A reg is still generated for it, but the caller should be smart
2385          enough not to use it.  */
2386       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2387         {
2388           pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2389         }
2390       else
2391         {
2392           rtx par;
2393           int i;
2394
2395           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2396           for (i = 0; i < nregs; i++)
2397             {
2398               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2399               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2400                                        GEN_INT (i * UNITS_PER_WORD));
2401               XVECEXP (par, 0, i) = tmp;
2402             }
2403           pcum->aapcs_reg = par;
2404         }
2405
2406       pcum->aapcs_nextncrn = ncrn + nregs;
2407       return;
2408     }
2409
2410   /* C.11  */
2411   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2412
2413   /* The argument is passed on stack; record the needed number of words for
2414      this argument and align the total size if necessary.  */
2415 on_stack:
2416   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2417   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2418     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2419                                        16 / UNITS_PER_WORD);
2420   return;
2421 }
2422
2423 /* Implement TARGET_FUNCTION_ARG.  */
2424
2425 static rtx
2426 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2427                       const_tree type, bool named)
2428 {
2429   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2430   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2431
2432   if (mode == VOIDmode)
2433     return NULL_RTX;
2434
2435   aarch64_layout_arg (pcum_v, mode, type, named);
2436   return pcum->aapcs_reg;
2437 }
2438
2439 void
2440 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2441                            const_tree fntype ATTRIBUTE_UNUSED,
2442                            rtx libname ATTRIBUTE_UNUSED,
2443                            const_tree fndecl ATTRIBUTE_UNUSED,
2444                            unsigned n_named ATTRIBUTE_UNUSED)
2445 {
2446   pcum->aapcs_ncrn = 0;
2447   pcum->aapcs_nvrn = 0;
2448   pcum->aapcs_nextncrn = 0;
2449   pcum->aapcs_nextnvrn = 0;
2450   pcum->pcs_variant = ARM_PCS_AAPCS64;
2451   pcum->aapcs_reg = NULL_RTX;
2452   pcum->aapcs_arg_processed = false;
2453   pcum->aapcs_stack_words = 0;
2454   pcum->aapcs_stack_size = 0;
2455
2456   if (!TARGET_FLOAT
2457       && fndecl && TREE_PUBLIC (fndecl)
2458       && fntype && fntype != error_mark_node)
2459     {
2460       const_tree type = TREE_TYPE (fntype);
2461       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2462       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2463       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2464                                                    &mode, &nregs, NULL))
2465         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2466     }
2467   return;
2468 }
2469
2470 static void
2471 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2472                               machine_mode mode,
2473                               const_tree type,
2474                               bool named)
2475 {
2476   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2477   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2478     {
2479       aarch64_layout_arg (pcum_v, mode, type, named);
2480       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2481                   != (pcum->aapcs_stack_words != 0));
2482       pcum->aapcs_arg_processed = false;
2483       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2484       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2485       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2486       pcum->aapcs_stack_words = 0;
2487       pcum->aapcs_reg = NULL_RTX;
2488     }
2489 }
2490
2491 bool
2492 aarch64_function_arg_regno_p (unsigned regno)
2493 {
2494   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2495           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2496 }
2497
2498 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2499    PARM_BOUNDARY bits of alignment, but will be given anything up
2500    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2501    that both before and after the layout of each argument, the Next
2502    Stacked Argument Address (NSAA) will have a minimum alignment of
2503    8 bytes.  */
2504
2505 static unsigned int
2506 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2507 {
2508   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2509
2510   if (alignment < PARM_BOUNDARY)
2511     alignment = PARM_BOUNDARY;
2512   if (alignment > STACK_BOUNDARY)
2513     alignment = STACK_BOUNDARY;
2514   return alignment;
2515 }
2516
2517 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2518
2519    Return true if an argument passed on the stack should be padded upwards,
2520    i.e. if the least-significant byte of the stack slot has useful data.
2521
2522    Small aggregate types are placed in the lowest memory address.
2523
2524    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2525
2526 bool
2527 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2528 {
2529   /* On little-endian targets, the least significant byte of every stack
2530      argument is passed at the lowest byte address of the stack slot.  */
2531   if (!BYTES_BIG_ENDIAN)
2532     return true;
2533
2534   /* Otherwise, integral, floating-point and pointer types are padded downward:
2535      the least significant byte of a stack argument is passed at the highest
2536      byte address of the stack slot.  */
2537   if (type
2538       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2539          || POINTER_TYPE_P (type))
2540       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2541     return false;
2542
2543   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2544   return true;
2545 }
2546
2547 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2548
2549    It specifies padding for the last (may also be the only)
2550    element of a block move between registers and memory.  If
2551    assuming the block is in the memory, padding upward means that
2552    the last element is padded after its highest significant byte,
2553    while in downward padding, the last element is padded at the
2554    its least significant byte side.
2555
2556    Small aggregates and small complex types are always padded
2557    upwards.
2558
2559    We don't need to worry about homogeneous floating-point or
2560    short-vector aggregates; their move is not affected by the
2561    padding direction determined here.  Regardless of endianness,
2562    each element of such an aggregate is put in the least
2563    significant bits of a fp/simd register.
2564
2565    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2566    register has useful data, and return the opposite if the most
2567    significant byte does.  */
2568
2569 bool
2570 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2571                      bool first ATTRIBUTE_UNUSED)
2572 {
2573
2574   /* Small composite types are always padded upward.  */
2575   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2576     {
2577       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2578                             : GET_MODE_SIZE (mode));
2579       if (size < 2 * UNITS_PER_WORD)
2580         return true;
2581     }
2582
2583   /* Otherwise, use the default padding.  */
2584   return !BYTES_BIG_ENDIAN;
2585 }
2586
2587 static machine_mode
2588 aarch64_libgcc_cmp_return_mode (void)
2589 {
2590   return SImode;
2591 }
2592
2593 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2594
2595 /* We use the 12-bit shifted immediate arithmetic instructions so values
2596    must be multiple of (1 << 12), i.e. 4096.  */
2597 #define ARITH_FACTOR 4096
2598
2599 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2600 #error Cannot use simple address calculation for stack probing
2601 #endif
2602
2603 /* The pair of scratch registers used for stack probing.  */
2604 #define PROBE_STACK_FIRST_REG  9
2605 #define PROBE_STACK_SECOND_REG 10
2606
2607 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2608    inclusive.  These are offsets from the current stack pointer.  */
2609
2610 static void
2611 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2612 {
2613   rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2614
2615   /* See the same assertion on PROBE_INTERVAL above.  */
2616   gcc_assert ((first % ARITH_FACTOR) == 0);
2617
2618   /* See if we have a constant small number of probes to generate.  If so,
2619      that's the easy case.  */
2620   if (size <= PROBE_INTERVAL)
2621     {
2622       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2623
2624       emit_set_insn (reg1,
2625                      plus_constant (ptr_mode,
2626                                     stack_pointer_rtx, -(first + base)));
2627       emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2628     }
2629
2630   /* The run-time loop is made up of 8 insns in the generic case while the
2631      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2632   else if (size <= 4 * PROBE_INTERVAL)
2633     {
2634       HOST_WIDE_INT i, rem;
2635
2636       emit_set_insn (reg1,
2637                      plus_constant (ptr_mode,
2638                                     stack_pointer_rtx,
2639                                     -(first + PROBE_INTERVAL)));
2640       emit_stack_probe (reg1);
2641
2642       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2643          it exceeds SIZE.  If only two probes are needed, this will not
2644          generate any code.  Then probe at FIRST + SIZE.  */
2645       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2646         {
2647           emit_set_insn (reg1,
2648                          plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2649           emit_stack_probe (reg1);
2650         }
2651
2652       rem = size - (i - PROBE_INTERVAL);
2653       if (rem > 256)
2654         {
2655           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2656
2657           emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2658           emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2659         }
2660       else
2661         emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2662     }
2663
2664   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2665      extra careful with variables wrapping around because we might be at
2666      the very top (or the very bottom) of the address space and we have
2667      to be able to handle this case properly; in particular, we use an
2668      equality test for the loop condition.  */
2669   else
2670     {
2671       rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2672
2673       /* Step 1: round SIZE to the previous multiple of the interval.  */
2674
2675       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2676
2677
2678       /* Step 2: compute initial and final value of the loop counter.  */
2679
2680       /* TEST_ADDR = SP + FIRST.  */
2681       emit_set_insn (reg1,
2682                      plus_constant (ptr_mode, stack_pointer_rtx, -first));
2683
2684       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2685       emit_set_insn (reg2,
2686                      plus_constant (ptr_mode, stack_pointer_rtx,
2687                                     -(first + rounded_size)));
2688
2689
2690       /* Step 3: the loop
2691
2692          do
2693            {
2694              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2695              probe at TEST_ADDR
2696            }
2697          while (TEST_ADDR != LAST_ADDR)
2698
2699          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2700          until it is equal to ROUNDED_SIZE.  */
2701
2702       if (ptr_mode == DImode)
2703         emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2704       else
2705         emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2706
2707
2708       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2709          that SIZE is equal to ROUNDED_SIZE.  */
2710
2711       if (size != rounded_size)
2712         {
2713           HOST_WIDE_INT rem = size - rounded_size;
2714
2715           if (rem > 256)
2716             {
2717               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2718
2719               emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2720               emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2721             }
2722           else
2723             emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2724         }
2725     }
2726
2727   /* Make sure nothing is scheduled before we are done.  */
2728   emit_insn (gen_blockage ());
2729 }
2730
2731 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2732    absolute addresses.  */
2733
2734 const char *
2735 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2736 {
2737   static int labelno = 0;
2738   char loop_lab[32];
2739   rtx xops[2];
2740
2741   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2742
2743   /* Loop.  */
2744   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2745
2746   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2747   xops[0] = reg1;
2748   xops[1] = GEN_INT (PROBE_INTERVAL);
2749   output_asm_insn ("sub\t%0, %0, %1", xops);
2750
2751   /* Probe at TEST_ADDR.  */
2752   output_asm_insn ("str\txzr, [%0]", xops);
2753
2754   /* Test if TEST_ADDR == LAST_ADDR.  */
2755   xops[1] = reg2;
2756   output_asm_insn ("cmp\t%0, %1", xops);
2757
2758   /* Branch.  */
2759   fputs ("\tb.ne\t", asm_out_file);
2760   assemble_name_raw (asm_out_file, loop_lab);
2761   fputc ('\n', asm_out_file);
2762
2763   return "";
2764 }
2765
2766 static bool
2767 aarch64_frame_pointer_required (void)
2768 {
2769   /* In aarch64_override_options_after_change
2770      flag_omit_leaf_frame_pointer turns off the frame pointer by
2771      default.  Turn it back on now if we've not got a leaf
2772      function.  */
2773   if (flag_omit_leaf_frame_pointer
2774       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2775     return true;
2776
2777   /* Force a frame pointer for EH returns so the return address is at FP+8.  */
2778   if (crtl->calls_eh_return)
2779     return true;
2780
2781   return false;
2782 }
2783
2784 /* Mark the registers that need to be saved by the callee and calculate
2785    the size of the callee-saved registers area and frame record (both FP
2786    and LR may be omitted).  */
2787 static void
2788 aarch64_layout_frame (void)
2789 {
2790   HOST_WIDE_INT offset = 0;
2791   int regno, last_fp_reg = INVALID_REGNUM;
2792
2793   if (reload_completed && cfun->machine->frame.laid_out)
2794     return;
2795
2796 #define SLOT_NOT_REQUIRED (-2)
2797 #define SLOT_REQUIRED     (-1)
2798
2799   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2800   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2801
2802   /* First mark all the registers that really need to be saved...  */
2803   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2804     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2805
2806   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2807     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2808
2809   /* ... that includes the eh data registers (if needed)...  */
2810   if (crtl->calls_eh_return)
2811     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2812       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2813         = SLOT_REQUIRED;
2814
2815   /* ... and any callee saved register that dataflow says is live.  */
2816   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2817     if (df_regs_ever_live_p (regno)
2818         && (regno == R30_REGNUM
2819             || !call_used_regs[regno]))
2820       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2821
2822   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2823     if (df_regs_ever_live_p (regno)
2824         && !call_used_regs[regno])
2825       {
2826         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2827         last_fp_reg = regno;
2828       }
2829
2830   if (frame_pointer_needed)
2831     {
2832       /* FP and LR are placed in the linkage record.  */
2833       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2834       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2835       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2836       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2837       offset += 2 * UNITS_PER_WORD;
2838     }
2839
2840   /* Now assign stack slots for them.  */
2841   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2842     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2843       {
2844         cfun->machine->frame.reg_offset[regno] = offset;
2845         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2846           cfun->machine->frame.wb_candidate1 = regno;
2847         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2848           cfun->machine->frame.wb_candidate2 = regno;
2849         offset += UNITS_PER_WORD;
2850       }
2851
2852   HOST_WIDE_INT max_int_offset = offset;
2853   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2854   bool has_align_gap = offset != max_int_offset;
2855
2856   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2857     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2858       {
2859         /* If there is an alignment gap between integer and fp callee-saves,
2860            allocate the last fp register to it if possible.  */
2861         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2862           {
2863             cfun->machine->frame.reg_offset[regno] = max_int_offset;
2864             break;
2865           }
2866
2867         cfun->machine->frame.reg_offset[regno] = offset;
2868         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2869           cfun->machine->frame.wb_candidate1 = regno;
2870         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2871                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2872           cfun->machine->frame.wb_candidate2 = regno;
2873         offset += UNITS_PER_WORD;
2874       }
2875
2876   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2877
2878   cfun->machine->frame.saved_regs_size = offset;
2879
2880   HOST_WIDE_INT varargs_and_saved_regs_size
2881     = offset + cfun->machine->frame.saved_varargs_size;
2882
2883   cfun->machine->frame.hard_fp_offset
2884     = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2885                 STACK_BOUNDARY / BITS_PER_UNIT);
2886
2887   cfun->machine->frame.frame_size
2888     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2889                 + crtl->outgoing_args_size,
2890                 STACK_BOUNDARY / BITS_PER_UNIT);
2891
2892   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2893
2894   cfun->machine->frame.initial_adjust = 0;
2895   cfun->machine->frame.final_adjust = 0;
2896   cfun->machine->frame.callee_adjust = 0;
2897   cfun->machine->frame.callee_offset = 0;
2898
2899   HOST_WIDE_INT max_push_offset = 0;
2900   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2901     max_push_offset = 512;
2902   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2903     max_push_offset = 256;
2904
2905   if (cfun->machine->frame.frame_size < max_push_offset
2906       && crtl->outgoing_args_size == 0)
2907     {
2908       /* Simple, small frame with no outgoing arguments:
2909          stp reg1, reg2, [sp, -frame_size]!
2910          stp reg3, reg4, [sp, 16]  */
2911       cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2912     }
2913   else if ((crtl->outgoing_args_size
2914             + cfun->machine->frame.saved_regs_size < 512)
2915            && !(cfun->calls_alloca
2916                 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2917     {
2918       /* Frame with small outgoing arguments:
2919          sub sp, sp, frame_size
2920          stp reg1, reg2, [sp, outgoing_args_size]
2921          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
2922       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2923       cfun->machine->frame.callee_offset
2924         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2925     }
2926   else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2927     {
2928       /* Frame with large outgoing arguments but a small local area:
2929          stp reg1, reg2, [sp, -hard_fp_offset]!
2930          stp reg3, reg4, [sp, 16]
2931          sub sp, sp, outgoing_args_size  */
2932       cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2933       cfun->machine->frame.final_adjust
2934         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2935     }
2936   else if (!frame_pointer_needed
2937            && varargs_and_saved_regs_size < max_push_offset)
2938     {
2939       /* Frame with large local area and outgoing arguments (this pushes the
2940          callee-saves first, followed by the locals and outgoing area):
2941          stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2942          stp reg3, reg4, [sp, 16]
2943          sub sp, sp, frame_size - varargs_and_saved_regs_size  */
2944       cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2945       cfun->machine->frame.final_adjust
2946         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2947       cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2948       cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2949     }
2950   else
2951     {
2952       /* Frame with large local area and outgoing arguments using frame pointer:
2953          sub sp, sp, hard_fp_offset
2954          stp x29, x30, [sp, 0]
2955          add x29, sp, 0
2956          stp reg3, reg4, [sp, 16]
2957          sub sp, sp, outgoing_args_size  */
2958       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2959       cfun->machine->frame.final_adjust
2960         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2961     }
2962
2963   cfun->machine->frame.laid_out = true;
2964 }
2965
2966 /* Return true if the register REGNO is saved on entry to
2967    the current function.  */
2968
2969 static bool
2970 aarch64_register_saved_on_entry (int regno)
2971 {
2972   return cfun->machine->frame.reg_offset[regno] >= 0;
2973 }
2974
2975 /* Return the next register up from REGNO up to LIMIT for the callee
2976    to save.  */
2977
2978 static unsigned
2979 aarch64_next_callee_save (unsigned regno, unsigned limit)
2980 {
2981   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2982     regno ++;
2983   return regno;
2984 }
2985
2986 /* Push the register number REGNO of mode MODE to the stack with write-back
2987    adjusting the stack by ADJUSTMENT.  */
2988
2989 static void
2990 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2991                            HOST_WIDE_INT adjustment)
2992  {
2993   rtx base_rtx = stack_pointer_rtx;
2994   rtx insn, reg, mem;
2995
2996   reg = gen_rtx_REG (mode, regno);
2997   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2998                             plus_constant (Pmode, base_rtx, -adjustment));
2999   mem = gen_rtx_MEM (mode, mem);
3000
3001   insn = emit_move_insn (mem, reg);
3002   RTX_FRAME_RELATED_P (insn) = 1;
3003 }
3004
3005 /* Generate and return an instruction to store the pair of registers
3006    REG and REG2 of mode MODE to location BASE with write-back adjusting
3007    the stack location BASE by ADJUSTMENT.  */
3008
3009 static rtx
3010 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3011                           HOST_WIDE_INT adjustment)
3012 {
3013   switch (mode)
3014     {
3015     case DImode:
3016       return gen_storewb_pairdi_di (base, base, reg, reg2,
3017                                     GEN_INT (-adjustment),
3018                                     GEN_INT (UNITS_PER_WORD - adjustment));
3019     case DFmode:
3020       return gen_storewb_pairdf_di (base, base, reg, reg2,
3021                                     GEN_INT (-adjustment),
3022                                     GEN_INT (UNITS_PER_WORD - adjustment));
3023     default:
3024       gcc_unreachable ();
3025     }
3026 }
3027
3028 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3029    stack pointer by ADJUSTMENT.  */
3030
3031 static void
3032 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3033 {
3034   rtx_insn *insn;
3035   machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3036
3037   if (regno2 == INVALID_REGNUM)
3038     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3039
3040   rtx reg1 = gen_rtx_REG (mode, regno1);
3041   rtx reg2 = gen_rtx_REG (mode, regno2);
3042
3043   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3044                                               reg2, adjustment));
3045   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3046   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3047   RTX_FRAME_RELATED_P (insn) = 1;
3048 }
3049
3050 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3051    adjusting it by ADJUSTMENT afterwards.  */
3052
3053 static rtx
3054 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3055                          HOST_WIDE_INT adjustment)
3056 {
3057   switch (mode)
3058     {
3059     case DImode:
3060       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3061                                    GEN_INT (UNITS_PER_WORD));
3062     case DFmode:
3063       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3064                                    GEN_INT (UNITS_PER_WORD));
3065     default:
3066       gcc_unreachable ();
3067     }
3068 }
3069
3070 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3071    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3072    into CFI_OPS.  */
3073
3074 static void
3075 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3076                   rtx *cfi_ops)
3077 {
3078   machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3079   rtx reg1 = gen_rtx_REG (mode, regno1);
3080
3081   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3082
3083   if (regno2 == INVALID_REGNUM)
3084     {
3085       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3086       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3087       emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3088     }
3089   else
3090     {
3091       rtx reg2 = gen_rtx_REG (mode, regno2);
3092       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3093       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3094                                           reg2, adjustment));
3095     }
3096 }
3097
3098 /* Generate and return a store pair instruction of mode MODE to store
3099    register REG1 to MEM1 and register REG2 to MEM2.  */
3100
3101 static rtx
3102 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3103                         rtx reg2)
3104 {
3105   switch (mode)
3106     {
3107     case DImode:
3108       return gen_store_pairdi (mem1, reg1, mem2, reg2);
3109
3110     case DFmode:
3111       return gen_store_pairdf (mem1, reg1, mem2, reg2);
3112
3113     default:
3114       gcc_unreachable ();
3115     }
3116 }
3117
3118 /* Generate and regurn a load pair isntruction of mode MODE to load register
3119    REG1 from MEM1 and register REG2 from MEM2.  */
3120
3121 static rtx
3122 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3123                        rtx mem2)
3124 {
3125   switch (mode)
3126     {
3127     case DImode:
3128       return gen_load_pairdi (reg1, mem1, reg2, mem2);
3129
3130     case DFmode:
3131       return gen_load_pairdf (reg1, mem1, reg2, mem2);
3132
3133     default:
3134       gcc_unreachable ();
3135     }
3136 }
3137
3138 /* Return TRUE if return address signing should be enabled for the current
3139    function, otherwise return FALSE.  */
3140
3141 bool
3142 aarch64_return_address_signing_enabled (void)
3143 {
3144   /* This function should only be called after frame laid out.   */
3145   gcc_assert (cfun->machine->frame.laid_out);
3146
3147   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3148      if it's LR is pushed onto stack.  */
3149   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3150           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3151               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3152 }
3153
3154 /* Emit code to save the callee-saved registers from register number START
3155    to LIMIT to the stack at the location starting at offset START_OFFSET,
3156    skipping any write-back candidates if SKIP_WB is true.  */
3157
3158 static void
3159 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3160                            unsigned start, unsigned limit, bool skip_wb)
3161 {
3162   rtx_insn *insn;
3163   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3164                                                  ? gen_frame_mem : gen_rtx_MEM);
3165   unsigned regno;
3166   unsigned regno2;
3167
3168   for (regno = aarch64_next_callee_save (start, limit);
3169        regno <= limit;
3170        regno = aarch64_next_callee_save (regno + 1, limit))
3171     {
3172       rtx reg, mem;
3173       HOST_WIDE_INT offset;
3174
3175       if (skip_wb
3176           && (regno == cfun->machine->frame.wb_candidate1
3177               || regno == cfun->machine->frame.wb_candidate2))
3178         continue;
3179
3180       if (cfun->machine->reg_is_wrapped_separately[regno])
3181        continue;
3182
3183       reg = gen_rtx_REG (mode, regno);
3184       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3185       mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3186                                               offset));
3187
3188       regno2 = aarch64_next_callee_save (regno + 1, limit);
3189
3190       if (regno2 <= limit
3191           && !cfun->machine->reg_is_wrapped_separately[regno2]
3192           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3193               == cfun->machine->frame.reg_offset[regno2]))
3194
3195         {
3196           rtx reg2 = gen_rtx_REG (mode, regno2);
3197           rtx mem2;
3198
3199           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3200           mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3201                                                    offset));
3202           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3203                                                     reg2));
3204
3205           /* The first part of a frame-related parallel insn is
3206              always assumed to be relevant to the frame
3207              calculations; subsequent parts, are only
3208              frame-related if explicitly marked.  */
3209           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3210           regno = regno2;
3211         }
3212       else
3213         insn = emit_move_insn (mem, reg);
3214
3215       RTX_FRAME_RELATED_P (insn) = 1;
3216     }
3217 }
3218
3219 /* Emit code to restore the callee registers of mode MODE from register
3220    number START up to and including LIMIT.  Restore from the stack offset
3221    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3222    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
3223
3224 static void
3225 aarch64_restore_callee_saves (machine_mode mode,
3226                               HOST_WIDE_INT start_offset, unsigned start,
3227                               unsigned limit, bool skip_wb, rtx *cfi_ops)
3228 {
3229   rtx base_rtx = stack_pointer_rtx;
3230   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3231                                                  ? gen_frame_mem : gen_rtx_MEM);
3232   unsigned regno;
3233   unsigned regno2;
3234   HOST_WIDE_INT offset;
3235
3236   for (regno = aarch64_next_callee_save (start, limit);
3237        regno <= limit;
3238        regno = aarch64_next_callee_save (regno + 1, limit))
3239     {
3240       if (cfun->machine->reg_is_wrapped_separately[regno])
3241        continue;
3242
3243       rtx reg, mem;
3244
3245       if (skip_wb
3246           && (regno == cfun->machine->frame.wb_candidate1
3247               || regno == cfun->machine->frame.wb_candidate2))
3248         continue;
3249
3250       reg = gen_rtx_REG (mode, regno);
3251       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3252       mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3253
3254       regno2 = aarch64_next_callee_save (regno + 1, limit);
3255
3256       if (regno2 <= limit
3257           && !cfun->machine->reg_is_wrapped_separately[regno2]
3258           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3259               == cfun->machine->frame.reg_offset[regno2]))
3260         {
3261           rtx reg2 = gen_rtx_REG (mode, regno2);
3262           rtx mem2;
3263
3264           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3265           mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3266           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3267
3268           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3269           regno = regno2;
3270         }
3271       else
3272         emit_move_insn (reg, mem);
3273       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3274     }
3275 }
3276
3277 static inline bool
3278 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3279                                HOST_WIDE_INT offset)
3280 {
3281   return offset >= -256 && offset < 256;
3282 }
3283
3284 static inline bool
3285 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3286 {
3287   return (offset >= 0
3288           && offset < 4096 * GET_MODE_SIZE (mode)
3289           && offset % GET_MODE_SIZE (mode) == 0);
3290 }
3291
3292 bool
3293 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3294 {
3295   return (offset >= -64 * GET_MODE_SIZE (mode)
3296           && offset < 64 * GET_MODE_SIZE (mode)
3297           && offset % GET_MODE_SIZE (mode) == 0);
3298 }
3299
3300 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
3301
3302 static sbitmap
3303 aarch64_get_separate_components (void)
3304 {
3305   aarch64_layout_frame ();
3306
3307   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3308   bitmap_clear (components);
3309
3310   /* The registers we need saved to the frame.  */
3311   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3312     if (aarch64_register_saved_on_entry (regno))
3313       {
3314         HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3315         if (!frame_pointer_needed)
3316           offset += cfun->machine->frame.frame_size
3317                     - cfun->machine->frame.hard_fp_offset;
3318         /* Check that we can access the stack slot of the register with one
3319            direct load with no adjustments needed.  */
3320         if (offset_12bit_unsigned_scaled_p (DImode, offset))
3321           bitmap_set_bit (components, regno);
3322       }
3323
3324   /* Don't mess with the hard frame pointer.  */
3325   if (frame_pointer_needed)
3326     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3327
3328   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3329   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3330   /* If aarch64_layout_frame has chosen registers to store/restore with
3331      writeback don't interfere with them to avoid having to output explicit
3332      stack adjustment instructions.  */
3333   if (reg2 != INVALID_REGNUM)
3334     bitmap_clear_bit (components, reg2);
3335   if (reg1 != INVALID_REGNUM)
3336     bitmap_clear_bit (components, reg1);
3337
3338   bitmap_clear_bit (components, LR_REGNUM);
3339   bitmap_clear_bit (components, SP_REGNUM);
3340
3341   return components;
3342 }
3343
3344 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
3345
3346 static sbitmap
3347 aarch64_components_for_bb (basic_block bb)
3348 {
3349   bitmap in = DF_LIVE_IN (bb);
3350   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3351   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3352
3353   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3354   bitmap_clear (components);
3355
3356   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
3357   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3358     if ((!call_used_regs[regno])
3359        && (bitmap_bit_p (in, regno)
3360            || bitmap_bit_p (gen, regno)
3361            || bitmap_bit_p (kill, regno)))
3362           bitmap_set_bit (components, regno);
3363
3364   return components;
3365 }
3366
3367 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3368    Nothing to do for aarch64.  */
3369
3370 static void
3371 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3372 {
3373 }
3374
3375 /* Return the next set bit in BMP from START onwards.  Return the total number
3376    of bits in BMP if no set bit is found at or after START.  */
3377
3378 static unsigned int
3379 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3380 {
3381   unsigned int nbits = SBITMAP_SIZE (bmp);
3382   if (start == nbits)
3383     return start;
3384
3385   gcc_assert (start < nbits);
3386   for (unsigned int i = start; i < nbits; i++)
3387     if (bitmap_bit_p (bmp, i))
3388       return i;
3389
3390   return nbits;
3391 }
3392
3393 /* Do the work for aarch64_emit_prologue_components and
3394    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
3395    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3396    for these components or the epilogue sequence.  That is, it determines
3397    whether we should emit stores or loads and what kind of CFA notes to attach
3398    to the insns.  Otherwise the logic for the two sequences is very
3399    similar.  */
3400
3401 static void
3402 aarch64_process_components (sbitmap components, bool prologue_p)
3403 {
3404   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3405                              ? HARD_FRAME_POINTER_REGNUM
3406                              : STACK_POINTER_REGNUM);
3407
3408   unsigned last_regno = SBITMAP_SIZE (components);
3409   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3410   rtx_insn *insn = NULL;
3411
3412   while (regno != last_regno)
3413     {
3414       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3415          so DFmode for the vector registers is enough.  */
3416       machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3417       rtx reg = gen_rtx_REG (mode, regno);
3418       HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3419       if (!frame_pointer_needed)
3420         offset += cfun->machine->frame.frame_size
3421                   - cfun->machine->frame.hard_fp_offset;
3422       rtx addr = plus_constant (Pmode, ptr_reg, offset);
3423       rtx mem = gen_frame_mem (mode, addr);
3424
3425       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3426       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3427       /* No more registers to handle after REGNO.
3428          Emit a single save/restore and exit.  */
3429       if (regno2 == last_regno)
3430         {
3431           insn = emit_insn (set);
3432           RTX_FRAME_RELATED_P (insn) = 1;
3433           if (prologue_p)
3434             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3435           else
3436             add_reg_note (insn, REG_CFA_RESTORE, reg);
3437           break;
3438         }
3439
3440       HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3441       /* The next register is not of the same class or its offset is not
3442          mergeable with the current one into a pair.  */
3443       if (!satisfies_constraint_Ump (mem)
3444           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3445           || (offset2 - cfun->machine->frame.reg_offset[regno])
3446                 != GET_MODE_SIZE (mode))
3447         {
3448           insn = emit_insn (set);
3449           RTX_FRAME_RELATED_P (insn) = 1;
3450           if (prologue_p)
3451             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3452           else
3453             add_reg_note (insn, REG_CFA_RESTORE, reg);
3454
3455           regno = regno2;
3456           continue;
3457         }
3458
3459       /* REGNO2 can be saved/restored in a pair with REGNO.  */
3460       rtx reg2 = gen_rtx_REG (mode, regno2);
3461       if (!frame_pointer_needed)
3462         offset2 += cfun->machine->frame.frame_size
3463                   - cfun->machine->frame.hard_fp_offset;
3464       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3465       rtx mem2 = gen_frame_mem (mode, addr2);
3466       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3467                              : gen_rtx_SET (reg2, mem2);
3468
3469       if (prologue_p)
3470         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3471       else
3472         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3473
3474       RTX_FRAME_RELATED_P (insn) = 1;
3475       if (prologue_p)
3476         {
3477           add_reg_note (insn, REG_CFA_OFFSET, set);
3478           add_reg_note (insn, REG_CFA_OFFSET, set2);
3479         }
3480       else
3481         {
3482           add_reg_note (insn, REG_CFA_RESTORE, reg);
3483           add_reg_note (insn, REG_CFA_RESTORE, reg2);
3484         }
3485
3486       regno = aarch64_get_next_set_bit (components, regno2 + 1);
3487     }
3488 }
3489
3490 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
3491
3492 static void
3493 aarch64_emit_prologue_components (sbitmap components)
3494 {
3495   aarch64_process_components (components, true);
3496 }
3497
3498 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
3499
3500 static void
3501 aarch64_emit_epilogue_components (sbitmap components)
3502 {
3503   aarch64_process_components (components, false);
3504 }
3505
3506 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
3507
3508 static void
3509 aarch64_set_handled_components (sbitmap components)
3510 {
3511   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3512     if (bitmap_bit_p (components, regno))
3513       cfun->machine->reg_is_wrapped_separately[regno] = true;
3514 }
3515
3516 /* AArch64 stack frames generated by this compiler look like:
3517
3518         +-------------------------------+
3519         |                               |
3520         |  incoming stack arguments     |
3521         |                               |
3522         +-------------------------------+
3523         |                               | <-- incoming stack pointer (aligned)
3524         |  callee-allocated save area   |
3525         |  for register varargs         |
3526         |                               |
3527         +-------------------------------+
3528         |  local variables              | <-- frame_pointer_rtx
3529         |                               |
3530         +-------------------------------+
3531         |  padding0                     | \
3532         +-------------------------------+  |
3533         |  callee-saved registers       |  | frame.saved_regs_size
3534         +-------------------------------+  |
3535         |  LR'                          |  |
3536         +-------------------------------+  |
3537         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
3538         +-------------------------------+
3539         |  dynamic allocation           |
3540         +-------------------------------+
3541         |  padding                      |
3542         +-------------------------------+
3543         |  outgoing stack arguments     | <-- arg_pointer
3544         |                               |
3545         +-------------------------------+
3546         |                               | <-- stack_pointer_rtx (aligned)
3547
3548    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3549    but leave frame_pointer_rtx and hard_frame_pointer_rtx
3550    unchanged.  */
3551
3552 /* Generate the prologue instructions for entry into a function.
3553    Establish the stack frame by decreasing the stack pointer with a
3554    properly calculated size and, if necessary, create a frame record
3555    filled with the values of LR and previous frame pointer.  The
3556    current FP is also set up if it is in use.  */
3557
3558 void
3559 aarch64_expand_prologue (void)
3560 {
3561   aarch64_layout_frame ();
3562
3563   HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3564   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3565   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3566   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3567   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3568   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3569   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3570   rtx_insn *insn;
3571
3572   /* Sign return address for functions.  */
3573   if (aarch64_return_address_signing_enabled ())
3574     {
3575       insn = emit_insn (gen_pacisp ());
3576       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3577       RTX_FRAME_RELATED_P (insn) = 1;
3578     }
3579
3580   if (flag_stack_usage_info)
3581     current_function_static_stack_size = frame_size;
3582
3583   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3584     {
3585       if (crtl->is_leaf && !cfun->calls_alloca)
3586         {
3587           if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3588             aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3589                                             frame_size - STACK_CHECK_PROTECT);
3590         }
3591       else if (frame_size > 0)
3592         aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3593     }
3594
3595   aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3596
3597   if (callee_adjust != 0)
3598     aarch64_push_regs (reg1, reg2, callee_adjust);
3599
3600   if (frame_pointer_needed)
3601     {
3602       if (callee_adjust == 0)
3603         aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3604                                    R30_REGNUM, false);
3605       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3606                                        stack_pointer_rtx,
3607                                        GEN_INT (callee_offset)));
3608       RTX_FRAME_RELATED_P (insn) = 1;
3609       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3610     }
3611
3612   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3613                              callee_adjust != 0 || frame_pointer_needed);
3614   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3615                              callee_adjust != 0 || frame_pointer_needed);
3616   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3617 }
3618
3619 /* Return TRUE if we can use a simple_return insn.
3620
3621    This function checks whether the callee saved stack is empty, which
3622    means no restore actions are need. The pro_and_epilogue will use
3623    this to check whether shrink-wrapping opt is feasible.  */
3624
3625 bool
3626 aarch64_use_return_insn_p (void)
3627 {
3628   if (!reload_completed)
3629     return false;
3630
3631   if (crtl->profile)
3632     return false;
3633
3634   aarch64_layout_frame ();
3635
3636   return cfun->machine->frame.frame_size == 0;
3637 }
3638
3639 /* Generate the epilogue instructions for returning from a function.
3640    This is almost exactly the reverse of the prolog sequence, except
3641    that we need to insert barriers to avoid scheduling loads that read
3642    from a deallocated stack, and we optimize the unwind records by
3643    emitting them all together if possible.  */
3644 void
3645 aarch64_expand_epilogue (bool for_sibcall)
3646 {
3647   aarch64_layout_frame ();
3648
3649   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3650   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3651   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3652   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3653   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3654   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3655   rtx cfi_ops = NULL;
3656   rtx_insn *insn;
3657
3658   /* We need to add memory barrier to prevent read from deallocated stack.  */
3659   bool need_barrier_p = (get_frame_size ()
3660                          + cfun->machine->frame.saved_varargs_size) != 0;
3661
3662   /* Emit a barrier to prevent loads from a deallocated stack.  */
3663   if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3664       || crtl->calls_eh_return)
3665     {
3666       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3667       need_barrier_p = false;
3668     }
3669
3670   /* Restore the stack pointer from the frame pointer if it may not
3671      be the same as the stack pointer.  */
3672   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3673     {
3674       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3675                                        hard_frame_pointer_rtx,
3676                                        GEN_INT (-callee_offset)));
3677       /* If writeback is used when restoring callee-saves, the CFA
3678          is restored on the instruction doing the writeback.  */
3679       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3680     }
3681   else
3682     aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3683
3684   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3685                                 callee_adjust != 0, &cfi_ops);
3686   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3687                                 callee_adjust != 0, &cfi_ops);
3688
3689   if (need_barrier_p)
3690     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3691
3692   if (callee_adjust != 0)
3693     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3694
3695   if (callee_adjust != 0 || initial_adjust > 65536)
3696     {
3697       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
3698       insn = get_last_insn ();
3699       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3700       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3701       RTX_FRAME_RELATED_P (insn) = 1;
3702       cfi_ops = NULL;
3703     }
3704
3705   aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3706
3707   if (cfi_ops)
3708     {
3709       /* Emit delayed restores and reset the CFA to be SP.  */
3710       insn = get_last_insn ();
3711       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3712       REG_NOTES (insn) = cfi_ops;
3713       RTX_FRAME_RELATED_P (insn) = 1;
3714     }
3715
3716   /* We prefer to emit the combined return/authenticate instruction RETAA,
3717      however there are three cases in which we must instead emit an explicit
3718      authentication instruction.
3719
3720         1) Sibcalls don't return in a normal way, so if we're about to call one
3721            we must authenticate.
3722
3723         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3724            generating code for !TARGET_ARMV8_3 we can't use it and must
3725            explicitly authenticate.
3726
3727         3) On an eh_return path we make extra stack adjustments to update the
3728            canonical frame address to be the exception handler's CFA.  We want
3729            to authenticate using the CFA of the function which calls eh_return.
3730     */
3731   if (aarch64_return_address_signing_enabled ()
3732       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3733     {
3734       insn = emit_insn (gen_autisp ());
3735       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3736       RTX_FRAME_RELATED_P (insn) = 1;
3737     }
3738
3739   /* Stack adjustment for exception handler.  */
3740   if (crtl->calls_eh_return)
3741     {
3742       /* We need to unwind the stack by the offset computed by
3743          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3744          to be SP; letting the CFA move during this adjustment
3745          is just as correct as retaining the CFA from the body
3746          of the function.  Therefore, do nothing special.  */
3747       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3748     }
3749
3750   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3751   if (!for_sibcall)
3752     emit_jump_insn (ret_rtx);
3753 }
3754
3755 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
3756    normally or return to a previous frame after unwinding.
3757
3758    An EH return uses a single shared return sequence.  The epilogue is
3759    exactly like a normal epilogue except that it has an extra input
3760    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3761    that must be applied after the frame has been destroyed.  An extra label
3762    is inserted before the epilogue which initializes this register to zero,
3763    and this is the entry point for a normal return.
3764
3765    An actual EH return updates the return address, initializes the stack
3766    adjustment and jumps directly into the epilogue (bypassing the zeroing
3767    of the adjustment).  Since the return address is typically saved on the
3768    stack when a function makes a call, the saved LR must be updated outside
3769    the epilogue.
3770
3771    This poses problems as the store is generated well before the epilogue,
3772    so the offset of LR is not known yet.  Also optimizations will remove the
3773    store as it appears dead, even after the epilogue is generated (as the
3774    base or offset for loading LR is different in many cases).
3775
3776    To avoid these problems this implementation forces the frame pointer
3777    in eh_return functions so that the location of LR is fixed and known early.
3778    It also marks the store volatile, so no optimization is permitted to
3779    remove the store.  */
3780 rtx
3781 aarch64_eh_return_handler_rtx (void)
3782 {
3783   rtx tmp = gen_frame_mem (Pmode,
3784     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3785
3786   /* Mark the store volatile, so no optimization is permitted to remove it.  */
3787   MEM_VOLATILE_P (tmp) = true;
3788   return tmp;
3789 }
3790
3791 /* Output code to add DELTA to the first argument, and then jump
3792    to FUNCTION.  Used for C++ multiple inheritance.  */
3793 static void
3794 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3795                          HOST_WIDE_INT delta,
3796                          HOST_WIDE_INT vcall_offset,
3797                          tree function)
3798 {
3799   /* The this pointer is always in x0.  Note that this differs from
3800      Arm where the this pointer maybe bumped to r1 if r0 is required
3801      to return a pointer to an aggregate.  On AArch64 a result value
3802      pointer will be in x8.  */
3803   int this_regno = R0_REGNUM;
3804   rtx this_rtx, temp0, temp1, addr, funexp;
3805   rtx_insn *insn;
3806
3807   reload_completed = 1;
3808   emit_note (NOTE_INSN_PROLOGUE_END);
3809
3810   if (vcall_offset == 0)
3811     aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3812   else
3813     {
3814       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3815
3816       this_rtx = gen_rtx_REG (Pmode, this_regno);
3817       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3818       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3819
3820       addr = this_rtx;
3821       if (delta != 0)
3822         {
3823           if (delta >= -256 && delta < 256)
3824             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3825                                        plus_constant (Pmode, this_rtx, delta));
3826           else
3827             aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3828         }
3829
3830       if (Pmode == ptr_mode)
3831         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3832       else
3833         aarch64_emit_move (temp0,
3834                            gen_rtx_ZERO_EXTEND (Pmode,
3835                                                 gen_rtx_MEM (ptr_mode, addr)));
3836
3837       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3838           addr = plus_constant (Pmode, temp0, vcall_offset);
3839       else
3840         {
3841           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3842                                           Pmode);
3843           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3844         }
3845
3846       if (Pmode == ptr_mode)
3847         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3848       else
3849         aarch64_emit_move (temp1,
3850                            gen_rtx_SIGN_EXTEND (Pmode,
3851                                                 gen_rtx_MEM (ptr_mode, addr)));
3852
3853       emit_insn (gen_add2_insn (this_rtx, temp1));
3854     }
3855
3856   /* Generate a tail call to the target function.  */
3857   if (!TREE_USED (function))
3858     {
3859       assemble_external (function);
3860       TREE_USED (function) = 1;
3861     }
3862   funexp = XEXP (DECL_RTL (function), 0);
3863   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3864   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3865   SIBLING_CALL_P (insn) = 1;
3866
3867   insn = get_insns ();
3868   shorten_branches (insn);
3869   final_start_function (insn, file, 1);
3870   final (insn, file, 1);
3871   final_end_function ();
3872
3873   /* Stop pretending to be a post-reload pass.  */
3874   reload_completed = 0;
3875 }
3876
3877 static bool
3878 aarch64_tls_referenced_p (rtx x)
3879 {
3880   if (!TARGET_HAVE_TLS)
3881     return false;
3882   subrtx_iterator::array_type array;
3883   FOR_EACH_SUBRTX (iter, array, x, ALL)
3884     {
3885       const_rtx x = *iter;
3886       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3887         return true;
3888       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3889          TLS offsets, not real symbol references.  */
3890       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3891         iter.skip_subrtxes ();
3892     }
3893   return false;
3894 }
3895
3896
3897 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3898    a left shift of 0 or 12 bits.  */
3899 bool
3900 aarch64_uimm12_shift (HOST_WIDE_INT val)
3901 {
3902   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3903           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3904           );
3905 }
3906
3907
3908 /* Return true if val is an immediate that can be loaded into a
3909    register by a MOVZ instruction.  */
3910 static bool
3911 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3912 {
3913   if (GET_MODE_SIZE (mode) > 4)
3914     {
3915       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3916           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3917         return 1;
3918     }
3919   else
3920     {
3921       /* Ignore sign extension.  */
3922       val &= (HOST_WIDE_INT) 0xffffffff;
3923     }
3924   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3925           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3926 }
3927
3928 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
3929
3930 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3931   {
3932     0x0000000100000001ull,
3933     0x0001000100010001ull,
3934     0x0101010101010101ull,
3935     0x1111111111111111ull,
3936     0x5555555555555555ull,
3937   };
3938
3939
3940 /* Return true if val is a valid bitmask immediate.  */
3941
3942 bool
3943 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3944 {
3945   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3946   int bits;
3947
3948   /* Check for a single sequence of one bits and return quickly if so.
3949      The special cases of all ones and all zeroes returns false.  */
3950   val = (unsigned HOST_WIDE_INT) val_in;
3951   tmp = val + (val & -val);
3952
3953   if (tmp == (tmp & -tmp))
3954     return (val + 1) > 1;
3955
3956   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
3957   if (mode == SImode)
3958     val = (val << 32) | (val & 0xffffffff);
3959
3960   /* Invert if the immediate doesn't start with a zero bit - this means we
3961      only need to search for sequences of one bits.  */
3962   if (val & 1)
3963     val = ~val;
3964
3965   /* Find the first set bit and set tmp to val with the first sequence of one
3966      bits removed.  Return success if there is a single sequence of ones.  */
3967   first_one = val & -val;
3968   tmp = val & (val + first_one);
3969
3970   if (tmp == 0)
3971     return true;
3972
3973   /* Find the next set bit and compute the difference in bit position.  */
3974   next_one = tmp & -tmp;
3975   bits = clz_hwi (first_one) - clz_hwi (next_one);
3976   mask = val ^ tmp;
3977
3978   /* Check the bit position difference is a power of 2, and that the first
3979      sequence of one bits fits within 'bits' bits.  */
3980   if ((mask >> bits) != 0 || bits != (bits & -bits))
3981     return false;
3982
3983   /* Check the sequence of one bits is repeated 64/bits times.  */
3984   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
3985 }
3986
3987 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
3988    Assumed precondition: VAL_IN Is not zero.  */
3989
3990 unsigned HOST_WIDE_INT
3991 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
3992 {
3993   int lowest_bit_set = ctz_hwi (val_in);
3994   int highest_bit_set = floor_log2 (val_in);
3995   gcc_assert (val_in != 0);
3996
3997   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
3998           (HOST_WIDE_INT_1U << lowest_bit_set));
3999 }
4000
4001 /* Create constant where bits outside of lowest bit set to highest bit set
4002    are set to 1.  */
4003
4004 unsigned HOST_WIDE_INT
4005 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4006 {
4007   return val_in | ~aarch64_and_split_imm1 (val_in);
4008 }
4009
4010 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4011
4012 bool
4013 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4014 {
4015   if (aarch64_bitmask_imm (val_in, mode))
4016     return false;
4017
4018   if (aarch64_move_imm (val_in, mode))
4019     return false;
4020
4021   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4022
4023   return aarch64_bitmask_imm (imm2, mode);
4024 }
4025
4026 /* Return true if val is an immediate that can be loaded into a
4027    register in a single instruction.  */
4028 bool
4029 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4030 {
4031   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4032     return 1;
4033   return aarch64_bitmask_imm (val, mode);
4034 }
4035
4036 static bool
4037 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4038 {
4039   rtx base, offset;
4040
4041   if (GET_CODE (x) == HIGH)
4042     return true;
4043
4044   split_const (x, &base, &offset);
4045   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4046     {
4047       if (aarch64_classify_symbol (base, offset)
4048           != SYMBOL_FORCE_TO_MEM)
4049         return true;
4050       else
4051         /* Avoid generating a 64-bit relocation in ILP32; leave
4052            to aarch64_expand_mov_immediate to handle it properly.  */
4053         return mode != ptr_mode;
4054     }
4055
4056   return aarch64_tls_referenced_p (x);
4057 }
4058
4059 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4060    The expansion for a table switch is quite expensive due to the number
4061    of instructions, the table lookup and hard to predict indirect jump.
4062    When optimizing for speed, and -O3 enabled, use the per-core tuning if
4063    set, otherwise use tables for > 16 cases as a tradeoff between size and
4064    performance.  When optimizing for size, use the default setting.  */
4065
4066 static unsigned int
4067 aarch64_case_values_threshold (void)
4068 {
4069   /* Use the specified limit for the number of cases before using jump
4070      tables at higher optimization levels.  */
4071   if (optimize > 2
4072       && selected_cpu->tune->max_case_values != 0)
4073     return selected_cpu->tune->max_case_values;
4074   else
4075     return optimize_size ? default_case_values_threshold () : 17;
4076 }
4077
4078 /* Return true if register REGNO is a valid index register.
4079    STRICT_P is true if REG_OK_STRICT is in effect.  */
4080
4081 bool
4082 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4083 {
4084   if (!HARD_REGISTER_NUM_P (regno))
4085     {
4086       if (!strict_p)
4087         return true;
4088
4089       if (!reg_renumber)
4090         return false;
4091
4092       regno = reg_renumber[regno];
4093     }
4094   return GP_REGNUM_P (regno);
4095 }
4096
4097 /* Return true if register REGNO is a valid base register for mode MODE.
4098    STRICT_P is true if REG_OK_STRICT is in effect.  */
4099
4100 bool
4101 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4102 {
4103   if (!HARD_REGISTER_NUM_P (regno))
4104     {
4105       if (!strict_p)
4106         return true;
4107
4108       if (!reg_renumber)
4109         return false;
4110
4111       regno = reg_renumber[regno];
4112     }
4113
4114   /* The fake registers will be eliminated to either the stack or
4115      hard frame pointer, both of which are usually valid base registers.
4116      Reload deals with the cases where the eliminated form isn't valid.  */
4117   return (GP_REGNUM_P (regno)
4118           || regno == SP_REGNUM
4119           || regno == FRAME_POINTER_REGNUM
4120           || regno == ARG_POINTER_REGNUM);
4121 }
4122
4123 /* Return true if X is a valid base register for mode MODE.
4124    STRICT_P is true if REG_OK_STRICT is in effect.  */
4125
4126 static bool
4127 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4128 {
4129   if (!strict_p && GET_CODE (x) == SUBREG)
4130     x = SUBREG_REG (x);
4131
4132   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4133 }
4134
4135 /* Return true if address offset is a valid index.  If it is, fill in INFO
4136    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
4137
4138 static bool
4139 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4140                         machine_mode mode, bool strict_p)
4141 {
4142   enum aarch64_address_type type;
4143   rtx index;
4144   int shift;
4145
4146   /* (reg:P) */
4147   if ((REG_P (x) || GET_CODE (x) == SUBREG)
4148       && GET_MODE (x) == Pmode)
4149     {
4150       type = ADDRESS_REG_REG;
4151       index = x;
4152       shift = 0;
4153     }
4154   /* (sign_extend:DI (reg:SI)) */
4155   else if ((GET_CODE (x) == SIGN_EXTEND
4156             || GET_CODE (x) == ZERO_EXTEND)
4157            && GET_MODE (x) == DImode
4158            && GET_MODE (XEXP (x, 0)) == SImode)
4159     {
4160       type = (GET_CODE (x) == SIGN_EXTEND)
4161         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4162       index = XEXP (x, 0);
4163       shift = 0;
4164     }
4165   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4166   else if (GET_CODE (x) == MULT
4167            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4168                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4169            && GET_MODE (XEXP (x, 0)) == DImode
4170            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4171            && CONST_INT_P (XEXP (x, 1)))
4172     {
4173       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4174         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4175       index = XEXP (XEXP (x, 0), 0);
4176       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4177     }
4178   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4179   else if (GET_CODE (x) == ASHIFT
4180            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4181                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4182            && GET_MODE (XEXP (x, 0)) == DImode
4183            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4184            && CONST_INT_P (XEXP (x, 1)))
4185     {
4186       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4187         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4188       index = XEXP (XEXP (x, 0), 0);
4189       shift = INTVAL (XEXP (x, 1));
4190     }
4191   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4192   else if ((GET_CODE (x) == SIGN_EXTRACT
4193             || GET_CODE (x) == ZERO_EXTRACT)
4194            && GET_MODE (x) == DImode
4195            && GET_CODE (XEXP (x, 0)) == MULT
4196            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4197            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4198     {
4199       type = (GET_CODE (x) == SIGN_EXTRACT)
4200         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4201       index = XEXP (XEXP (x, 0), 0);
4202       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4203       if (INTVAL (XEXP (x, 1)) != 32 + shift
4204           || INTVAL (XEXP (x, 2)) != 0)
4205         shift = -1;
4206     }
4207   /* (and:DI (mult:DI (reg:DI) (const_int scale))
4208      (const_int 0xffffffff<<shift)) */
4209   else if (GET_CODE (x) == AND
4210            && GET_MODE (x) == DImode
4211            && GET_CODE (XEXP (x, 0)) == MULT
4212            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4213            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4214            && CONST_INT_P (XEXP (x, 1)))
4215     {
4216       type = ADDRESS_REG_UXTW;
4217       index = XEXP (XEXP (x, 0), 0);
4218       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4219       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4220         shift = -1;
4221     }
4222   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4223   else if ((GET_CODE (x) == SIGN_EXTRACT
4224             || GET_CODE (x) == ZERO_EXTRACT)
4225            && GET_MODE (x) == DImode
4226            && GET_CODE (XEXP (x, 0)) == ASHIFT
4227            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4228            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4229     {
4230       type = (GET_CODE (x) == SIGN_EXTRACT)
4231         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4232       index = XEXP (XEXP (x, 0), 0);
4233       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4234       if (INTVAL (XEXP (x, 1)) != 32 + shift
4235           || INTVAL (XEXP (x, 2)) != 0)
4236         shift = -1;
4237     }
4238   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4239      (const_int 0xffffffff<<shift)) */
4240   else if (GET_CODE (x) == AND
4241            && GET_MODE (x) == DImode
4242            && GET_CODE (XEXP (x, 0)) == ASHIFT
4243            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4244            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4245            && CONST_INT_P (XEXP (x, 1)))
4246     {
4247       type = ADDRESS_REG_UXTW;
4248       index = XEXP (XEXP (x, 0), 0);
4249       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4250       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4251         shift = -1;
4252     }
4253   /* (mult:P (reg:P) (const_int scale)) */
4254   else if (GET_CODE (x) == MULT
4255            && GET_MODE (x) == Pmode
4256            && GET_MODE (XEXP (x, 0)) == Pmode
4257            && CONST_INT_P (XEXP (x, 1)))
4258     {
4259       type = ADDRESS_REG_REG;
4260       index = XEXP (x, 0);
4261       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4262     }
4263   /* (ashift:P (reg:P) (const_int shift)) */
4264   else if (GET_CODE (x) == ASHIFT
4265            && GET_MODE (x) == Pmode
4266            && GET_MODE (XEXP (x, 0)) == Pmode
4267            && CONST_INT_P (XEXP (x, 1)))
4268     {
4269       type = ADDRESS_REG_REG;
4270       index = XEXP (x, 0);
4271       shift = INTVAL (XEXP (x, 1));
4272     }
4273   else
4274     return false;
4275
4276   if (GET_CODE (index) == SUBREG)
4277     index = SUBREG_REG (index);
4278
4279   if ((shift == 0 ||
4280        (shift > 0 && shift <= 3
4281         && (1 << shift) == GET_MODE_SIZE (mode)))
4282       && REG_P (index)
4283       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4284     {
4285       info->type = type;
4286       info->offset = index;
4287       info->shift = shift;
4288       return true;
4289     }
4290
4291   return false;
4292 }
4293
4294 /* Return true if MODE is one of the modes for which we
4295    support LDP/STP operations.  */
4296
4297 static bool
4298 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4299 {
4300   return mode == SImode || mode == DImode
4301          || mode == SFmode || mode == DFmode
4302          || (aarch64_vector_mode_supported_p (mode)
4303              && GET_MODE_SIZE (mode) == 8);
4304 }
4305
4306 /* Return true if REGNO is a virtual pointer register, or an eliminable
4307    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
4308    include stack_pointer or hard_frame_pointer.  */
4309 static bool
4310 virt_or_elim_regno_p (unsigned regno)
4311 {
4312   return ((regno >= FIRST_VIRTUAL_REGISTER
4313            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4314           || regno == FRAME_POINTER_REGNUM
4315           || regno == ARG_POINTER_REGNUM);
4316 }
4317
4318 /* Return true if X is a valid address for machine mode MODE.  If it is,
4319    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
4320    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
4321
4322 static bool
4323 aarch64_classify_address (struct aarch64_address_info *info,
4324                           rtx x, machine_mode mode,
4325                           RTX_CODE outer_code, bool strict_p)
4326 {
4327   enum rtx_code code = GET_CODE (x);
4328   rtx op0, op1;
4329
4330   /* On BE, we use load/store pair for all large int mode load/stores.
4331      TI/TFmode may also use a load/store pair.  */
4332   bool load_store_pair_p = (outer_code == PARALLEL
4333                             || mode == TImode
4334                             || mode == TFmode
4335                             || (BYTES_BIG_ENDIAN
4336                                 && aarch64_vect_struct_mode_p (mode)));
4337
4338   bool allow_reg_index_p =
4339     !load_store_pair_p
4340     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4341     && !aarch64_vect_struct_mode_p (mode);
4342
4343   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4344      REG addressing.  */
4345   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4346       && (code != POST_INC && code != REG))
4347     return false;
4348
4349   switch (code)
4350     {
4351     case REG:
4352     case SUBREG:
4353       info->type = ADDRESS_REG_IMM;
4354       info->base = x;
4355       info->offset = const0_rtx;
4356       return aarch64_base_register_rtx_p (x, strict_p);
4357
4358     case PLUS:
4359       op0 = XEXP (x, 0);
4360       op1 = XEXP (x, 1);
4361
4362       if (! strict_p
4363           && REG_P (op0)
4364           && virt_or_elim_regno_p (REGNO (op0))
4365           && CONST_INT_P (op1))
4366         {
4367           info->type = ADDRESS_REG_IMM;
4368           info->base = op0;
4369           info->offset = op1;
4370
4371           return true;
4372         }
4373
4374       if (GET_MODE_SIZE (mode) != 0
4375           && CONST_INT_P (op1)
4376           && aarch64_base_register_rtx_p (op0, strict_p))
4377         {
4378           HOST_WIDE_INT offset = INTVAL (op1);
4379
4380           info->type = ADDRESS_REG_IMM;
4381           info->base = op0;
4382           info->offset = op1;
4383
4384           /* TImode and TFmode values are allowed in both pairs of X
4385              registers and individual Q registers.  The available
4386              address modes are:
4387              X,X: 7-bit signed scaled offset
4388              Q:   9-bit signed offset
4389              We conservatively require an offset representable in either mode.
4390              When performing the check for pairs of X registers i.e.  LDP/STP
4391              pass down DImode since that is the natural size of the LDP/STP
4392              instruction memory accesses.  */
4393           if (mode == TImode || mode == TFmode)
4394             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4395                     && (offset_9bit_signed_unscaled_p (mode, offset)
4396                         || offset_12bit_unsigned_scaled_p (mode, offset)));
4397
4398           /* A 7bit offset check because OImode will emit a ldp/stp
4399              instruction (only big endian will get here).
4400              For ldp/stp instructions, the offset is scaled for the size of a
4401              single element of the pair.  */
4402           if (mode == OImode)
4403             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4404
4405           /* Three 9/12 bit offsets checks because CImode will emit three
4406              ldr/str instructions (only big endian will get here).  */
4407           if (mode == CImode)
4408             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4409                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4410                         || offset_12bit_unsigned_scaled_p (V16QImode,
4411                                                            offset + 32)));
4412
4413           /* Two 7bit offsets checks because XImode will emit two ldp/stp
4414              instructions (only big endian will get here).  */
4415           if (mode == XImode)
4416             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4417                     && aarch64_offset_7bit_signed_scaled_p (TImode,
4418                                                             offset + 32));
4419
4420           if (load_store_pair_p)
4421             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4422                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4423           else
4424             return (offset_9bit_signed_unscaled_p (mode, offset)
4425                     || offset_12bit_unsigned_scaled_p (mode, offset));
4426         }
4427
4428       if (allow_reg_index_p)
4429         {
4430           /* Look for base + (scaled/extended) index register.  */
4431           if (aarch64_base_register_rtx_p (op0, strict_p)
4432               && aarch64_classify_index (info, op1, mode, strict_p))
4433             {
4434               info->base = op0;
4435               return true;
4436             }
4437           if (aarch64_base_register_rtx_p (op1, strict_p)
4438               && aarch64_classify_index (info, op0, mode, strict_p))
4439             {
4440               info->base = op1;
4441               return true;
4442             }
4443         }
4444
4445       return false;
4446
4447     case POST_INC:
4448     case POST_DEC:
4449     case PRE_INC:
4450     case PRE_DEC:
4451       info->type = ADDRESS_REG_WB;
4452       info->base = XEXP (x, 0);
4453       info->offset = NULL_RTX;
4454       return aarch64_base_register_rtx_p (info->base, strict_p);
4455
4456     case POST_MODIFY:
4457     case PRE_MODIFY:
4458       info->type = ADDRESS_REG_WB;
4459       info->base = XEXP (x, 0);
4460       if (GET_CODE (XEXP (x, 1)) == PLUS
4461           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4462           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4463           && aarch64_base_register_rtx_p (info->base, strict_p))
4464         {
4465           HOST_WIDE_INT offset;
4466           info->offset = XEXP (XEXP (x, 1), 1);
4467           offset = INTVAL (info->offset);
4468
4469           /* TImode and TFmode values are allowed in both pairs of X
4470              registers and individual Q registers.  The available
4471              address modes are:
4472              X,X: 7-bit signed scaled offset
4473              Q:   9-bit signed offset
4474              We conservatively require an offset representable in either mode.
4475            */
4476           if (mode == TImode || mode == TFmode)
4477             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4478                     && offset_9bit_signed_unscaled_p (mode, offset));
4479
4480           if (load_store_pair_p)
4481             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4482                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4483           else
4484             return offset_9bit_signed_unscaled_p (mode, offset);
4485         }
4486       return false;
4487
4488     case CONST:
4489     case SYMBOL_REF:
4490     case LABEL_REF:
4491       /* load literal: pc-relative constant pool entry.  Only supported
4492          for SI mode or larger.  */
4493       info->type = ADDRESS_SYMBOLIC;
4494
4495       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4496         {
4497           rtx sym, addend;
4498
4499           split_const (x, &sym, &addend);
4500           return ((GET_CODE (sym) == LABEL_REF
4501                    || (GET_CODE (sym) == SYMBOL_REF
4502                        && CONSTANT_POOL_ADDRESS_P (sym)
4503                        && aarch64_pcrelative_literal_loads)));
4504         }
4505       return false;
4506
4507     case LO_SUM:
4508       info->type = ADDRESS_LO_SUM;
4509       info->base = XEXP (x, 0);
4510       info->offset = XEXP (x, 1);
4511       if (allow_reg_index_p
4512           && aarch64_base_register_rtx_p (info->base, strict_p))
4513         {
4514           rtx sym, offs;
4515           split_const (info->offset, &sym, &offs);
4516           if (GET_CODE (sym) == SYMBOL_REF
4517               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4518             {
4519               /* The symbol and offset must be aligned to the access size.  */
4520               unsigned int align;
4521               unsigned int ref_size;
4522
4523               if (CONSTANT_POOL_ADDRESS_P (sym))
4524                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4525               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4526                 {
4527                   tree exp = SYMBOL_REF_DECL (sym);
4528                   align = TYPE_ALIGN (TREE_TYPE (exp));
4529                   align = CONSTANT_ALIGNMENT (exp, align);
4530                 }
4531               else if (SYMBOL_REF_DECL (sym))
4532                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4533               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4534                        && SYMBOL_REF_BLOCK (sym) != NULL)
4535                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4536               else
4537                 align = BITS_PER_UNIT;
4538
4539               ref_size = GET_MODE_SIZE (mode);
4540               if (ref_size == 0)
4541                 ref_size = GET_MODE_SIZE (DImode);
4542
4543               return ((INTVAL (offs) & (ref_size - 1)) == 0
4544                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4545             }
4546         }
4547       return false;
4548
4549     default:
4550       return false;
4551     }
4552 }
4553
4554 bool
4555 aarch64_symbolic_address_p (rtx x)
4556 {
4557   rtx offset;
4558
4559   split_const (x, &x, &offset);
4560   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4561 }
4562
4563 /* Classify the base of symbolic expression X.  */
4564
4565 enum aarch64_symbol_type
4566 aarch64_classify_symbolic_expression (rtx x)
4567 {
4568   rtx offset;
4569
4570   split_const (x, &x, &offset);
4571   return aarch64_classify_symbol (x, offset);
4572 }
4573
4574
4575 /* Return TRUE if X is a legitimate address for accessing memory in
4576    mode MODE.  */
4577 static bool
4578 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4579 {
4580   struct aarch64_address_info addr;
4581
4582   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4583 }
4584
4585 /* Return TRUE if X is a legitimate address for accessing memory in
4586    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4587    pair operation.  */
4588 bool
4589 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4590                               RTX_CODE outer_code, bool strict_p)
4591 {
4592   struct aarch64_address_info addr;
4593
4594   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4595 }
4596
4597 /* Split an out-of-range address displacement into a base and offset.
4598    Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4599    to increase opportunities for sharing the base address of different sizes.
4600    For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
4601 static bool
4602 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4603 {
4604   HOST_WIDE_INT offset = INTVAL (*disp);
4605   HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4606
4607   if (mode == TImode || mode == TFmode
4608       || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4609     base = (offset + 0x100) & ~0x1ff;
4610
4611   *off = GEN_INT (base);
4612   *disp = GEN_INT (offset - base);
4613   return true;
4614 }
4615
4616 /* Return TRUE if rtx X is immediate constant 0.0 */
4617 bool
4618 aarch64_float_const_zero_rtx_p (rtx x)
4619 {
4620   if (GET_MODE (x) == VOIDmode)
4621     return false;
4622
4623   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4624     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4625   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4626 }
4627
4628 /* Return the fixed registers used for condition codes.  */
4629
4630 static bool
4631 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4632 {
4633   *p1 = CC_REGNUM;
4634   *p2 = INVALID_REGNUM;
4635   return true;
4636 }
4637
4638 /* Emit call insn with PAT and do aarch64-specific handling.  */
4639
4640 void
4641 aarch64_emit_call_insn (rtx pat)
4642 {
4643   rtx insn = emit_call_insn (pat);
4644
4645   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4646   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4647   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4648 }
4649
4650 machine_mode
4651 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4652 {
4653   /* All floating point compares return CCFP if it is an equality
4654      comparison, and CCFPE otherwise.  */
4655   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4656     {
4657       switch (code)
4658         {
4659         case EQ:
4660         case NE:
4661         case UNORDERED:
4662         case ORDERED:
4663         case UNLT:
4664         case UNLE:
4665         case UNGT:
4666         case UNGE:
4667         case UNEQ:
4668         case LTGT:
4669           return CCFPmode;
4670
4671         case LT:
4672         case LE:
4673         case GT:
4674         case GE:
4675           return CCFPEmode;
4676
4677         default:
4678           gcc_unreachable ();
4679         }
4680     }
4681
4682   /* Equality comparisons of short modes against zero can be performed
4683      using the TST instruction with the appropriate bitmask.  */
4684   if (y == const0_rtx && REG_P (x)
4685       && (code == EQ || code == NE)
4686       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4687     return CC_NZmode;
4688
4689   /* Similarly, comparisons of zero_extends from shorter modes can
4690      be performed using an ANDS with an immediate mask.  */
4691   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4692       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4693       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4694       && (code == EQ || code == NE))
4695     return CC_NZmode;
4696
4697   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4698       && y == const0_rtx
4699       && (code == EQ || code == NE || code == LT || code == GE)
4700       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4701           || GET_CODE (x) == NEG
4702           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4703               && CONST_INT_P (XEXP (x, 2)))))
4704     return CC_NZmode;
4705
4706   /* A compare with a shifted operand.  Because of canonicalization,
4707      the comparison will have to be swapped when we emit the assembly
4708      code.  */
4709   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4710       && (REG_P (y) || GET_CODE (y) == SUBREG)
4711       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4712           || GET_CODE (x) == LSHIFTRT
4713           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4714     return CC_SWPmode;
4715
4716   /* Similarly for a negated operand, but we can only do this for
4717      equalities.  */
4718   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4719       && (REG_P (y) || GET_CODE (y) == SUBREG)
4720       && (code == EQ || code == NE)
4721       && GET_CODE (x) == NEG)
4722     return CC_Zmode;
4723
4724   /* A test for unsigned overflow.  */
4725   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4726       && code == NE
4727       && GET_CODE (x) == PLUS
4728       && GET_CODE (y) == ZERO_EXTEND)
4729     return CC_Cmode;
4730
4731   /* For everything else, return CCmode.  */
4732   return CCmode;
4733 }
4734
4735 static int
4736 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4737
4738 int
4739 aarch64_get_condition_code (rtx x)
4740 {
4741   machine_mode mode = GET_MODE (XEXP (x, 0));
4742   enum rtx_code comp_code = GET_CODE (x);
4743
4744   if (GET_MODE_CLASS (mode) != MODE_CC)
4745     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4746   return aarch64_get_condition_code_1 (mode, comp_code);
4747 }
4748
4749 static int
4750 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4751 {
4752   switch (mode)
4753     {
4754     case CCFPmode:
4755     case CCFPEmode:
4756       switch (comp_code)
4757         {
4758         case GE: return AARCH64_GE;
4759         case GT: return AARCH64_GT;
4760         case LE: return AARCH64_LS;
4761         case LT: return AARCH64_MI;
4762         case NE: return AARCH64_NE;
4763         case EQ: return AARCH64_EQ;
4764         case ORDERED: return AARCH64_VC;
4765         case UNORDERED: return AARCH64_VS;
4766         case UNLT: return AARCH64_LT;
4767         case UNLE: return AARCH64_LE;
4768         case UNGT: return AARCH64_HI;
4769         case UNGE: return AARCH64_PL;
4770         default: return -1;
4771         }
4772       break;
4773
4774     case CCmode:
4775       switch (comp_code)
4776         {
4777         case NE: return AARCH64_NE;
4778         case EQ: return AARCH64_EQ;
4779         case GE: return AARCH64_GE;
4780         case GT: return AARCH64_GT;
4781         case LE: return AARCH64_LE;
4782         case LT: return AARCH64_LT;
4783         case GEU: return AARCH64_CS;
4784         case GTU: return AARCH64_HI;
4785         case LEU: return AARCH64_LS;
4786         case LTU: return AARCH64_CC;
4787         default: return -1;
4788         }
4789       break;
4790
4791     case CC_SWPmode:
4792       switch (comp_code)
4793         {
4794         case NE: return AARCH64_NE;
4795         case EQ: return AARCH64_EQ;
4796         case GE: return AARCH64_LE;
4797         case GT: return AARCH64_LT;
4798         case LE: return AARCH64_GE;
4799         case LT: return AARCH64_GT;
4800         case GEU: return AARCH64_LS;
4801         case GTU: return AARCH64_CC;
4802         case LEU: return AARCH64_CS;
4803         case LTU: return AARCH64_HI;
4804         default: return -1;
4805         }
4806       break;
4807
4808     case CC_NZmode:
4809       switch (comp_code)
4810         {
4811         case NE: return AARCH64_NE;
4812         case EQ: return AARCH64_EQ;
4813         case GE: return AARCH64_PL;
4814         case LT: return AARCH64_MI;
4815         default: return -1;
4816         }
4817       break;
4818
4819     case CC_Zmode:
4820       switch (comp_code)
4821         {
4822         case NE: return AARCH64_NE;
4823         case EQ: return AARCH64_EQ;
4824         default: return -1;
4825         }
4826       break;
4827
4828     case CC_Cmode:
4829       switch (comp_code)
4830         {
4831         case NE: return AARCH64_CS;
4832         case EQ: return AARCH64_CC;
4833         default: return -1;
4834         }
4835       break;
4836
4837     default:
4838       return -1;
4839     }
4840
4841   return -1;
4842 }
4843
4844 bool
4845 aarch64_const_vec_all_same_in_range_p (rtx x,
4846                                   HOST_WIDE_INT minval,
4847                                   HOST_WIDE_INT maxval)
4848 {
4849   HOST_WIDE_INT firstval;
4850   int count, i;
4851
4852   if (GET_CODE (x) != CONST_VECTOR
4853       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4854     return false;
4855
4856   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4857   if (firstval < minval || firstval > maxval)
4858     return false;
4859
4860   count = CONST_VECTOR_NUNITS (x);
4861   for (i = 1; i < count; i++)
4862     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4863       return false;
4864
4865   return true;
4866 }
4867
4868 bool
4869 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4870 {
4871   return aarch64_const_vec_all_same_in_range_p (x, val, val);
4872 }
4873
4874
4875 /* N Z C V.  */
4876 #define AARCH64_CC_V 1
4877 #define AARCH64_CC_C (1 << 1)
4878 #define AARCH64_CC_Z (1 << 2)
4879 #define AARCH64_CC_N (1 << 3)
4880
4881 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
4882 static const int aarch64_nzcv_codes[] =
4883 {
4884   0,            /* EQ, Z == 1.  */
4885   AARCH64_CC_Z, /* NE, Z == 0.  */
4886   0,            /* CS, C == 1.  */
4887   AARCH64_CC_C, /* CC, C == 0.  */
4888   0,            /* MI, N == 1.  */
4889   AARCH64_CC_N, /* PL, N == 0.  */
4890   0,            /* VS, V == 1.  */
4891   AARCH64_CC_V, /* VC, V == 0.  */
4892   0,            /* HI, C ==1 && Z == 0.  */
4893   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
4894   AARCH64_CC_V, /* GE, N == V.  */
4895   0,            /* LT, N != V.  */
4896   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
4897   0,            /* LE, !(Z == 0 && N == V).  */
4898   0,            /* AL, Any.  */
4899   0             /* NV, Any.  */
4900 };
4901
4902 static void
4903 aarch64_print_operand (FILE *f, rtx x, int code)
4904 {
4905   switch (code)
4906     {
4907     /* An integer or symbol address without a preceding # sign.  */
4908     case 'c':
4909       switch (GET_CODE (x))
4910         {
4911         case CONST_INT:
4912           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4913           break;
4914
4915         case SYMBOL_REF:
4916           output_addr_const (f, x);
4917           break;
4918
4919         case CONST:
4920           if (GET_CODE (XEXP (x, 0)) == PLUS
4921               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4922             {
4923               output_addr_const (f, x);
4924               break;
4925             }
4926           /* Fall through.  */
4927
4928         default:
4929           output_operand_lossage ("Unsupported operand for code '%c'", code);
4930         }
4931       break;
4932
4933     case 'e':
4934       /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w.  */
4935       {
4936         int n;
4937
4938         if (!CONST_INT_P (x)
4939             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4940           {
4941             output_operand_lossage ("invalid operand for '%%%c'", code);
4942             return;
4943           }
4944
4945         switch (n)
4946           {
4947           case 3:
4948             fputc ('b', f);
4949             break;
4950           case 4:
4951             fputc ('h', f);
4952             break;
4953           case 5:
4954             fputc ('w', f);
4955             break;
4956           default:
4957             output_operand_lossage ("invalid operand for '%%%c'", code);
4958             return;
4959           }
4960       }
4961       break;
4962
4963     case 'p':
4964       {
4965         int n;
4966
4967         /* Print N such that 2^N == X.  */
4968         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4969           {
4970             output_operand_lossage ("invalid operand for '%%%c'", code);
4971             return;
4972           }
4973
4974         asm_fprintf (f, "%d", n);
4975       }
4976       break;
4977
4978     case 'P':
4979       /* Print the number of non-zero bits in X (a const_int).  */
4980       if (!CONST_INT_P (x))
4981         {
4982           output_operand_lossage ("invalid operand for '%%%c'", code);
4983           return;
4984         }
4985
4986       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4987       break;
4988
4989     case 'H':
4990       /* Print the higher numbered register of a pair (TImode) of regs.  */
4991       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4992         {
4993           output_operand_lossage ("invalid operand for '%%%c'", code);
4994           return;
4995         }
4996
4997       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4998       break;
4999
5000     case 'M':
5001     case 'm':
5002       {
5003         int cond_code;
5004         /* Print a condition (eq, ne, etc) or its inverse.  */
5005
5006         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
5007         if (x == const_true_rtx)
5008           {
5009             if (code == 'M')
5010               fputs ("nv", f);
5011             return;
5012           }
5013
5014         if (!COMPARISON_P (x))
5015           {
5016             output_operand_lossage ("invalid operand for '%%%c'", code);
5017             return;
5018           }
5019
5020         cond_code = aarch64_get_condition_code (x);
5021         gcc_assert (cond_code >= 0);
5022         if (code == 'M')
5023           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5024         fputs (aarch64_condition_codes[cond_code], f);
5025       }
5026       break;
5027
5028     case 'b':
5029     case 'h':
5030     case 's':
5031     case 'd':
5032     case 'q':
5033       /* Print a scalar FP/SIMD register name.  */
5034       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5035         {
5036           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5037           return;
5038         }
5039       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5040       break;
5041
5042     case 'S':
5043     case 'T':
5044     case 'U':
5045     case 'V':
5046       /* Print the first FP/SIMD register name in a list.  */
5047       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5048         {
5049           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5050           return;
5051         }
5052       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5053       break;
5054
5055     case 'R':
5056       /* Print a scalar FP/SIMD register name + 1.  */
5057       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5058         {
5059           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5060           return;
5061         }
5062       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5063       break;
5064
5065     case 'X':
5066       /* Print bottom 16 bits of integer constant in hex.  */
5067       if (!CONST_INT_P (x))
5068         {
5069           output_operand_lossage ("invalid operand for '%%%c'", code);
5070           return;
5071         }
5072       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5073       break;
5074
5075     case 'w':
5076     case 'x':
5077       /* Print a general register name or the zero register (32-bit or
5078          64-bit).  */
5079       if (x == const0_rtx
5080           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5081         {
5082           asm_fprintf (f, "%czr", code);
5083           break;
5084         }
5085
5086       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5087         {
5088           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5089           break;
5090         }
5091
5092       if (REG_P (x) && REGNO (x) == SP_REGNUM)
5093         {
5094           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5095           break;
5096         }
5097
5098       /* Fall through */
5099
5100     case 0:
5101       /* Print a normal operand, if it's a general register, then we
5102          assume DImode.  */
5103       if (x == NULL)
5104         {
5105           output_operand_lossage ("missing operand");
5106           return;
5107         }
5108
5109       switch (GET_CODE (x))
5110         {
5111         case REG:
5112           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5113           break;
5114
5115         case MEM:
5116           output_address (GET_MODE (x), XEXP (x, 0));
5117           break;
5118
5119         case CONST:
5120         case LABEL_REF:
5121         case SYMBOL_REF:
5122           output_addr_const (asm_out_file, x);
5123           break;
5124
5125         case CONST_INT:
5126           asm_fprintf (f, "%wd", INTVAL (x));
5127           break;
5128
5129         case CONST_VECTOR:
5130           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5131             {
5132               gcc_assert (
5133                   aarch64_const_vec_all_same_in_range_p (x,
5134                                                          HOST_WIDE_INT_MIN,
5135                                                          HOST_WIDE_INT_MAX));
5136               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5137             }
5138           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5139             {
5140               fputc ('0', f);
5141             }
5142           else
5143             gcc_unreachable ();
5144           break;
5145
5146         case CONST_DOUBLE:
5147           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5148              be getting CONST_DOUBLEs holding integers.  */
5149           gcc_assert (GET_MODE (x) != VOIDmode);
5150           if (aarch64_float_const_zero_rtx_p (x))
5151             {
5152               fputc ('0', f);
5153               break;
5154             }
5155           else if (aarch64_float_const_representable_p (x))
5156             {
5157 #define buf_size 20
5158               char float_buf[buf_size] = {'\0'};
5159               real_to_decimal_for_mode (float_buf,
5160                                         CONST_DOUBLE_REAL_VALUE (x),
5161                                         buf_size, buf_size,
5162                                         1, GET_MODE (x));
5163               asm_fprintf (asm_out_file, "%s", float_buf);
5164               break;
5165 #undef buf_size
5166             }
5167           output_operand_lossage ("invalid constant");
5168           return;
5169         default:
5170           output_operand_lossage ("invalid operand");
5171           return;
5172         }
5173       break;
5174
5175     case 'A':
5176       if (GET_CODE (x) == HIGH)
5177         x = XEXP (x, 0);
5178
5179       switch (aarch64_classify_symbolic_expression (x))
5180         {
5181         case SYMBOL_SMALL_GOT_4G:
5182           asm_fprintf (asm_out_file, ":got:");
5183           break;
5184
5185         case SYMBOL_SMALL_TLSGD:
5186           asm_fprintf (asm_out_file, ":tlsgd:");
5187           break;
5188
5189         case SYMBOL_SMALL_TLSDESC:
5190           asm_fprintf (asm_out_file, ":tlsdesc:");
5191           break;
5192
5193         case SYMBOL_SMALL_TLSIE:
5194           asm_fprintf (asm_out_file, ":gottprel:");
5195           break;
5196
5197         case SYMBOL_TLSLE24:
5198           asm_fprintf (asm_out_file, ":tprel:");
5199           break;
5200
5201         case SYMBOL_TINY_GOT:
5202           gcc_unreachable ();
5203           break;
5204
5205         default:
5206           break;
5207         }
5208       output_addr_const (asm_out_file, x);
5209       break;
5210
5211     case 'L':
5212       switch (aarch64_classify_symbolic_expression (x))
5213         {
5214         case SYMBOL_SMALL_GOT_4G:
5215           asm_fprintf (asm_out_file, ":lo12:");
5216           break;
5217
5218         case SYMBOL_SMALL_TLSGD:
5219           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5220           break;
5221
5222         case SYMBOL_SMALL_TLSDESC:
5223           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5224           break;
5225
5226         case SYMBOL_SMALL_TLSIE:
5227           asm_fprintf (asm_out_file, ":gottprel_lo12:");
5228           break;
5229
5230         case SYMBOL_TLSLE12:
5231           asm_fprintf (asm_out_file, ":tprel_lo12:");
5232           break;
5233
5234         case SYMBOL_TLSLE24:
5235           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5236           break;
5237
5238         case SYMBOL_TINY_GOT:
5239           asm_fprintf (asm_out_file, ":got:");
5240           break;
5241
5242         case SYMBOL_TINY_TLSIE:
5243           asm_fprintf (asm_out_file, ":gottprel:");
5244           break;
5245
5246         default:
5247           break;
5248         }
5249       output_addr_const (asm_out_file, x);
5250       break;
5251
5252     case 'G':
5253
5254       switch (aarch64_classify_symbolic_expression (x))
5255         {
5256         case SYMBOL_TLSLE24:
5257           asm_fprintf (asm_out_file, ":tprel_hi12:");
5258           break;
5259         default:
5260           break;
5261         }
5262       output_addr_const (asm_out_file, x);
5263       break;
5264
5265     case 'k':
5266       {
5267         HOST_WIDE_INT cond_code;
5268         /* Print nzcv.  */
5269
5270         if (!CONST_INT_P (x))
5271           {
5272             output_operand_lossage ("invalid operand for '%%%c'", code);
5273             return;
5274           }
5275
5276         cond_code = INTVAL (x);
5277         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5278         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5279       }
5280       break;
5281
5282     default:
5283       output_operand_lossage ("invalid operand prefix '%%%c'", code);
5284       return;
5285     }
5286 }
5287
5288 static void
5289 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5290 {
5291   struct aarch64_address_info addr;
5292
5293   if (aarch64_classify_address (&addr, x, mode, MEM, true))
5294     switch (addr.type)
5295       {
5296       case ADDRESS_REG_IMM:
5297         if (addr.offset == const0_rtx)
5298           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5299         else
5300           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5301                        INTVAL (addr.offset));
5302         return;
5303
5304       case ADDRESS_REG_REG:
5305         if (addr.shift == 0)
5306           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5307                        reg_names [REGNO (addr.offset)]);
5308         else
5309           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5310                        reg_names [REGNO (addr.offset)], addr.shift);
5311         return;
5312
5313       case ADDRESS_REG_UXTW:
5314         if (addr.shift == 0)
5315           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5316                        REGNO (addr.offset) - R0_REGNUM);
5317         else
5318           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5319                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5320         return;
5321
5322       case ADDRESS_REG_SXTW:
5323         if (addr.shift == 0)
5324           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5325                        REGNO (addr.offset) - R0_REGNUM);
5326         else
5327           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5328                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5329         return;
5330
5331       case ADDRESS_REG_WB:
5332         switch (GET_CODE (x))
5333           {
5334           case PRE_INC:
5335             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5336                          GET_MODE_SIZE (mode));
5337             return;
5338           case POST_INC:
5339             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5340                          GET_MODE_SIZE (mode));
5341             return;
5342           case PRE_DEC:
5343             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5344                          GET_MODE_SIZE (mode));
5345             return;
5346           case POST_DEC:
5347             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5348                          GET_MODE_SIZE (mode));
5349             return;
5350           case PRE_MODIFY:
5351             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5352                          INTVAL (addr.offset));
5353             return;
5354           case POST_MODIFY:
5355             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5356                          INTVAL (addr.offset));
5357             return;
5358           default:
5359             break;
5360           }
5361         break;
5362
5363       case ADDRESS_LO_SUM:
5364         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5365         output_addr_const (f, addr.offset);
5366         asm_fprintf (f, "]");
5367         return;
5368
5369       case ADDRESS_SYMBOLIC:
5370         break;
5371       }
5372
5373   output_addr_const (f, x);
5374 }
5375
5376 bool
5377 aarch64_label_mentioned_p (rtx x)
5378 {
5379   const char *fmt;
5380   int i;
5381
5382   if (GET_CODE (x) == LABEL_REF)
5383     return true;
5384
5385   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5386      referencing instruction, but they are constant offsets, not
5387      symbols.  */
5388   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5389     return false;
5390
5391   fmt = GET_RTX_FORMAT (GET_CODE (x));
5392   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5393     {
5394       if (fmt[i] == 'E')
5395         {
5396           int j;
5397
5398           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5399             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5400               return 1;
5401         }
5402       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5403         return 1;
5404     }
5405
5406   return 0;
5407 }
5408
5409 /* Implement REGNO_REG_CLASS.  */
5410
5411 enum reg_class
5412 aarch64_regno_regclass (unsigned regno)
5413 {
5414   if (GP_REGNUM_P (regno))
5415     return GENERAL_REGS;
5416
5417   if (regno == SP_REGNUM)
5418     return STACK_REG;
5419
5420   if (regno == FRAME_POINTER_REGNUM
5421       || regno == ARG_POINTER_REGNUM)
5422     return POINTER_REGS;
5423
5424   if (FP_REGNUM_P (regno))
5425     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5426
5427   return NO_REGS;
5428 }
5429
5430 static rtx
5431 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5432 {
5433   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5434      where mask is selected by alignment and size of the offset.
5435      We try to pick as large a range for the offset as possible to
5436      maximize the chance of a CSE.  However, for aligned addresses
5437      we limit the range to 4k so that structures with different sized
5438      elements are likely to use the same base.  We need to be careful
5439      not to split a CONST for some forms of address expression, otherwise
5440      it will generate sub-optimal code.  */
5441
5442   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5443     {
5444       rtx base = XEXP (x, 0);
5445       rtx offset_rtx = XEXP (x, 1);
5446       HOST_WIDE_INT offset = INTVAL (offset_rtx);
5447
5448       if (GET_CODE (base) == PLUS)
5449         {
5450           rtx op0 = XEXP (base, 0);
5451           rtx op1 = XEXP (base, 1);
5452
5453           /* Force any scaling into a temp for CSE.  */
5454           op0 = force_reg (Pmode, op0);
5455           op1 = force_reg (Pmode, op1);
5456
5457           /* Let the pointer register be in op0.  */
5458           if (REG_POINTER (op1))
5459             std::swap (op0, op1);
5460
5461           /* If the pointer is virtual or frame related, then we know that
5462              virtual register instantiation or register elimination is going
5463              to apply a second constant.  We want the two constants folded
5464              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
5465           if (virt_or_elim_regno_p (REGNO (op0)))
5466             {
5467               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5468                                    NULL_RTX, true, OPTAB_DIRECT);
5469               return gen_rtx_PLUS (Pmode, base, op1);
5470             }
5471
5472           /* Otherwise, in order to encourage CSE (and thence loop strength
5473              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
5474           base = expand_binop (Pmode, add_optab, op0, op1,
5475                                NULL_RTX, true, OPTAB_DIRECT);
5476           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5477         }
5478
5479       /* Does it look like we'll need a 16-byte load/store-pair operation?  */
5480       HOST_WIDE_INT base_offset;
5481       if (GET_MODE_SIZE (mode) > 16)
5482         base_offset = (offset + 0x400) & ~0x7f0;
5483       /* For offsets aren't a multiple of the access size, the limit is
5484          -256...255.  */
5485       else if (offset & (GET_MODE_SIZE (mode) - 1))
5486         {
5487           base_offset = (offset + 0x100) & ~0x1ff;
5488
5489           /* BLKmode typically uses LDP of X-registers.  */
5490           if (mode == BLKmode)
5491             base_offset = (offset + 512) & ~0x3ff;
5492         }
5493       /* Small negative offsets are supported.  */
5494       else if (IN_RANGE (offset, -256, 0))
5495         base_offset = 0;
5496       else if (mode == TImode || mode == TFmode)
5497         base_offset = (offset + 0x100) & ~0x1ff;
5498       /* Use 12-bit offset by access size.  */
5499       else
5500         base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5501
5502       if (base_offset != 0)
5503         {
5504           base = plus_constant (Pmode, base, base_offset);
5505           base = force_operand (base, NULL_RTX);
5506           return plus_constant (Pmode, base, offset - base_offset);
5507         }
5508     }
5509
5510   return x;
5511 }
5512
5513 /* Return the reload icode required for a constant pool in mode.  */
5514 static enum insn_code
5515 aarch64_constant_pool_reload_icode (machine_mode mode)
5516 {
5517   switch (mode)
5518     {
5519     case SFmode:
5520       return CODE_FOR_aarch64_reload_movcpsfdi;
5521
5522     case DFmode:
5523       return CODE_FOR_aarch64_reload_movcpdfdi;
5524
5525     case TFmode:
5526       return CODE_FOR_aarch64_reload_movcptfdi;
5527
5528     case V8QImode:
5529       return CODE_FOR_aarch64_reload_movcpv8qidi;
5530
5531     case V16QImode:
5532       return CODE_FOR_aarch64_reload_movcpv16qidi;
5533
5534     case V4HImode:
5535       return CODE_FOR_aarch64_reload_movcpv4hidi;
5536
5537     case V8HImode:
5538       return CODE_FOR_aarch64_reload_movcpv8hidi;
5539
5540     case V2SImode:
5541       return CODE_FOR_aarch64_reload_movcpv2sidi;
5542
5543     case V4SImode:
5544       return CODE_FOR_aarch64_reload_movcpv4sidi;
5545
5546     case V2DImode:
5547       return CODE_FOR_aarch64_reload_movcpv2didi;
5548
5549     case V2DFmode:
5550       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5551
5552     default:
5553       gcc_unreachable ();
5554     }
5555
5556   gcc_unreachable ();
5557 }
5558 static reg_class_t
5559 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5560                           reg_class_t rclass,
5561                           machine_mode mode,
5562                           secondary_reload_info *sri)
5563 {
5564
5565   /* If we have to disable direct literal pool loads and stores because the
5566      function is too big, then we need a scratch register.  */
5567   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5568       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5569           || targetm.vector_mode_supported_p (GET_MODE (x)))
5570       && !aarch64_pcrelative_literal_loads)
5571     {
5572       sri->icode = aarch64_constant_pool_reload_icode (mode);
5573       return NO_REGS;
5574     }
5575
5576   /* Without the TARGET_SIMD instructions we cannot move a Q register
5577      to a Q register directly.  We need a scratch.  */
5578   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5579       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5580       && reg_class_subset_p (rclass, FP_REGS))
5581     {
5582       if (mode == TFmode)
5583         sri->icode = CODE_FOR_aarch64_reload_movtf;
5584       else if (mode == TImode)
5585         sri->icode = CODE_FOR_aarch64_reload_movti;
5586       return NO_REGS;
5587     }
5588
5589   /* A TFmode or TImode memory access should be handled via an FP_REGS
5590      because AArch64 has richer addressing modes for LDR/STR instructions
5591      than LDP/STP instructions.  */
5592   if (TARGET_FLOAT && rclass == GENERAL_REGS
5593       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5594     return FP_REGS;
5595
5596   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5597       return GENERAL_REGS;
5598
5599   return NO_REGS;
5600 }
5601
5602 static bool
5603 aarch64_can_eliminate (const int from, const int to)
5604 {
5605   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5606      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5607
5608   if (frame_pointer_needed)
5609     {
5610       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5611         return true;
5612       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5613         return false;
5614       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5615           && !cfun->calls_alloca)
5616         return true;
5617       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5618         return true;
5619
5620       return false;
5621     }
5622   else
5623     {
5624       /* If we decided that we didn't need a leaf frame pointer but then used
5625          LR in the function, then we'll want a frame pointer after all, so
5626          prevent this elimination to ensure a frame pointer is used.  */
5627       if (to == STACK_POINTER_REGNUM
5628           && flag_omit_leaf_frame_pointer
5629           && df_regs_ever_live_p (LR_REGNUM))
5630         return false;
5631     }
5632
5633   return true;
5634 }
5635
5636 HOST_WIDE_INT
5637 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5638 {
5639   aarch64_layout_frame ();
5640
5641   if (to == HARD_FRAME_POINTER_REGNUM)
5642     {
5643       if (from == ARG_POINTER_REGNUM)
5644         return cfun->machine->frame.hard_fp_offset;
5645
5646       if (from == FRAME_POINTER_REGNUM)
5647         return cfun->machine->frame.hard_fp_offset
5648                - cfun->machine->frame.locals_offset;
5649     }
5650
5651   if (to == STACK_POINTER_REGNUM)
5652     {
5653       if (from == FRAME_POINTER_REGNUM)
5654           return cfun->machine->frame.frame_size
5655                  - cfun->machine->frame.locals_offset;
5656     }
5657
5658   return cfun->machine->frame.frame_size;
5659 }
5660
5661 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5662    previous frame.  */
5663
5664 rtx
5665 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5666 {
5667   if (count != 0)
5668     return const0_rtx;
5669   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5670 }
5671
5672
5673 static void
5674 aarch64_asm_trampoline_template (FILE *f)
5675 {
5676   if (TARGET_ILP32)
5677     {
5678       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5679       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5680     }
5681   else
5682     {
5683       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5684       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5685     }
5686   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5687   assemble_aligned_integer (4, const0_rtx);
5688   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5689   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5690 }
5691
5692 static void
5693 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5694 {
5695   rtx fnaddr, mem, a_tramp;
5696   const int tramp_code_sz = 16;
5697
5698   /* Don't need to copy the trailing D-words, we fill those in below.  */
5699   emit_block_move (m_tramp, assemble_trampoline_template (),
5700                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5701   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5702   fnaddr = XEXP (DECL_RTL (fndecl), 0);
5703   if (GET_MODE (fnaddr) != ptr_mode)
5704     fnaddr = convert_memory_address (ptr_mode, fnaddr);
5705   emit_move_insn (mem, fnaddr);
5706
5707   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5708   emit_move_insn (mem, chain_value);
5709
5710   /* XXX We should really define a "clear_cache" pattern and use
5711      gen_clear_cache().  */
5712   a_tramp = XEXP (m_tramp, 0);
5713   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5714                      LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5715                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5716                      ptr_mode);
5717 }
5718
5719 static unsigned char
5720 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5721 {
5722   switch (regclass)
5723     {
5724     case CALLER_SAVE_REGS:
5725     case POINTER_REGS:
5726     case GENERAL_REGS:
5727     case ALL_REGS:
5728     case FP_REGS:
5729     case FP_LO_REGS:
5730       return
5731         aarch64_vector_mode_p (mode)
5732           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5733           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5734     case STACK_REG:
5735       return 1;
5736
5737     case NO_REGS:
5738       return 0;
5739
5740     default:
5741       break;
5742     }
5743   gcc_unreachable ();
5744 }
5745
5746 static reg_class_t
5747 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5748 {
5749   if (regclass == POINTER_REGS)
5750     return GENERAL_REGS;
5751
5752   if (regclass == STACK_REG)
5753     {
5754       if (REG_P(x)
5755           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5756           return regclass;
5757
5758       return NO_REGS;
5759     }
5760
5761   /* If it's an integer immediate that MOVI can't handle, then
5762      FP_REGS is not an option, so we return NO_REGS instead.  */
5763   if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5764       && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5765     return NO_REGS;
5766
5767   /* Register eliminiation can result in a request for
5768      SP+constant->FP_REGS.  We cannot support such operations which
5769      use SP as source and an FP_REG as destination, so reject out
5770      right now.  */
5771   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5772     {
5773       rtx lhs = XEXP (x, 0);
5774
5775       /* Look through a possible SUBREG introduced by ILP32.  */
5776       if (GET_CODE (lhs) == SUBREG)
5777         lhs = SUBREG_REG (lhs);
5778
5779       gcc_assert (REG_P (lhs));
5780       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5781                                       POINTER_REGS));
5782       return NO_REGS;
5783     }
5784
5785   return regclass;
5786 }
5787
5788 void
5789 aarch64_asm_output_labelref (FILE* f, const char *name)
5790 {
5791   asm_fprintf (f, "%U%s", name);
5792 }
5793
5794 static void
5795 aarch64_elf_asm_constructor (rtx symbol, int priority)
5796 {
5797   if (priority == DEFAULT_INIT_PRIORITY)
5798     default_ctor_section_asm_out_constructor (symbol, priority);
5799   else
5800     {
5801       section *s;
5802       /* While priority is known to be in range [0, 65535], so 18 bytes
5803          would be enough, the compiler might not know that.  To avoid
5804          -Wformat-truncation false positive, use a larger size.  */
5805       char buf[23];
5806       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5807       s = get_section (buf, SECTION_WRITE, NULL);
5808       switch_to_section (s);
5809       assemble_align (POINTER_SIZE);
5810       assemble_aligned_integer (POINTER_BYTES, symbol);
5811     }
5812 }
5813
5814 static void
5815 aarch64_elf_asm_destructor (rtx symbol, int priority)
5816 {
5817   if (priority == DEFAULT_INIT_PRIORITY)
5818     default_dtor_section_asm_out_destructor (symbol, priority);
5819   else
5820     {
5821       section *s;
5822       /* While priority is known to be in range [0, 65535], so 18 bytes
5823          would be enough, the compiler might not know that.  To avoid
5824          -Wformat-truncation false positive, use a larger size.  */
5825       char buf[23];
5826       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5827       s = get_section (buf, SECTION_WRITE, NULL);
5828       switch_to_section (s);
5829       assemble_align (POINTER_SIZE);
5830       assemble_aligned_integer (POINTER_BYTES, symbol);
5831     }
5832 }
5833
5834 const char*
5835 aarch64_output_casesi (rtx *operands)
5836 {
5837   char buf[100];
5838   char label[100];
5839   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5840   int index;
5841   static const char *const patterns[4][2] =
5842   {
5843     {
5844       "ldrb\t%w3, [%0,%w1,uxtw]",
5845       "add\t%3, %4, %w3, sxtb #2"
5846     },
5847     {
5848       "ldrh\t%w3, [%0,%w1,uxtw #1]",
5849       "add\t%3, %4, %w3, sxth #2"
5850     },
5851     {
5852       "ldr\t%w3, [%0,%w1,uxtw #2]",
5853       "add\t%3, %4, %w3, sxtw #2"
5854     },
5855     /* We assume that DImode is only generated when not optimizing and
5856        that we don't really need 64-bit address offsets.  That would
5857        imply an object file with 8GB of code in a single function!  */
5858     {
5859       "ldr\t%w3, [%0,%w1,uxtw #2]",
5860       "add\t%3, %4, %w3, sxtw #2"
5861     }
5862   };
5863
5864   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5865
5866   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5867
5868   gcc_assert (index >= 0 && index <= 3);
5869
5870   /* Need to implement table size reduction, by chaning the code below.  */
5871   output_asm_insn (patterns[index][0], operands);
5872   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5873   snprintf (buf, sizeof (buf),
5874             "adr\t%%4, %s", targetm.strip_name_encoding (label));
5875   output_asm_insn (buf, operands);
5876   output_asm_insn (patterns[index][1], operands);
5877   output_asm_insn ("br\t%3", operands);
5878   assemble_label (asm_out_file, label);
5879   return "";
5880 }
5881
5882
5883 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5884    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5885    operator.  */
5886
5887 int
5888 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5889 {
5890   if (shift >= 0 && shift <= 3)
5891     {
5892       int size;
5893       for (size = 8; size <= 32; size *= 2)
5894         {
5895           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5896           if (mask == bits << shift)
5897             return size;
5898         }
5899     }
5900   return 0;
5901 }
5902
5903 /* Constant pools are per function only when PC relative
5904    literal loads are true or we are in the large memory
5905    model.  */
5906
5907 static inline bool
5908 aarch64_can_use_per_function_literal_pools_p (void)
5909 {
5910   return (aarch64_pcrelative_literal_loads
5911           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5912 }
5913
5914 static bool
5915 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5916 {
5917   /* Fixme:: In an ideal world this would work similar
5918      to the logic in aarch64_select_rtx_section but this
5919      breaks bootstrap in gcc go.  For now we workaround
5920      this by returning false here.  */
5921   return false;
5922 }
5923
5924 /* Select appropriate section for constants depending
5925    on where we place literal pools.  */
5926
5927 static section *
5928 aarch64_select_rtx_section (machine_mode mode,
5929                             rtx x,
5930                             unsigned HOST_WIDE_INT align)
5931 {
5932   if (aarch64_can_use_per_function_literal_pools_p ())
5933     return function_section (current_function_decl);
5934
5935   return default_elf_select_rtx_section (mode, x, align);
5936 }
5937
5938 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
5939 void
5940 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5941                                   HOST_WIDE_INT offset)
5942 {
5943   /* When using per-function literal pools, we must ensure that any code
5944      section is aligned to the minimal instruction length, lest we get
5945      errors from the assembler re "unaligned instructions".  */
5946   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5947     ASM_OUTPUT_ALIGN (f, 2);
5948 }
5949
5950 /* Costs.  */
5951
5952 /* Helper function for rtx cost calculation.  Strip a shift expression
5953    from X.  Returns the inner operand if successful, or the original
5954    expression on failure.  */
5955 static rtx
5956 aarch64_strip_shift (rtx x)
5957 {
5958   rtx op = x;
5959
5960   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5961      we can convert both to ROR during final output.  */
5962   if ((GET_CODE (op) == ASHIFT
5963        || GET_CODE (op) == ASHIFTRT
5964        || GET_CODE (op) == LSHIFTRT
5965        || GET_CODE (op) == ROTATERT
5966        || GET_CODE (op) == ROTATE)
5967       && CONST_INT_P (XEXP (op, 1)))
5968     return XEXP (op, 0);
5969
5970   if (GET_CODE (op) == MULT
5971       && CONST_INT_P (XEXP (op, 1))
5972       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5973     return XEXP (op, 0);
5974
5975   return x;
5976 }
5977
5978 /* Helper function for rtx cost calculation.  Strip an extend
5979    expression from X.  Returns the inner operand if successful, or the
5980    original expression on failure.  We deal with a number of possible
5981    canonicalization variations here.  */
5982 static rtx
5983 aarch64_strip_extend (rtx x)
5984 {
5985   rtx op = x;
5986
5987   /* Zero and sign extraction of a widened value.  */
5988   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5989       && XEXP (op, 2) == const0_rtx
5990       && GET_CODE (XEXP (op, 0)) == MULT
5991       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5992                                          XEXP (op, 1)))
5993     return XEXP (XEXP (op, 0), 0);
5994
5995   /* It can also be represented (for zero-extend) as an AND with an
5996      immediate.  */
5997   if (GET_CODE (op) == AND
5998       && GET_CODE (XEXP (op, 0)) == MULT
5999       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6000       && CONST_INT_P (XEXP (op, 1))
6001       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6002                            INTVAL (XEXP (op, 1))) != 0)
6003     return XEXP (XEXP (op, 0), 0);
6004
6005   /* Now handle extended register, as this may also have an optional
6006      left shift by 1..4.  */
6007   if (GET_CODE (op) == ASHIFT
6008       && CONST_INT_P (XEXP (op, 1))
6009       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6010     op = XEXP (op, 0);
6011
6012   if (GET_CODE (op) == ZERO_EXTEND
6013       || GET_CODE (op) == SIGN_EXTEND)
6014     op = XEXP (op, 0);
6015
6016   if (op != x)
6017     return op;
6018
6019   return x;
6020 }
6021
6022 /* Return true iff CODE is a shift supported in combination
6023    with arithmetic instructions.  */
6024
6025 static bool
6026 aarch64_shift_p (enum rtx_code code)
6027 {
6028   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6029 }
6030
6031 /* Helper function for rtx cost calculation.  Calculate the cost of
6032    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6033    Return the calculated cost of the expression, recursing manually in to
6034    operands where needed.  */
6035
6036 static int
6037 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6038 {
6039   rtx op0, op1;
6040   const struct cpu_cost_table *extra_cost
6041     = aarch64_tune_params.insn_extra_cost;
6042   int cost = 0;
6043   bool compound_p = (outer == PLUS || outer == MINUS);
6044   machine_mode mode = GET_MODE (x);
6045
6046   gcc_checking_assert (code == MULT);
6047
6048   op0 = XEXP (x, 0);
6049   op1 = XEXP (x, 1);
6050
6051   if (VECTOR_MODE_P (mode))
6052     mode = GET_MODE_INNER (mode);
6053
6054   /* Integer multiply/fma.  */
6055   if (GET_MODE_CLASS (mode) == MODE_INT)
6056     {
6057       /* The multiply will be canonicalized as a shift, cost it as such.  */
6058       if (aarch64_shift_p (GET_CODE (x))
6059           || (CONST_INT_P (op1)
6060               && exact_log2 (INTVAL (op1)) > 0))
6061         {
6062           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6063                            || GET_CODE (op0) == SIGN_EXTEND;
6064           if (speed)
6065             {
6066               if (compound_p)
6067                 {
6068                   if (REG_P (op1))
6069                     /* ARITH + shift-by-register.  */
6070                     cost += extra_cost->alu.arith_shift_reg;
6071                   else if (is_extend)
6072                     /* ARITH + extended register.  We don't have a cost field
6073                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
6074                     cost += extra_cost->alu.extend_arith;
6075                   else
6076                     /* ARITH + shift-by-immediate.  */
6077                     cost += extra_cost->alu.arith_shift;
6078                 }
6079               else
6080                 /* LSL (immediate).  */
6081                 cost += extra_cost->alu.shift;
6082
6083             }
6084           /* Strip extends as we will have costed them in the case above.  */
6085           if (is_extend)
6086             op0 = aarch64_strip_extend (op0);
6087
6088           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6089
6090           return cost;
6091         }
6092
6093       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
6094          compound and let the below cases handle it.  After all, MNEG is a
6095          special-case alias of MSUB.  */
6096       if (GET_CODE (op0) == NEG)
6097         {
6098           op0 = XEXP (op0, 0);
6099           compound_p = true;
6100         }
6101
6102       /* Integer multiplies or FMAs have zero/sign extending variants.  */
6103       if ((GET_CODE (op0) == ZERO_EXTEND
6104            && GET_CODE (op1) == ZERO_EXTEND)
6105           || (GET_CODE (op0) == SIGN_EXTEND
6106               && GET_CODE (op1) == SIGN_EXTEND))
6107         {
6108           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6109           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6110
6111           if (speed)
6112             {
6113               if (compound_p)
6114                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
6115                 cost += extra_cost->mult[0].extend_add;
6116               else
6117                 /* MUL/SMULL/UMULL.  */
6118                 cost += extra_cost->mult[0].extend;
6119             }
6120
6121           return cost;
6122         }
6123
6124       /* This is either an integer multiply or a MADD.  In both cases
6125          we want to recurse and cost the operands.  */
6126       cost += rtx_cost (op0, mode, MULT, 0, speed);
6127       cost += rtx_cost (op1, mode, MULT, 1, speed);
6128
6129       if (speed)
6130         {
6131           if (compound_p)
6132             /* MADD/MSUB.  */
6133             cost += extra_cost->mult[mode == DImode].add;
6134           else
6135             /* MUL.  */
6136             cost += extra_cost->mult[mode == DImode].simple;
6137         }
6138
6139       return cost;
6140     }
6141   else
6142     {
6143       if (speed)
6144         {
6145           /* Floating-point FMA/FMUL can also support negations of the
6146              operands, unless the rounding mode is upward or downward in
6147              which case FNMUL is different than FMUL with operand negation.  */
6148           bool neg0 = GET_CODE (op0) == NEG;
6149           bool neg1 = GET_CODE (op1) == NEG;
6150           if (compound_p || !flag_rounding_math || (neg0 && neg1))
6151             {
6152               if (neg0)
6153                 op0 = XEXP (op0, 0);
6154               if (neg1)
6155                 op1 = XEXP (op1, 0);
6156             }
6157
6158           if (compound_p)
6159             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
6160             cost += extra_cost->fp[mode == DFmode].fma;
6161           else
6162             /* FMUL/FNMUL.  */
6163             cost += extra_cost->fp[mode == DFmode].mult;
6164         }
6165
6166       cost += rtx_cost (op0, mode, MULT, 0, speed);
6167       cost += rtx_cost (op1, mode, MULT, 1, speed);
6168       return cost;
6169     }
6170 }
6171
6172 static int
6173 aarch64_address_cost (rtx x,
6174                       machine_mode mode,
6175                       addr_space_t as ATTRIBUTE_UNUSED,
6176                       bool speed)
6177 {
6178   enum rtx_code c = GET_CODE (x);
6179   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6180   struct aarch64_address_info info;
6181   int cost = 0;
6182   info.shift = 0;
6183
6184   if (!aarch64_classify_address (&info, x, mode, c, false))
6185     {
6186       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6187         {
6188           /* This is a CONST or SYMBOL ref which will be split
6189              in a different way depending on the code model in use.
6190              Cost it through the generic infrastructure.  */
6191           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6192           /* Divide through by the cost of one instruction to
6193              bring it to the same units as the address costs.  */
6194           cost_symbol_ref /= COSTS_N_INSNS (1);
6195           /* The cost is then the cost of preparing the address,
6196              followed by an immediate (possibly 0) offset.  */
6197           return cost_symbol_ref + addr_cost->imm_offset;
6198         }
6199       else
6200         {
6201           /* This is most likely a jump table from a case
6202              statement.  */
6203           return addr_cost->register_offset;
6204         }
6205     }
6206
6207   switch (info.type)
6208     {
6209       case ADDRESS_LO_SUM:
6210       case ADDRESS_SYMBOLIC:
6211       case ADDRESS_REG_IMM:
6212         cost += addr_cost->imm_offset;
6213         break;
6214
6215       case ADDRESS_REG_WB:
6216         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6217           cost += addr_cost->pre_modify;
6218         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6219           cost += addr_cost->post_modify;
6220         else
6221           gcc_unreachable ();
6222
6223         break;
6224
6225       case ADDRESS_REG_REG:
6226         cost += addr_cost->register_offset;
6227         break;
6228
6229       case ADDRESS_REG_SXTW:
6230         cost += addr_cost->register_sextend;
6231         break;
6232
6233       case ADDRESS_REG_UXTW:
6234         cost += addr_cost->register_zextend;
6235         break;
6236
6237       default:
6238         gcc_unreachable ();
6239     }
6240
6241
6242   if (info.shift > 0)
6243     {
6244       /* For the sake of calculating the cost of the shifted register
6245          component, we can treat same sized modes in the same way.  */
6246       switch (GET_MODE_BITSIZE (mode))
6247         {
6248           case 16:
6249             cost += addr_cost->addr_scale_costs.hi;
6250             break;
6251
6252           case 32:
6253             cost += addr_cost->addr_scale_costs.si;
6254             break;
6255
6256           case 64:
6257             cost += addr_cost->addr_scale_costs.di;
6258             break;
6259
6260           /* We can't tell, or this is a 128-bit vector.  */
6261           default:
6262             cost += addr_cost->addr_scale_costs.ti;
6263             break;
6264         }
6265     }
6266
6267   return cost;
6268 }
6269
6270 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
6271    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
6272    to be taken.  */
6273
6274 int
6275 aarch64_branch_cost (bool speed_p, bool predictable_p)
6276 {
6277   /* When optimizing for speed, use the cost of unpredictable branches.  */
6278   const struct cpu_branch_cost *branch_costs =
6279     aarch64_tune_params.branch_costs;
6280
6281   if (!speed_p || predictable_p)
6282     return branch_costs->predictable;
6283   else
6284     return branch_costs->unpredictable;
6285 }
6286
6287 /* Return true if the RTX X in mode MODE is a zero or sign extract
6288    usable in an ADD or SUB (extended register) instruction.  */
6289 static bool
6290 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6291 {
6292   /* Catch add with a sign extract.
6293      This is add_<optab><mode>_multp2.  */
6294   if (GET_CODE (x) == SIGN_EXTRACT
6295       || GET_CODE (x) == ZERO_EXTRACT)
6296     {
6297       rtx op0 = XEXP (x, 0);
6298       rtx op1 = XEXP (x, 1);
6299       rtx op2 = XEXP (x, 2);
6300
6301       if (GET_CODE (op0) == MULT
6302           && CONST_INT_P (op1)
6303           && op2 == const0_rtx
6304           && CONST_INT_P (XEXP (op0, 1))
6305           && aarch64_is_extend_from_extract (mode,
6306                                              XEXP (op0, 1),
6307                                              op1))
6308         {
6309           return true;
6310         }
6311     }
6312   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6313      No shift.  */
6314   else if (GET_CODE (x) == SIGN_EXTEND
6315            || GET_CODE (x) == ZERO_EXTEND)
6316     return REG_P (XEXP (x, 0));
6317
6318   return false;
6319 }
6320
6321 static bool
6322 aarch64_frint_unspec_p (unsigned int u)
6323 {
6324   switch (u)
6325     {
6326       case UNSPEC_FRINTZ:
6327       case UNSPEC_FRINTP:
6328       case UNSPEC_FRINTM:
6329       case UNSPEC_FRINTA:
6330       case UNSPEC_FRINTN:
6331       case UNSPEC_FRINTX:
6332       case UNSPEC_FRINTI:
6333         return true;
6334
6335       default:
6336         return false;
6337     }
6338 }
6339
6340 /* Return true iff X is an rtx that will match an extr instruction
6341    i.e. as described in the *extr<mode>5_insn family of patterns.
6342    OP0 and OP1 will be set to the operands of the shifts involved
6343    on success and will be NULL_RTX otherwise.  */
6344
6345 static bool
6346 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6347 {
6348   rtx op0, op1;
6349   machine_mode mode = GET_MODE (x);
6350
6351   *res_op0 = NULL_RTX;
6352   *res_op1 = NULL_RTX;
6353
6354   if (GET_CODE (x) != IOR)
6355     return false;
6356
6357   op0 = XEXP (x, 0);
6358   op1 = XEXP (x, 1);
6359
6360   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6361       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6362     {
6363      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6364       if (GET_CODE (op1) == ASHIFT)
6365         std::swap (op0, op1);
6366
6367       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6368         return false;
6369
6370       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6371       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6372
6373       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6374           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6375         {
6376           *res_op0 = XEXP (op0, 0);
6377           *res_op1 = XEXP (op1, 0);
6378           return true;
6379         }
6380     }
6381
6382   return false;
6383 }
6384
6385 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6386    storing it in *COST.  Result is true if the total cost of the operation
6387    has now been calculated.  */
6388 static bool
6389 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6390 {
6391   rtx inner;
6392   rtx comparator;
6393   enum rtx_code cmpcode;
6394
6395   if (COMPARISON_P (op0))
6396     {
6397       inner = XEXP (op0, 0);
6398       comparator = XEXP (op0, 1);
6399       cmpcode = GET_CODE (op0);
6400     }
6401   else
6402     {
6403       inner = op0;
6404       comparator = const0_rtx;
6405       cmpcode = NE;
6406     }
6407
6408   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6409     {
6410       /* Conditional branch.  */
6411       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6412         return true;
6413       else
6414         {
6415           if (cmpcode == NE || cmpcode == EQ)
6416             {
6417               if (comparator == const0_rtx)
6418                 {
6419                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6420                   if (GET_CODE (inner) == ZERO_EXTRACT)
6421                     /* TBZ/TBNZ.  */
6422                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6423                                        ZERO_EXTRACT, 0, speed);
6424                   else
6425                     /* CBZ/CBNZ.  */
6426                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6427
6428                 return true;
6429               }
6430             }
6431           else if (cmpcode == LT || cmpcode == GE)
6432             {
6433               /* TBZ/TBNZ.  */
6434               if (comparator == const0_rtx)
6435                 return true;
6436             }
6437         }
6438     }
6439   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6440     {
6441       /* CCMP.  */
6442       if (GET_CODE (op1) == COMPARE)
6443         {
6444           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
6445           if (XEXP (op1, 1) == const0_rtx)
6446             *cost += 1;
6447           if (speed)
6448             {
6449               machine_mode mode = GET_MODE (XEXP (op1, 0));
6450               const struct cpu_cost_table *extra_cost
6451                 = aarch64_tune_params.insn_extra_cost;
6452
6453               if (GET_MODE_CLASS (mode) == MODE_INT)
6454                 *cost += extra_cost->alu.arith;
6455               else
6456                 *cost += extra_cost->fp[mode == DFmode].compare;
6457             }
6458           return true;
6459         }
6460
6461       /* It's a conditional operation based on the status flags,
6462          so it must be some flavor of CSEL.  */
6463
6464       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6465       if (GET_CODE (op1) == NEG
6466           || GET_CODE (op1) == NOT
6467           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6468         op1 = XEXP (op1, 0);
6469       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6470         {
6471           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6472           op1 = XEXP (op1, 0);
6473           op2 = XEXP (op2, 0);
6474         }
6475
6476       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6477       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6478       return true;
6479     }
6480
6481   /* We don't know what this is, cost all operands.  */
6482   return false;
6483 }
6484
6485 /* Check whether X is a bitfield operation of the form shift + extend that
6486    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6487    operand to which the bitfield operation is applied.  Otherwise return
6488    NULL_RTX.  */
6489
6490 static rtx
6491 aarch64_extend_bitfield_pattern_p (rtx x)
6492 {
6493   rtx_code outer_code = GET_CODE (x);
6494   machine_mode outer_mode = GET_MODE (x);
6495
6496   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6497       && outer_mode != SImode && outer_mode != DImode)
6498     return NULL_RTX;
6499
6500   rtx inner = XEXP (x, 0);
6501   rtx_code inner_code = GET_CODE (inner);
6502   machine_mode inner_mode = GET_MODE (inner);
6503   rtx op = NULL_RTX;
6504
6505   switch (inner_code)
6506     {
6507       case ASHIFT:
6508         if (CONST_INT_P (XEXP (inner, 1))
6509             && (inner_mode == QImode || inner_mode == HImode))
6510           op = XEXP (inner, 0);
6511         break;
6512       case LSHIFTRT:
6513         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6514             && (inner_mode == QImode || inner_mode == HImode))
6515           op = XEXP (inner, 0);
6516         break;
6517       case ASHIFTRT:
6518         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6519             && (inner_mode == QImode || inner_mode == HImode))
6520           op = XEXP (inner, 0);
6521         break;
6522       default:
6523         break;
6524     }
6525
6526   return op;
6527 }
6528
6529 /* Return true if the mask and a shift amount from an RTX of the form
6530    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6531    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
6532
6533 bool
6534 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6535 {
6536   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6537          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6538          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6539          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6540 }
6541
6542 /* Calculate the cost of calculating X, storing it in *COST.  Result
6543    is true if the total cost of the operation has now been calculated.  */
6544 static bool
6545 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6546                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6547 {
6548   rtx op0, op1, op2;
6549   const struct cpu_cost_table *extra_cost
6550     = aarch64_tune_params.insn_extra_cost;
6551   int code = GET_CODE (x);
6552
6553   /* By default, assume that everything has equivalent cost to the
6554      cheapest instruction.  Any additional costs are applied as a delta
6555      above this default.  */
6556   *cost = COSTS_N_INSNS (1);
6557
6558   switch (code)
6559     {
6560     case SET:
6561       /* The cost depends entirely on the operands to SET.  */
6562       *cost = 0;
6563       op0 = SET_DEST (x);
6564       op1 = SET_SRC (x);
6565
6566       switch (GET_CODE (op0))
6567         {
6568         case MEM:
6569           if (speed)
6570             {
6571               rtx address = XEXP (op0, 0);
6572               if (VECTOR_MODE_P (mode))
6573                 *cost += extra_cost->ldst.storev;
6574               else if (GET_MODE_CLASS (mode) == MODE_INT)
6575                 *cost += extra_cost->ldst.store;
6576               else if (mode == SFmode)
6577                 *cost += extra_cost->ldst.storef;
6578               else if (mode == DFmode)
6579                 *cost += extra_cost->ldst.stored;
6580
6581               *cost +=
6582                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6583                                                      0, speed));
6584             }
6585
6586           *cost += rtx_cost (op1, mode, SET, 1, speed);
6587           return true;
6588
6589         case SUBREG:
6590           if (! REG_P (SUBREG_REG (op0)))
6591             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6592
6593           /* Fall through.  */
6594         case REG:
6595           /* The cost is one per vector-register copied.  */
6596           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6597             {
6598               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6599                               / GET_MODE_SIZE (V4SImode);
6600               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6601             }
6602           /* const0_rtx is in general free, but we will use an
6603              instruction to set a register to 0.  */
6604           else if (REG_P (op1) || op1 == const0_rtx)
6605             {
6606               /* The cost is 1 per register copied.  */
6607               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6608                               / UNITS_PER_WORD;
6609               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6610             }
6611           else
6612             /* Cost is just the cost of the RHS of the set.  */
6613             *cost += rtx_cost (op1, mode, SET, 1, speed);
6614           return true;
6615
6616         case ZERO_EXTRACT:
6617         case SIGN_EXTRACT:
6618           /* Bit-field insertion.  Strip any redundant widening of
6619              the RHS to meet the width of the target.  */
6620           if (GET_CODE (op1) == SUBREG)
6621             op1 = SUBREG_REG (op1);
6622           if ((GET_CODE (op1) == ZERO_EXTEND
6623                || GET_CODE (op1) == SIGN_EXTEND)
6624               && CONST_INT_P (XEXP (op0, 1))
6625               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6626                   >= INTVAL (XEXP (op0, 1))))
6627             op1 = XEXP (op1, 0);
6628
6629           if (CONST_INT_P (op1))
6630             {
6631               /* MOV immediate is assumed to always be cheap.  */
6632               *cost = COSTS_N_INSNS (1);
6633             }
6634           else
6635             {
6636               /* BFM.  */
6637               if (speed)
6638                 *cost += extra_cost->alu.bfi;
6639               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6640             }
6641
6642           return true;
6643
6644         default:
6645           /* We can't make sense of this, assume default cost.  */
6646           *cost = COSTS_N_INSNS (1);
6647           return false;
6648         }
6649       return false;
6650
6651     case CONST_INT:
6652       /* If an instruction can incorporate a constant within the
6653          instruction, the instruction's expression avoids calling
6654          rtx_cost() on the constant.  If rtx_cost() is called on a
6655          constant, then it is usually because the constant must be
6656          moved into a register by one or more instructions.
6657
6658          The exception is constant 0, which can be expressed
6659          as XZR/WZR and is therefore free.  The exception to this is
6660          if we have (set (reg) (const0_rtx)) in which case we must cost
6661          the move.  However, we can catch that when we cost the SET, so
6662          we don't need to consider that here.  */
6663       if (x == const0_rtx)
6664         *cost = 0;
6665       else
6666         {
6667           /* To an approximation, building any other constant is
6668              proportionally expensive to the number of instructions
6669              required to build that constant.  This is true whether we
6670              are compiling for SPEED or otherwise.  */
6671           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6672                                  (NULL_RTX, x, false, mode));
6673         }
6674       return true;
6675
6676     case CONST_DOUBLE:
6677       if (speed)
6678         {
6679           /* mov[df,sf]_aarch64.  */
6680           if (aarch64_float_const_representable_p (x))
6681             /* FMOV (scalar immediate).  */
6682             *cost += extra_cost->fp[mode == DFmode].fpconst;
6683           else if (!aarch64_float_const_zero_rtx_p (x))
6684             {
6685               /* This will be a load from memory.  */
6686               if (mode == DFmode)
6687                 *cost += extra_cost->ldst.loadd;
6688               else
6689                 *cost += extra_cost->ldst.loadf;
6690             }
6691           else
6692             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
6693                or MOV v0.s[0], wzr - neither of which are modeled by the
6694                cost tables.  Just use the default cost.  */
6695             {
6696             }
6697         }
6698
6699       return true;
6700
6701     case MEM:
6702       if (speed)
6703         {
6704           /* For loads we want the base cost of a load, plus an
6705              approximation for the additional cost of the addressing
6706              mode.  */
6707           rtx address = XEXP (x, 0);
6708           if (VECTOR_MODE_P (mode))
6709             *cost += extra_cost->ldst.loadv;
6710           else if (GET_MODE_CLASS (mode) == MODE_INT)
6711             *cost += extra_cost->ldst.load;
6712           else if (mode == SFmode)
6713             *cost += extra_cost->ldst.loadf;
6714           else if (mode == DFmode)
6715             *cost += extra_cost->ldst.loadd;
6716
6717           *cost +=
6718                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6719                                                      0, speed));
6720         }
6721
6722       return true;
6723
6724     case NEG:
6725       op0 = XEXP (x, 0);
6726
6727       if (VECTOR_MODE_P (mode))
6728         {
6729           if (speed)
6730             {
6731               /* FNEG.  */
6732               *cost += extra_cost->vect.alu;
6733             }
6734           return false;
6735         }
6736
6737       if (GET_MODE_CLASS (mode) == MODE_INT)
6738         {
6739           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6740               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6741             {
6742               /* CSETM.  */
6743               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6744               return true;
6745             }
6746
6747           /* Cost this as SUB wzr, X.  */
6748           op0 = CONST0_RTX (mode);
6749           op1 = XEXP (x, 0);
6750           goto cost_minus;
6751         }
6752
6753       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6754         {
6755           /* Support (neg(fma...)) as a single instruction only if
6756              sign of zeros is unimportant.  This matches the decision
6757              making in aarch64.md.  */
6758           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6759             {
6760               /* FNMADD.  */
6761               *cost = rtx_cost (op0, mode, NEG, 0, speed);
6762               return true;
6763             }
6764           if (GET_CODE (op0) == MULT)
6765             {
6766               /* FNMUL.  */
6767               *cost = rtx_cost (op0, mode, NEG, 0, speed);
6768               return true;
6769             }
6770           if (speed)
6771             /* FNEG.  */
6772             *cost += extra_cost->fp[mode == DFmode].neg;
6773           return false;
6774         }
6775
6776       return false;
6777
6778     case CLRSB:
6779     case CLZ:
6780       if (speed)
6781         {
6782           if (VECTOR_MODE_P (mode))
6783             *cost += extra_cost->vect.alu;
6784           else
6785             *cost += extra_cost->alu.clz;
6786         }
6787
6788       return false;
6789
6790     case COMPARE:
6791       op0 = XEXP (x, 0);
6792       op1 = XEXP (x, 1);
6793
6794       if (op1 == const0_rtx
6795           && GET_CODE (op0) == AND)
6796         {
6797           x = op0;
6798           mode = GET_MODE (op0);
6799           goto cost_logic;
6800         }
6801
6802       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6803         {
6804           /* TODO: A write to the CC flags possibly costs extra, this
6805              needs encoding in the cost tables.  */
6806
6807           mode = GET_MODE (op0);
6808           /* ANDS.  */
6809           if (GET_CODE (op0) == AND)
6810             {
6811               x = op0;
6812               goto cost_logic;
6813             }
6814
6815           if (GET_CODE (op0) == PLUS)
6816             {
6817               /* ADDS (and CMN alias).  */
6818               x = op0;
6819               goto cost_plus;
6820             }
6821
6822           if (GET_CODE (op0) == MINUS)
6823             {
6824               /* SUBS.  */
6825               x = op0;
6826               goto cost_minus;
6827             }
6828
6829           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6830               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6831               && CONST_INT_P (XEXP (op0, 2)))
6832             {
6833               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6834                  Handle it here directly rather than going to cost_logic
6835                  since we know the immediate generated for the TST is valid
6836                  so we can avoid creating an intermediate rtx for it only
6837                  for costing purposes.  */
6838               if (speed)
6839                 *cost += extra_cost->alu.logical;
6840
6841               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6842                                  ZERO_EXTRACT, 0, speed);
6843               return true;
6844             }
6845
6846           if (GET_CODE (op1) == NEG)
6847             {
6848               /* CMN.  */
6849               if (speed)
6850                 *cost += extra_cost->alu.arith;
6851
6852               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6853               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6854               return true;
6855             }
6856
6857           /* CMP.
6858
6859              Compare can freely swap the order of operands, and
6860              canonicalization puts the more complex operation first.
6861              But the integer MINUS logic expects the shift/extend
6862              operation in op1.  */
6863           if (! (REG_P (op0)
6864                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6865           {
6866             op0 = XEXP (x, 1);
6867             op1 = XEXP (x, 0);
6868           }
6869           goto cost_minus;
6870         }
6871
6872       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6873         {
6874           /* FCMP.  */
6875           if (speed)
6876             *cost += extra_cost->fp[mode == DFmode].compare;
6877
6878           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6879             {
6880               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6881               /* FCMP supports constant 0.0 for no extra cost. */
6882               return true;
6883             }
6884           return false;
6885         }
6886
6887       if (VECTOR_MODE_P (mode))
6888         {
6889           /* Vector compare.  */
6890           if (speed)
6891             *cost += extra_cost->vect.alu;
6892
6893           if (aarch64_float_const_zero_rtx_p (op1))
6894             {
6895               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6896                  cost.  */
6897               return true;
6898             }
6899           return false;
6900         }
6901       return false;
6902
6903     case MINUS:
6904       {
6905         op0 = XEXP (x, 0);
6906         op1 = XEXP (x, 1);
6907
6908 cost_minus:
6909         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6910
6911         /* Detect valid immediates.  */
6912         if ((GET_MODE_CLASS (mode) == MODE_INT
6913              || (GET_MODE_CLASS (mode) == MODE_CC
6914                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6915             && CONST_INT_P (op1)
6916             && aarch64_uimm12_shift (INTVAL (op1)))
6917           {
6918             if (speed)
6919               /* SUB(S) (immediate).  */
6920               *cost += extra_cost->alu.arith;
6921             return true;
6922           }
6923
6924         /* Look for SUB (extended register).  */
6925         if (aarch64_rtx_arith_op_extract_p (op1, mode))
6926           {
6927             if (speed)
6928               *cost += extra_cost->alu.extend_arith;
6929
6930             op1 = aarch64_strip_extend (op1);
6931             *cost += rtx_cost (op1, VOIDmode,
6932                                (enum rtx_code) GET_CODE (op1), 0, speed);
6933             return true;
6934           }
6935
6936         rtx new_op1 = aarch64_strip_extend (op1);
6937
6938         /* Cost this as an FMA-alike operation.  */
6939         if ((GET_CODE (new_op1) == MULT
6940              || aarch64_shift_p (GET_CODE (new_op1)))
6941             && code != COMPARE)
6942           {
6943             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6944                                             (enum rtx_code) code,
6945                                             speed);
6946             return true;
6947           }
6948
6949         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6950
6951         if (speed)
6952           {
6953             if (VECTOR_MODE_P (mode))
6954               {
6955                 /* Vector SUB.  */
6956                 *cost += extra_cost->vect.alu;
6957               }
6958             else if (GET_MODE_CLASS (mode) == MODE_INT)
6959               {
6960                 /* SUB(S).  */
6961                 *cost += extra_cost->alu.arith;
6962               }
6963             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6964               {
6965                 /* FSUB.  */
6966                 *cost += extra_cost->fp[mode == DFmode].addsub;
6967               }
6968           }
6969         return true;
6970       }
6971
6972     case PLUS:
6973       {
6974         rtx new_op0;
6975
6976         op0 = XEXP (x, 0);
6977         op1 = XEXP (x, 1);
6978
6979 cost_plus:
6980         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6981             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6982           {
6983             /* CSINC.  */
6984             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6985             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6986             return true;
6987           }
6988
6989         if (GET_MODE_CLASS (mode) == MODE_INT
6990             && CONST_INT_P (op1)
6991             && aarch64_uimm12_shift (INTVAL (op1)))
6992           {
6993             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6994
6995             if (speed)
6996               /* ADD (immediate).  */
6997               *cost += extra_cost->alu.arith;
6998             return true;
6999           }
7000
7001         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7002
7003         /* Look for ADD (extended register).  */
7004         if (aarch64_rtx_arith_op_extract_p (op0, mode))
7005           {
7006             if (speed)
7007               *cost += extra_cost->alu.extend_arith;
7008
7009             op0 = aarch64_strip_extend (op0);
7010             *cost += rtx_cost (op0, VOIDmode,
7011                                (enum rtx_code) GET_CODE (op0), 0, speed);
7012             return true;
7013           }
7014
7015         /* Strip any extend, leave shifts behind as we will
7016            cost them through mult_cost.  */
7017         new_op0 = aarch64_strip_extend (op0);
7018
7019         if (GET_CODE (new_op0) == MULT
7020             || aarch64_shift_p (GET_CODE (new_op0)))
7021           {
7022             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7023                                             speed);
7024             return true;
7025           }
7026
7027         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7028
7029         if (speed)
7030           {
7031             if (VECTOR_MODE_P (mode))
7032               {
7033                 /* Vector ADD.  */
7034                 *cost += extra_cost->vect.alu;
7035               }
7036             else if (GET_MODE_CLASS (mode) == MODE_INT)
7037               {
7038                 /* ADD.  */
7039                 *cost += extra_cost->alu.arith;
7040               }
7041             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7042               {
7043                 /* FADD.  */
7044                 *cost += extra_cost->fp[mode == DFmode].addsub;
7045               }
7046           }
7047         return true;
7048       }
7049
7050     case BSWAP:
7051       *cost = COSTS_N_INSNS (1);
7052
7053       if (speed)
7054         {
7055           if (VECTOR_MODE_P (mode))
7056             *cost += extra_cost->vect.alu;
7057           else
7058             *cost += extra_cost->alu.rev;
7059         }
7060       return false;
7061
7062     case IOR:
7063       if (aarch_rev16_p (x))
7064         {
7065           *cost = COSTS_N_INSNS (1);
7066
7067           if (speed)
7068             {
7069               if (VECTOR_MODE_P (mode))
7070                 *cost += extra_cost->vect.alu;
7071               else
7072                 *cost += extra_cost->alu.rev;
7073             }
7074           return true;
7075         }
7076
7077       if (aarch64_extr_rtx_p (x, &op0, &op1))
7078         {
7079           *cost += rtx_cost (op0, mode, IOR, 0, speed);
7080           *cost += rtx_cost (op1, mode, IOR, 1, speed);
7081           if (speed)
7082             *cost += extra_cost->alu.shift;
7083
7084           return true;
7085         }
7086     /* Fall through.  */
7087     case XOR:
7088     case AND:
7089     cost_logic:
7090       op0 = XEXP (x, 0);
7091       op1 = XEXP (x, 1);
7092
7093       if (VECTOR_MODE_P (mode))
7094         {
7095           if (speed)
7096             *cost += extra_cost->vect.alu;
7097           return true;
7098         }
7099
7100       if (code == AND
7101           && GET_CODE (op0) == MULT
7102           && CONST_INT_P (XEXP (op0, 1))
7103           && CONST_INT_P (op1)
7104           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7105                                INTVAL (op1)) != 0)
7106         {
7107           /* This is a UBFM/SBFM.  */
7108           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7109           if (speed)
7110             *cost += extra_cost->alu.bfx;
7111           return true;
7112         }
7113
7114       if (GET_MODE_CLASS (mode) == MODE_INT)
7115         {
7116           if (CONST_INT_P (op1))
7117             {
7118               /* We have a mask + shift version of a UBFIZ
7119                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
7120               if (GET_CODE (op0) == ASHIFT
7121                   && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7122                                                           XEXP (op0, 1)))
7123                 {
7124                   *cost += rtx_cost (XEXP (op0, 0), mode,
7125                                      (enum rtx_code) code, 0, speed);
7126                   if (speed)
7127                     *cost += extra_cost->alu.bfx;
7128
7129                   return true;
7130                 }
7131               else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7132                 {
7133                 /* We possibly get the immediate for free, this is not
7134                    modelled.  */
7135                   *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7136                   if (speed)
7137                     *cost += extra_cost->alu.logical;
7138
7139                   return true;
7140                 }
7141             }
7142           else
7143             {
7144               rtx new_op0 = op0;
7145
7146               /* Handle ORN, EON, or BIC.  */
7147               if (GET_CODE (op0) == NOT)
7148                 op0 = XEXP (op0, 0);
7149
7150               new_op0 = aarch64_strip_shift (op0);
7151
7152               /* If we had a shift on op0 then this is a logical-shift-
7153                  by-register/immediate operation.  Otherwise, this is just
7154                  a logical operation.  */
7155               if (speed)
7156                 {
7157                   if (new_op0 != op0)
7158                     {
7159                       /* Shift by immediate.  */
7160                       if (CONST_INT_P (XEXP (op0, 1)))
7161                         *cost += extra_cost->alu.log_shift;
7162                       else
7163                         *cost += extra_cost->alu.log_shift_reg;
7164                     }
7165                   else
7166                     *cost += extra_cost->alu.logical;
7167                 }
7168
7169               /* In both cases we want to cost both operands.  */
7170               *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7171               *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7172
7173               return true;
7174             }
7175         }
7176       return false;
7177
7178     case NOT:
7179       x = XEXP (x, 0);
7180       op0 = aarch64_strip_shift (x);
7181
7182       if (VECTOR_MODE_P (mode))
7183         {
7184           /* Vector NOT.  */
7185           *cost += extra_cost->vect.alu;
7186           return false;
7187         }
7188
7189       /* MVN-shifted-reg.  */
7190       if (op0 != x)
7191         {
7192           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7193
7194           if (speed)
7195             *cost += extra_cost->alu.log_shift;
7196
7197           return true;
7198         }
7199       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7200          Handle the second form here taking care that 'a' in the above can
7201          be a shift.  */
7202       else if (GET_CODE (op0) == XOR)
7203         {
7204           rtx newop0 = XEXP (op0, 0);
7205           rtx newop1 = XEXP (op0, 1);
7206           rtx op0_stripped = aarch64_strip_shift (newop0);
7207
7208           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7209           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7210
7211           if (speed)
7212             {
7213               if (op0_stripped != newop0)
7214                 *cost += extra_cost->alu.log_shift;
7215               else
7216                 *cost += extra_cost->alu.logical;
7217             }
7218
7219           return true;
7220         }
7221       /* MVN.  */
7222       if (speed)
7223         *cost += extra_cost->alu.logical;
7224
7225       return false;
7226
7227     case ZERO_EXTEND:
7228
7229       op0 = XEXP (x, 0);
7230       /* If a value is written in SI mode, then zero extended to DI
7231          mode, the operation will in general be free as a write to
7232          a 'w' register implicitly zeroes the upper bits of an 'x'
7233          register.  However, if this is
7234
7235            (set (reg) (zero_extend (reg)))
7236
7237          we must cost the explicit register move.  */
7238       if (mode == DImode
7239           && GET_MODE (op0) == SImode
7240           && outer == SET)
7241         {
7242           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7243
7244         /* If OP_COST is non-zero, then the cost of the zero extend
7245            is effectively the cost of the inner operation.  Otherwise
7246            we have a MOV instruction and we take the cost from the MOV
7247            itself.  This is true independently of whether we are
7248            optimizing for space or time.  */
7249           if (op_cost)
7250             *cost = op_cost;
7251
7252           return true;
7253         }
7254       else if (MEM_P (op0))
7255         {
7256           /* All loads can zero extend to any size for free.  */
7257           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7258           return true;
7259         }
7260
7261       op0 = aarch64_extend_bitfield_pattern_p (x);
7262       if (op0)
7263         {
7264           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7265           if (speed)
7266             *cost += extra_cost->alu.bfx;
7267           return true;
7268         }
7269
7270       if (speed)
7271         {
7272           if (VECTOR_MODE_P (mode))
7273             {
7274               /* UMOV.  */
7275               *cost += extra_cost->vect.alu;
7276             }
7277           else
7278             {
7279               /* We generate an AND instead of UXTB/UXTH.  */
7280               *cost += extra_cost->alu.logical;
7281             }
7282         }
7283       return false;
7284
7285     case SIGN_EXTEND:
7286       if (MEM_P (XEXP (x, 0)))
7287         {
7288           /* LDRSH.  */
7289           if (speed)
7290             {
7291               rtx address = XEXP (XEXP (x, 0), 0);
7292               *cost += extra_cost->ldst.load_sign_extend;
7293
7294               *cost +=
7295                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7296                                                      0, speed));
7297             }
7298           return true;
7299         }
7300
7301       op0 = aarch64_extend_bitfield_pattern_p (x);
7302       if (op0)
7303         {
7304           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7305           if (speed)
7306             *cost += extra_cost->alu.bfx;
7307           return true;
7308         }
7309
7310       if (speed)
7311         {
7312           if (VECTOR_MODE_P (mode))
7313             *cost += extra_cost->vect.alu;
7314           else
7315             *cost += extra_cost->alu.extend;
7316         }
7317       return false;
7318
7319     case ASHIFT:
7320       op0 = XEXP (x, 0);
7321       op1 = XEXP (x, 1);
7322
7323       if (CONST_INT_P (op1))
7324         {
7325           if (speed)
7326             {
7327               if (VECTOR_MODE_P (mode))
7328                 {
7329                   /* Vector shift (immediate).  */
7330                   *cost += extra_cost->vect.alu;
7331                 }
7332               else
7333                 {
7334                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
7335                      aliases.  */
7336                   *cost += extra_cost->alu.shift;
7337                 }
7338             }
7339
7340           /* We can incorporate zero/sign extend for free.  */
7341           if (GET_CODE (op0) == ZERO_EXTEND
7342               || GET_CODE (op0) == SIGN_EXTEND)
7343             op0 = XEXP (op0, 0);
7344
7345           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7346           return true;
7347         }
7348       else
7349         {
7350           if (speed)
7351             {
7352               if (VECTOR_MODE_P (mode))
7353                 {
7354                   /* Vector shift (register).  */
7355                   *cost += extra_cost->vect.alu;
7356                 }
7357               else
7358                 {
7359                   /* LSLV.  */
7360                   *cost += extra_cost->alu.shift_reg;
7361                 }
7362             }
7363           return false;  /* All arguments need to be in registers.  */
7364         }
7365
7366     case ROTATE:
7367     case ROTATERT:
7368     case LSHIFTRT:
7369     case ASHIFTRT:
7370       op0 = XEXP (x, 0);
7371       op1 = XEXP (x, 1);
7372
7373       if (CONST_INT_P (op1))
7374         {
7375           /* ASR (immediate) and friends.  */
7376           if (speed)
7377             {
7378               if (VECTOR_MODE_P (mode))
7379                 *cost += extra_cost->vect.alu;
7380               else
7381                 *cost += extra_cost->alu.shift;
7382             }
7383
7384           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7385           return true;
7386         }
7387       else
7388         {
7389
7390           /* ASR (register) and friends.  */
7391           if (speed)
7392             {
7393               if (VECTOR_MODE_P (mode))
7394                 *cost += extra_cost->vect.alu;
7395               else
7396                 *cost += extra_cost->alu.shift_reg;
7397             }
7398           return false;  /* All arguments need to be in registers.  */
7399         }
7400
7401     case SYMBOL_REF:
7402
7403       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7404           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7405         {
7406           /* LDR.  */
7407           if (speed)
7408             *cost += extra_cost->ldst.load;
7409         }
7410       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7411                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7412         {
7413           /* ADRP, followed by ADD.  */
7414           *cost += COSTS_N_INSNS (1);
7415           if (speed)
7416             *cost += 2 * extra_cost->alu.arith;
7417         }
7418       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7419                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7420         {
7421           /* ADR.  */
7422           if (speed)
7423             *cost += extra_cost->alu.arith;
7424         }
7425
7426       if (flag_pic)
7427         {
7428           /* One extra load instruction, after accessing the GOT.  */
7429           *cost += COSTS_N_INSNS (1);
7430           if (speed)
7431             *cost += extra_cost->ldst.load;
7432         }
7433       return true;
7434
7435     case HIGH:
7436     case LO_SUM:
7437       /* ADRP/ADD (immediate).  */
7438       if (speed)
7439         *cost += extra_cost->alu.arith;
7440       return true;
7441
7442     case ZERO_EXTRACT:
7443     case SIGN_EXTRACT:
7444       /* UBFX/SBFX.  */
7445       if (speed)
7446         {
7447           if (VECTOR_MODE_P (mode))
7448             *cost += extra_cost->vect.alu;
7449           else
7450             *cost += extra_cost->alu.bfx;
7451         }
7452
7453       /* We can trust that the immediates used will be correct (there
7454          are no by-register forms), so we need only cost op0.  */
7455       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7456       return true;
7457
7458     case MULT:
7459       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7460       /* aarch64_rtx_mult_cost always handles recursion to its
7461          operands.  */
7462       return true;
7463
7464     case MOD:
7465     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7466        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7467        an unconditional negate.  This case should only ever be reached through
7468        the set_smod_pow2_cheap check in expmed.c.  */
7469       if (CONST_INT_P (XEXP (x, 1))
7470           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7471           && (mode == SImode || mode == DImode))
7472         {
7473           /* We expand to 4 instructions.  Reset the baseline.  */
7474           *cost = COSTS_N_INSNS (4);
7475
7476           if (speed)
7477             *cost += 2 * extra_cost->alu.logical
7478                      + 2 * extra_cost->alu.arith;
7479
7480           return true;
7481         }
7482
7483     /* Fall-through.  */
7484     case UMOD:
7485       if (speed)
7486         {
7487           if (VECTOR_MODE_P (mode))
7488             *cost += extra_cost->vect.alu;
7489           else if (GET_MODE_CLASS (mode) == MODE_INT)
7490             *cost += (extra_cost->mult[mode == DImode].add
7491                       + extra_cost->mult[mode == DImode].idiv);
7492           else if (mode == DFmode)
7493             *cost += (extra_cost->fp[1].mult
7494                       + extra_cost->fp[1].div);
7495           else if (mode == SFmode)
7496             *cost += (extra_cost->fp[0].mult
7497                       + extra_cost->fp[0].div);
7498         }
7499       return false;  /* All arguments need to be in registers.  */
7500
7501     case DIV:
7502     case UDIV:
7503     case SQRT:
7504       if (speed)
7505         {
7506           if (VECTOR_MODE_P (mode))
7507             *cost += extra_cost->vect.alu;
7508           else if (GET_MODE_CLASS (mode) == MODE_INT)
7509             /* There is no integer SQRT, so only DIV and UDIV can get
7510                here.  */
7511             *cost += extra_cost->mult[mode == DImode].idiv;
7512           else
7513             *cost += extra_cost->fp[mode == DFmode].div;
7514         }
7515       return false;  /* All arguments need to be in registers.  */
7516
7517     case IF_THEN_ELSE:
7518       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7519                                          XEXP (x, 2), cost, speed);
7520
7521     case EQ:
7522     case NE:
7523     case GT:
7524     case GTU:
7525     case LT:
7526     case LTU:
7527     case GE:
7528     case GEU:
7529     case LE:
7530     case LEU:
7531
7532       return false; /* All arguments must be in registers.  */
7533
7534     case FMA:
7535       op0 = XEXP (x, 0);
7536       op1 = XEXP (x, 1);
7537       op2 = XEXP (x, 2);
7538
7539       if (speed)
7540         {
7541           if (VECTOR_MODE_P (mode))
7542             *cost += extra_cost->vect.alu;
7543           else
7544             *cost += extra_cost->fp[mode == DFmode].fma;
7545         }
7546
7547       /* FMSUB, FNMADD, and FNMSUB are free.  */
7548       if (GET_CODE (op0) == NEG)
7549         op0 = XEXP (op0, 0);
7550
7551       if (GET_CODE (op2) == NEG)
7552         op2 = XEXP (op2, 0);
7553
7554       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7555          and the by-element operand as operand 0.  */
7556       if (GET_CODE (op1) == NEG)
7557         op1 = XEXP (op1, 0);
7558
7559       /* Catch vector-by-element operations.  The by-element operand can
7560          either be (vec_duplicate (vec_select (x))) or just
7561          (vec_select (x)), depending on whether we are multiplying by
7562          a vector or a scalar.
7563
7564          Canonicalization is not very good in these cases, FMA4 will put the
7565          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7566       if (GET_CODE (op0) == VEC_DUPLICATE)
7567         op0 = XEXP (op0, 0);
7568       else if (GET_CODE (op1) == VEC_DUPLICATE)
7569         op1 = XEXP (op1, 0);
7570
7571       if (GET_CODE (op0) == VEC_SELECT)
7572         op0 = XEXP (op0, 0);
7573       else if (GET_CODE (op1) == VEC_SELECT)
7574         op1 = XEXP (op1, 0);
7575
7576       /* If the remaining parameters are not registers,
7577          get the cost to put them into registers.  */
7578       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7579       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7580       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7581       return true;
7582
7583     case FLOAT:
7584     case UNSIGNED_FLOAT:
7585       if (speed)
7586         *cost += extra_cost->fp[mode == DFmode].fromint;
7587       return false;
7588
7589     case FLOAT_EXTEND:
7590       if (speed)
7591         {
7592           if (VECTOR_MODE_P (mode))
7593             {
7594               /*Vector truncate.  */
7595               *cost += extra_cost->vect.alu;
7596             }
7597           else
7598             *cost += extra_cost->fp[mode == DFmode].widen;
7599         }
7600       return false;
7601
7602     case FLOAT_TRUNCATE:
7603       if (speed)
7604         {
7605           if (VECTOR_MODE_P (mode))
7606             {
7607               /*Vector conversion.  */
7608               *cost += extra_cost->vect.alu;
7609             }
7610           else
7611             *cost += extra_cost->fp[mode == DFmode].narrow;
7612         }
7613       return false;
7614
7615     case FIX:
7616     case UNSIGNED_FIX:
7617       x = XEXP (x, 0);
7618       /* Strip the rounding part.  They will all be implemented
7619          by the fcvt* family of instructions anyway.  */
7620       if (GET_CODE (x) == UNSPEC)
7621         {
7622           unsigned int uns_code = XINT (x, 1);
7623
7624           if (uns_code == UNSPEC_FRINTA
7625               || uns_code == UNSPEC_FRINTM
7626               || uns_code == UNSPEC_FRINTN
7627               || uns_code == UNSPEC_FRINTP
7628               || uns_code == UNSPEC_FRINTZ)
7629             x = XVECEXP (x, 0, 0);
7630         }
7631
7632       if (speed)
7633         {
7634           if (VECTOR_MODE_P (mode))
7635             *cost += extra_cost->vect.alu;
7636           else
7637             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7638         }
7639
7640       /* We can combine fmul by a power of 2 followed by a fcvt into a single
7641          fixed-point fcvt.  */
7642       if (GET_CODE (x) == MULT
7643           && ((VECTOR_MODE_P (mode)
7644                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7645               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7646         {
7647           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7648                              0, speed);
7649           return true;
7650         }
7651
7652       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7653       return true;
7654
7655     case ABS:
7656       if (VECTOR_MODE_P (mode))
7657         {
7658           /* ABS (vector).  */
7659           if (speed)
7660             *cost += extra_cost->vect.alu;
7661         }
7662       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7663         {
7664           op0 = XEXP (x, 0);
7665
7666           /* FABD, which is analogous to FADD.  */
7667           if (GET_CODE (op0) == MINUS)
7668             {
7669               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7670               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7671               if (speed)
7672                 *cost += extra_cost->fp[mode == DFmode].addsub;
7673
7674               return true;
7675             }
7676           /* Simple FABS is analogous to FNEG.  */
7677           if (speed)
7678             *cost += extra_cost->fp[mode == DFmode].neg;
7679         }
7680       else
7681         {
7682           /* Integer ABS will either be split to
7683              two arithmetic instructions, or will be an ABS
7684              (scalar), which we don't model.  */
7685           *cost = COSTS_N_INSNS (2);
7686           if (speed)
7687             *cost += 2 * extra_cost->alu.arith;
7688         }
7689       return false;
7690
7691     case SMAX:
7692     case SMIN:
7693       if (speed)
7694         {
7695           if (VECTOR_MODE_P (mode))
7696             *cost += extra_cost->vect.alu;
7697           else
7698             {
7699               /* FMAXNM/FMINNM/FMAX/FMIN.
7700                  TODO: This may not be accurate for all implementations, but
7701                  we do not model this in the cost tables.  */
7702               *cost += extra_cost->fp[mode == DFmode].addsub;
7703             }
7704         }
7705       return false;
7706
7707     case UNSPEC:
7708       /* The floating point round to integer frint* instructions.  */
7709       if (aarch64_frint_unspec_p (XINT (x, 1)))
7710         {
7711           if (speed)
7712             *cost += extra_cost->fp[mode == DFmode].roundint;
7713
7714           return false;
7715         }
7716
7717       if (XINT (x, 1) == UNSPEC_RBIT)
7718         {
7719           if (speed)
7720             *cost += extra_cost->alu.rev;
7721
7722           return false;
7723         }
7724       break;
7725
7726     case TRUNCATE:
7727
7728       /* Decompose <su>muldi3_highpart.  */
7729       if (/* (truncate:DI  */
7730           mode == DImode
7731           /*   (lshiftrt:TI  */
7732           && GET_MODE (XEXP (x, 0)) == TImode
7733           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7734           /*      (mult:TI  */
7735           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7736           /*        (ANY_EXTEND:TI (reg:DI))
7737                     (ANY_EXTEND:TI (reg:DI)))  */
7738           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7739                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7740               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7741                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7742           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7743           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7744           /*     (const_int 64)  */
7745           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7746           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7747         {
7748           /* UMULH/SMULH.  */
7749           if (speed)
7750             *cost += extra_cost->mult[mode == DImode].extend;
7751           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7752                              mode, MULT, 0, speed);
7753           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7754                              mode, MULT, 1, speed);
7755           return true;
7756         }
7757
7758       /* Fall through.  */
7759     default:
7760       break;
7761     }
7762
7763   if (dump_file
7764       && flag_aarch64_verbose_cost)
7765     fprintf (dump_file,
7766       "\nFailed to cost RTX.  Assuming default cost.\n");
7767
7768   return true;
7769 }
7770
7771 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7772    calculated for X.  This cost is stored in *COST.  Returns true
7773    if the total cost of X was calculated.  */
7774 static bool
7775 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7776                    int param, int *cost, bool speed)
7777 {
7778   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7779
7780   if (dump_file
7781       && flag_aarch64_verbose_cost)
7782     {
7783       print_rtl_single (dump_file, x);
7784       fprintf (dump_file, "\n%s cost: %d (%s)\n",
7785                speed ? "Hot" : "Cold",
7786                *cost, result ? "final" : "partial");
7787     }
7788
7789   return result;
7790 }
7791
7792 static int
7793 aarch64_register_move_cost (machine_mode mode,
7794                             reg_class_t from_i, reg_class_t to_i)
7795 {
7796   enum reg_class from = (enum reg_class) from_i;
7797   enum reg_class to = (enum reg_class) to_i;
7798   const struct cpu_regmove_cost *regmove_cost
7799     = aarch64_tune_params.regmove_cost;
7800
7801   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
7802   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7803     to = GENERAL_REGS;
7804
7805   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7806     from = GENERAL_REGS;
7807
7808   /* Moving between GPR and stack cost is the same as GP2GP.  */
7809   if ((from == GENERAL_REGS && to == STACK_REG)
7810       || (to == GENERAL_REGS && from == STACK_REG))
7811     return regmove_cost->GP2GP;
7812
7813   /* To/From the stack register, we move via the gprs.  */
7814   if (to == STACK_REG || from == STACK_REG)
7815     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7816             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7817
7818   if (GET_MODE_SIZE (mode) == 16)
7819     {
7820       /* 128-bit operations on general registers require 2 instructions.  */
7821       if (from == GENERAL_REGS && to == GENERAL_REGS)
7822         return regmove_cost->GP2GP * 2;
7823       else if (from == GENERAL_REGS)
7824         return regmove_cost->GP2FP * 2;
7825       else if (to == GENERAL_REGS)
7826         return regmove_cost->FP2GP * 2;
7827
7828       /* When AdvSIMD instructions are disabled it is not possible to move
7829          a 128-bit value directly between Q registers.  This is handled in
7830          secondary reload.  A general register is used as a scratch to move
7831          the upper DI value and the lower DI value is moved directly,
7832          hence the cost is the sum of three moves. */
7833       if (! TARGET_SIMD)
7834         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7835
7836       return regmove_cost->FP2FP;
7837     }
7838
7839   if (from == GENERAL_REGS && to == GENERAL_REGS)
7840     return regmove_cost->GP2GP;
7841   else if (from == GENERAL_REGS)
7842     return regmove_cost->GP2FP;
7843   else if (to == GENERAL_REGS)
7844     return regmove_cost->FP2GP;
7845
7846   return regmove_cost->FP2FP;
7847 }
7848
7849 static int
7850 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7851                           reg_class_t rclass ATTRIBUTE_UNUSED,
7852                           bool in ATTRIBUTE_UNUSED)
7853 {
7854   return aarch64_tune_params.memmov_cost;
7855 }
7856
7857 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7858    to optimize 1.0/sqrt.  */
7859
7860 static bool
7861 use_rsqrt_p (machine_mode mode)
7862 {
7863   return (!flag_trapping_math
7864           && flag_unsafe_math_optimizations
7865           && ((aarch64_tune_params.approx_modes->recip_sqrt
7866                & AARCH64_APPROX_MODE (mode))
7867               || flag_mrecip_low_precision_sqrt));
7868 }
7869
7870 /* Function to decide when to use the approximate reciprocal square root
7871    builtin.  */
7872
7873 static tree
7874 aarch64_builtin_reciprocal (tree fndecl)
7875 {
7876   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7877
7878   if (!use_rsqrt_p (mode))
7879     return NULL_TREE;
7880   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7881 }
7882
7883 typedef rtx (*rsqrte_type) (rtx, rtx);
7884
7885 /* Select reciprocal square root initial estimate insn depending on machine
7886    mode.  */
7887
7888 static rsqrte_type
7889 get_rsqrte_type (machine_mode mode)
7890 {
7891   switch (mode)
7892   {
7893     case DFmode:   return gen_aarch64_rsqrtedf;
7894     case SFmode:   return gen_aarch64_rsqrtesf;
7895     case V2DFmode: return gen_aarch64_rsqrtev2df;
7896     case V2SFmode: return gen_aarch64_rsqrtev2sf;
7897     case V4SFmode: return gen_aarch64_rsqrtev4sf;
7898     default: gcc_unreachable ();
7899   }
7900 }
7901
7902 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7903
7904 /* Select reciprocal square root series step insn depending on machine mode.  */
7905
7906 static rsqrts_type
7907 get_rsqrts_type (machine_mode mode)
7908 {
7909   switch (mode)
7910   {
7911     case DFmode:   return gen_aarch64_rsqrtsdf;
7912     case SFmode:   return gen_aarch64_rsqrtssf;
7913     case V2DFmode: return gen_aarch64_rsqrtsv2df;
7914     case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7915     case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7916     default: gcc_unreachable ();
7917   }
7918 }
7919
7920 /* Emit instruction sequence to compute either the approximate square root
7921    or its approximate reciprocal, depending on the flag RECP, and return
7922    whether the sequence was emitted or not.  */
7923
7924 bool
7925 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7926 {
7927   machine_mode mode = GET_MODE (dst);
7928
7929   if (GET_MODE_INNER (mode) == HFmode)
7930     return false;
7931
7932   machine_mode mmsk = mode_for_vector
7933                         (int_mode_for_mode (GET_MODE_INNER (mode)),
7934                          GET_MODE_NUNITS (mode));
7935   bool use_approx_sqrt_p = (!recp
7936                             && (flag_mlow_precision_sqrt
7937                                 || (aarch64_tune_params.approx_modes->sqrt
7938                                     & AARCH64_APPROX_MODE (mode))));
7939   bool use_approx_rsqrt_p = (recp
7940                              && (flag_mrecip_low_precision_sqrt
7941                                  || (aarch64_tune_params.approx_modes->recip_sqrt
7942                                      & AARCH64_APPROX_MODE (mode))));
7943
7944   if (!flag_finite_math_only
7945       || flag_trapping_math
7946       || !flag_unsafe_math_optimizations
7947       || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7948       || optimize_function_for_size_p (cfun))
7949     return false;
7950
7951   rtx xmsk = gen_reg_rtx (mmsk);
7952   if (!recp)
7953     /* When calculating the approximate square root, compare the argument with
7954        0.0 and create a mask.  */
7955     emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7956                                                           CONST0_RTX (mode)))));
7957
7958   /* Estimate the approximate reciprocal square root.  */
7959   rtx xdst = gen_reg_rtx (mode);
7960   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7961
7962   /* Iterate over the series twice for SF and thrice for DF.  */
7963   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7964
7965   /* Optionally iterate over the series once less for faster performance
7966      while sacrificing the accuracy.  */
7967   if ((recp && flag_mrecip_low_precision_sqrt)
7968       || (!recp && flag_mlow_precision_sqrt))
7969     iterations--;
7970
7971   /* Iterate over the series to calculate the approximate reciprocal square
7972      root.  */
7973   rtx x1 = gen_reg_rtx (mode);
7974   while (iterations--)
7975     {
7976       rtx x2 = gen_reg_rtx (mode);
7977       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7978
7979       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7980
7981       if (iterations > 0)
7982         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7983     }
7984
7985   if (!recp)
7986     {
7987       /* Qualify the approximate reciprocal square root when the argument is
7988          0.0 by squashing the intermediary result to 0.0.  */
7989       rtx xtmp = gen_reg_rtx (mmsk);
7990       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7991                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
7992       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7993
7994       /* Calculate the approximate square root.  */
7995       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7996     }
7997
7998   /* Finalize the approximation.  */
7999   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8000
8001   return true;
8002 }
8003
8004 typedef rtx (*recpe_type) (rtx, rtx);
8005
8006 /* Select reciprocal initial estimate insn depending on machine mode.  */
8007
8008 static recpe_type
8009 get_recpe_type (machine_mode mode)
8010 {
8011   switch (mode)
8012   {
8013     case SFmode:   return (gen_aarch64_frecpesf);
8014     case V2SFmode: return (gen_aarch64_frecpev2sf);
8015     case V4SFmode: return (gen_aarch64_frecpev4sf);
8016     case DFmode:   return (gen_aarch64_frecpedf);
8017     case V2DFmode: return (gen_aarch64_frecpev2df);
8018     default:       gcc_unreachable ();
8019   }
8020 }
8021
8022 typedef rtx (*recps_type) (rtx, rtx, rtx);
8023
8024 /* Select reciprocal series step insn depending on machine mode.  */
8025
8026 static recps_type
8027 get_recps_type (machine_mode mode)
8028 {
8029   switch (mode)
8030   {
8031     case SFmode:   return (gen_aarch64_frecpssf);
8032     case V2SFmode: return (gen_aarch64_frecpsv2sf);
8033     case V4SFmode: return (gen_aarch64_frecpsv4sf);
8034     case DFmode:   return (gen_aarch64_frecpsdf);
8035     case V2DFmode: return (gen_aarch64_frecpsv2df);
8036     default:       gcc_unreachable ();
8037   }
8038 }
8039
8040 /* Emit the instruction sequence to compute the approximation for the division
8041    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
8042
8043 bool
8044 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8045 {
8046   machine_mode mode = GET_MODE (quo);
8047
8048   if (GET_MODE_INNER (mode) == HFmode)
8049     return false;
8050
8051   bool use_approx_division_p = (flag_mlow_precision_div
8052                                 || (aarch64_tune_params.approx_modes->division
8053                                     & AARCH64_APPROX_MODE (mode)));
8054
8055   if (!flag_finite_math_only
8056       || flag_trapping_math
8057       || !flag_unsafe_math_optimizations
8058       || optimize_function_for_size_p (cfun)
8059       || !use_approx_division_p)
8060     return false;
8061
8062   /* Estimate the approximate reciprocal.  */
8063   rtx xrcp = gen_reg_rtx (mode);
8064   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8065
8066   /* Iterate over the series twice for SF and thrice for DF.  */
8067   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8068
8069   /* Optionally iterate over the series once less for faster performance,
8070      while sacrificing the accuracy.  */
8071   if (flag_mlow_precision_div)
8072     iterations--;
8073
8074   /* Iterate over the series to calculate the approximate reciprocal.  */
8075   rtx xtmp = gen_reg_rtx (mode);
8076   while (iterations--)
8077     {
8078       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8079
8080       if (iterations > 0)
8081         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8082     }
8083
8084   if (num != CONST1_RTX (mode))
8085     {
8086       /* As the approximate reciprocal of DEN is already calculated, only
8087          calculate the approximate division when NUM is not 1.0.  */
8088       rtx xnum = force_reg (mode, num);
8089       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8090     }
8091
8092   /* Finalize the approximation.  */
8093   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8094   return true;
8095 }
8096
8097 /* Return the number of instructions that can be issued per cycle.  */
8098 static int
8099 aarch64_sched_issue_rate (void)
8100 {
8101   return aarch64_tune_params.issue_rate;
8102 }
8103
8104 static int
8105 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8106 {
8107   int issue_rate = aarch64_sched_issue_rate ();
8108
8109   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8110 }
8111
8112
8113 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8114    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
8115    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
8116
8117 static int
8118 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8119                                                     int ready_index)
8120 {
8121   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8122 }
8123
8124
8125 /* Vectorizer cost model target hooks.  */
8126
8127 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
8128 static int
8129 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8130                                     tree vectype,
8131                                     int misalign ATTRIBUTE_UNUSED)
8132 {
8133   unsigned elements;
8134   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8135   bool fp = false;
8136
8137   if (vectype != NULL)
8138     fp = FLOAT_TYPE_P (vectype);
8139
8140   switch (type_of_cost)
8141     {
8142       case scalar_stmt:
8143         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8144
8145       case scalar_load:
8146         return costs->scalar_load_cost;
8147
8148       case scalar_store:
8149         return costs->scalar_store_cost;
8150
8151       case vector_stmt:
8152         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8153
8154       case vector_load:
8155         return costs->vec_align_load_cost;
8156
8157       case vector_store:
8158         return costs->vec_store_cost;
8159
8160       case vec_to_scalar:
8161         return costs->vec_to_scalar_cost;
8162
8163       case scalar_to_vec:
8164         return costs->scalar_to_vec_cost;
8165
8166       case unaligned_load:
8167         return costs->vec_unalign_load_cost;
8168
8169       case unaligned_store:
8170         return costs->vec_unalign_store_cost;
8171
8172       case cond_branch_taken:
8173         return costs->cond_taken_branch_cost;
8174
8175       case cond_branch_not_taken:
8176         return costs->cond_not_taken_branch_cost;
8177
8178       case vec_perm:
8179         return costs->vec_permute_cost;
8180
8181       case vec_promote_demote:
8182         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8183
8184       case vec_construct:
8185         elements = TYPE_VECTOR_SUBPARTS (vectype);
8186         return elements / 2 + 1;
8187
8188       default:
8189         gcc_unreachable ();
8190     }
8191 }
8192
8193 /* Implement targetm.vectorize.add_stmt_cost.  */
8194 static unsigned
8195 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8196                        struct _stmt_vec_info *stmt_info, int misalign,
8197                        enum vect_cost_model_location where)
8198 {
8199   unsigned *cost = (unsigned *) data;
8200   unsigned retval = 0;
8201
8202   if (flag_vect_cost_model)
8203     {
8204       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8205       int stmt_cost =
8206             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8207
8208       /* Statements in an inner loop relative to the loop being
8209          vectorized are weighted more heavily.  The value here is
8210          arbitrary and could potentially be improved with analysis.  */
8211       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8212         count *= 50; /*  FIXME  */
8213
8214       retval = (unsigned) (count * stmt_cost);
8215       cost[where] += retval;
8216     }
8217
8218   return retval;
8219 }
8220
8221 static void initialize_aarch64_code_model (struct gcc_options *);
8222
8223 /* Parse the TO_PARSE string and put the architecture struct that it
8224    selects into RES and the architectural features into ISA_FLAGS.
8225    Return an aarch64_parse_opt_result describing the parse result.
8226    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
8227
8228 static enum aarch64_parse_opt_result
8229 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8230                     unsigned long *isa_flags)
8231 {
8232   char *ext;
8233   const struct processor *arch;
8234   char *str = (char *) alloca (strlen (to_parse) + 1);
8235   size_t len;
8236
8237   strcpy (str, to_parse);
8238
8239   ext = strchr (str, '+');
8240
8241   if (ext != NULL)
8242     len = ext - str;
8243   else
8244     len = strlen (str);
8245
8246   if (len == 0)
8247     return AARCH64_PARSE_MISSING_ARG;
8248
8249
8250   /* Loop through the list of supported ARCHes to find a match.  */
8251   for (arch = all_architectures; arch->name != NULL; arch++)
8252     {
8253       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8254         {
8255           unsigned long isa_temp = arch->flags;
8256
8257           if (ext != NULL)
8258             {
8259               /* TO_PARSE string contains at least one extension.  */
8260               enum aarch64_parse_opt_result ext_res
8261                 = aarch64_parse_extension (ext, &isa_temp);
8262
8263               if (ext_res != AARCH64_PARSE_OK)
8264                 return ext_res;
8265             }
8266           /* Extension parsing was successful.  Confirm the result
8267              arch and ISA flags.  */
8268           *res = arch;
8269           *isa_flags = isa_temp;
8270           return AARCH64_PARSE_OK;
8271         }
8272     }
8273
8274   /* ARCH name not found in list.  */
8275   return AARCH64_PARSE_INVALID_ARG;
8276 }
8277
8278 /* Parse the TO_PARSE string and put the result tuning in RES and the
8279    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
8280    describing the parse result.  If there is an error parsing, RES and
8281    ISA_FLAGS are left unchanged.  */
8282
8283 static enum aarch64_parse_opt_result
8284 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8285                    unsigned long *isa_flags)
8286 {
8287   char *ext;
8288   const struct processor *cpu;
8289   char *str = (char *) alloca (strlen (to_parse) + 1);
8290   size_t len;
8291
8292   strcpy (str, to_parse);
8293
8294   ext = strchr (str, '+');
8295
8296   if (ext != NULL)
8297     len = ext - str;
8298   else
8299     len = strlen (str);
8300
8301   if (len == 0)
8302     return AARCH64_PARSE_MISSING_ARG;
8303
8304
8305   /* Loop through the list of supported CPUs to find a match.  */
8306   for (cpu = all_cores; cpu->name != NULL; cpu++)
8307     {
8308       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8309         {
8310           unsigned long isa_temp = cpu->flags;
8311
8312
8313           if (ext != NULL)
8314             {
8315               /* TO_PARSE string contains at least one extension.  */
8316               enum aarch64_parse_opt_result ext_res
8317                 = aarch64_parse_extension (ext, &isa_temp);
8318
8319               if (ext_res != AARCH64_PARSE_OK)
8320                 return ext_res;
8321             }
8322           /* Extension parsing was successfull.  Confirm the result
8323              cpu and ISA flags.  */
8324           *res = cpu;
8325           *isa_flags = isa_temp;
8326           return AARCH64_PARSE_OK;
8327         }
8328     }
8329
8330   /* CPU name not found in list.  */
8331   return AARCH64_PARSE_INVALID_ARG;
8332 }
8333
8334 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8335    Return an aarch64_parse_opt_result describing the parse result.
8336    If the parsing fails the RES does not change.  */
8337
8338 static enum aarch64_parse_opt_result
8339 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8340 {
8341   const struct processor *cpu;
8342   char *str = (char *) alloca (strlen (to_parse) + 1);
8343
8344   strcpy (str, to_parse);
8345
8346   /* Loop through the list of supported CPUs to find a match.  */
8347   for (cpu = all_cores; cpu->name != NULL; cpu++)
8348     {
8349       if (strcmp (cpu->name, str) == 0)
8350         {
8351           *res = cpu;
8352           return AARCH64_PARSE_OK;
8353         }
8354     }
8355
8356   /* CPU name not found in list.  */
8357   return AARCH64_PARSE_INVALID_ARG;
8358 }
8359
8360 /* Parse TOKEN, which has length LENGTH to see if it is an option
8361    described in FLAG.  If it is, return the index bit for that fusion type.
8362    If not, error (printing OPTION_NAME) and return zero.  */
8363
8364 static unsigned int
8365 aarch64_parse_one_option_token (const char *token,
8366                                 size_t length,
8367                                 const struct aarch64_flag_desc *flag,
8368                                 const char *option_name)
8369 {
8370   for (; flag->name != NULL; flag++)
8371     {
8372       if (length == strlen (flag->name)
8373           && !strncmp (flag->name, token, length))
8374         return flag->flag;
8375     }
8376
8377   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8378   return 0;
8379 }
8380
8381 /* Parse OPTION which is a comma-separated list of flags to enable.
8382    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8383    default state we inherit from the CPU tuning structures.  OPTION_NAME
8384    gives the top-level option we are parsing in the -moverride string,
8385    for use in error messages.  */
8386
8387 static unsigned int
8388 aarch64_parse_boolean_options (const char *option,
8389                                const struct aarch64_flag_desc *flags,
8390                                unsigned int initial_state,
8391                                const char *option_name)
8392 {
8393   const char separator = '.';
8394   const char* specs = option;
8395   const char* ntoken = option;
8396   unsigned int found_flags = initial_state;
8397
8398   while ((ntoken = strchr (specs, separator)))
8399     {
8400       size_t token_length = ntoken - specs;
8401       unsigned token_ops = aarch64_parse_one_option_token (specs,
8402                                                            token_length,
8403                                                            flags,
8404                                                            option_name);
8405       /* If we find "none" (or, for simplicity's sake, an error) anywhere
8406          in the token stream, reset the supported operations.  So:
8407
8408            adrp+add.cmp+branch.none.adrp+add
8409
8410            would have the result of turning on only adrp+add fusion.  */
8411       if (!token_ops)
8412         found_flags = 0;
8413
8414       found_flags |= token_ops;
8415       specs = ++ntoken;
8416     }
8417
8418   /* We ended with a comma, print something.  */
8419   if (!(*specs))
8420     {
8421       error ("%s string ill-formed\n", option_name);
8422       return 0;
8423     }
8424
8425   /* We still have one more token to parse.  */
8426   size_t token_length = strlen (specs);
8427   unsigned token_ops = aarch64_parse_one_option_token (specs,
8428                                                        token_length,
8429                                                        flags,
8430                                                        option_name);
8431    if (!token_ops)
8432      found_flags = 0;
8433
8434   found_flags |= token_ops;
8435   return found_flags;
8436 }
8437
8438 /* Support for overriding instruction fusion.  */
8439
8440 static void
8441 aarch64_parse_fuse_string (const char *fuse_string,
8442                             struct tune_params *tune)
8443 {
8444   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8445                                                      aarch64_fusible_pairs,
8446                                                      tune->fusible_ops,
8447                                                      "fuse=");
8448 }
8449
8450 /* Support for overriding other tuning flags.  */
8451
8452 static void
8453 aarch64_parse_tune_string (const char *tune_string,
8454                             struct tune_params *tune)
8455 {
8456   tune->extra_tuning_flags
8457     = aarch64_parse_boolean_options (tune_string,
8458                                      aarch64_tuning_flags,
8459                                      tune->extra_tuning_flags,
8460                                      "tune=");
8461 }
8462
8463 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8464    we understand.  If it is, extract the option string and handoff to
8465    the appropriate function.  */
8466
8467 void
8468 aarch64_parse_one_override_token (const char* token,
8469                                   size_t length,
8470                                   struct tune_params *tune)
8471 {
8472   const struct aarch64_tuning_override_function *fn
8473     = aarch64_tuning_override_functions;
8474
8475   const char *option_part = strchr (token, '=');
8476   if (!option_part)
8477     {
8478       error ("tuning string missing in option (%s)", token);
8479       return;
8480     }
8481
8482   /* Get the length of the option name.  */
8483   length = option_part - token;
8484   /* Skip the '=' to get to the option string.  */
8485   option_part++;
8486
8487   for (; fn->name != NULL; fn++)
8488     {
8489       if (!strncmp (fn->name, token, length))
8490         {
8491           fn->parse_override (option_part, tune);
8492           return;
8493         }
8494     }
8495
8496   error ("unknown tuning option (%s)",token);
8497   return;
8498 }
8499
8500 /* A checking mechanism for the implementation of the tls size.  */
8501
8502 static void
8503 initialize_aarch64_tls_size (struct gcc_options *opts)
8504 {
8505   if (aarch64_tls_size == 0)
8506     aarch64_tls_size = 24;
8507
8508   switch (opts->x_aarch64_cmodel_var)
8509     {
8510     case AARCH64_CMODEL_TINY:
8511       /* Both the default and maximum TLS size allowed under tiny is 1M which
8512          needs two instructions to address, so we clamp the size to 24.  */
8513       if (aarch64_tls_size > 24)
8514         aarch64_tls_size = 24;
8515       break;
8516     case AARCH64_CMODEL_SMALL:
8517       /* The maximum TLS size allowed under small is 4G.  */
8518       if (aarch64_tls_size > 32)
8519         aarch64_tls_size = 32;
8520       break;
8521     case AARCH64_CMODEL_LARGE:
8522       /* The maximum TLS size allowed under large is 16E.
8523          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8524       if (aarch64_tls_size > 48)
8525         aarch64_tls_size = 48;
8526       break;
8527     default:
8528       gcc_unreachable ();
8529     }
8530
8531   return;
8532 }
8533
8534 /* Parse STRING looking for options in the format:
8535      string     :: option:string
8536      option     :: name=substring
8537      name       :: {a-z}
8538      substring  :: defined by option.  */
8539
8540 static void
8541 aarch64_parse_override_string (const char* input_string,
8542                                struct tune_params* tune)
8543 {
8544   const char separator = ':';
8545   size_t string_length = strlen (input_string) + 1;
8546   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8547   char *string = string_root;
8548   strncpy (string, input_string, string_length);
8549   string[string_length - 1] = '\0';
8550
8551   char* ntoken = string;
8552
8553   while ((ntoken = strchr (string, separator)))
8554     {
8555       size_t token_length = ntoken - string;
8556       /* Make this substring look like a string.  */
8557       *ntoken = '\0';
8558       aarch64_parse_one_override_token (string, token_length, tune);
8559       string = ++ntoken;
8560     }
8561
8562   /* One last option to parse.  */
8563   aarch64_parse_one_override_token (string, strlen (string), tune);
8564   free (string_root);
8565 }
8566
8567
8568 static void
8569 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8570 {
8571   /* The logic here is that if we are disabling all frame pointer generation
8572      then we do not need to disable leaf frame pointer generation as a
8573      separate operation.  But if we are *only* disabling leaf frame pointer
8574      generation then we set flag_omit_frame_pointer to true, but in
8575      aarch64_frame_pointer_required we return false only for leaf functions.
8576
8577      PR 70044: We have to be careful about being called multiple times for the
8578      same function.  Once we have decided to set flag_omit_frame_pointer just
8579      so that we can omit leaf frame pointers, we must then not interpret a
8580      second call as meaning that all frame pointer generation should be
8581      omitted.  We do this by setting flag_omit_frame_pointer to a special,
8582      non-zero value.  */
8583   if (opts->x_flag_omit_frame_pointer == 2)
8584     opts->x_flag_omit_frame_pointer = 0;
8585
8586   if (opts->x_flag_omit_frame_pointer)
8587     opts->x_flag_omit_leaf_frame_pointer = false;
8588   else if (opts->x_flag_omit_leaf_frame_pointer)
8589     opts->x_flag_omit_frame_pointer = 2;
8590
8591   /* If not optimizing for size, set the default
8592      alignment to what the target wants.  */
8593   if (!opts->x_optimize_size)
8594     {
8595       if (opts->x_align_loops <= 0)
8596         opts->x_align_loops = aarch64_tune_params.loop_align;
8597       if (opts->x_align_jumps <= 0)
8598         opts->x_align_jumps = aarch64_tune_params.jump_align;
8599       if (opts->x_align_functions <= 0)
8600         opts->x_align_functions = aarch64_tune_params.function_align;
8601     }
8602
8603   /* We default to no pc-relative literal loads.  */
8604
8605   aarch64_pcrelative_literal_loads = false;
8606
8607   /* If -mpc-relative-literal-loads is set on the command line, this
8608      implies that the user asked for PC relative literal loads.  */
8609   if (opts->x_pcrelative_literal_loads == 1)
8610     aarch64_pcrelative_literal_loads = true;
8611
8612   /* This is PR70113. When building the Linux kernel with
8613      CONFIG_ARM64_ERRATUM_843419, support for relocations
8614      R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8615      removed from the kernel to avoid loading objects with possibly
8616      offending sequences.  Without -mpc-relative-literal-loads we would
8617      generate such relocations, preventing the kernel build from
8618      succeeding.  */
8619   if (opts->x_pcrelative_literal_loads == 2
8620       && TARGET_FIX_ERR_A53_843419)
8621     aarch64_pcrelative_literal_loads = true;
8622
8623   /* In the tiny memory model it makes no sense to disallow PC relative
8624      literal pool loads.  */
8625   if (aarch64_cmodel == AARCH64_CMODEL_TINY
8626       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8627     aarch64_pcrelative_literal_loads = true;
8628
8629   /* When enabling the lower precision Newton series for the square root, also
8630      enable it for the reciprocal square root, since the latter is an
8631      intermediary step for the former.  */
8632   if (flag_mlow_precision_sqrt)
8633     flag_mrecip_low_precision_sqrt = true;
8634 }
8635
8636 /* 'Unpack' up the internal tuning structs and update the options
8637     in OPTS.  The caller must have set up selected_tune and selected_arch
8638     as all the other target-specific codegen decisions are
8639     derived from them.  */
8640
8641 void
8642 aarch64_override_options_internal (struct gcc_options *opts)
8643 {
8644   aarch64_tune_flags = selected_tune->flags;
8645   aarch64_tune = selected_tune->sched_core;
8646   /* Make a copy of the tuning parameters attached to the core, which
8647      we may later overwrite.  */
8648   aarch64_tune_params = *(selected_tune->tune);
8649   aarch64_architecture_version = selected_arch->architecture_version;
8650
8651   if (opts->x_aarch64_override_tune_string)
8652     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8653                                   &aarch64_tune_params);
8654
8655   /* This target defaults to strict volatile bitfields.  */
8656   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8657     opts->x_flag_strict_volatile_bitfields = 1;
8658
8659   initialize_aarch64_code_model (opts);
8660   initialize_aarch64_tls_size (opts);
8661
8662   int queue_depth = 0;
8663   switch (aarch64_tune_params.autoprefetcher_model)
8664     {
8665       case tune_params::AUTOPREFETCHER_OFF:
8666         queue_depth = -1;
8667         break;
8668       case tune_params::AUTOPREFETCHER_WEAK:
8669         queue_depth = 0;
8670         break;
8671       case tune_params::AUTOPREFETCHER_STRONG:
8672         queue_depth = max_insn_queue_index + 1;
8673         break;
8674       default:
8675         gcc_unreachable ();
8676     }
8677
8678   /* We don't mind passing in global_options_set here as we don't use
8679      the *options_set structs anyway.  */
8680   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8681                          queue_depth,
8682                          opts->x_param_values,
8683                          global_options_set.x_param_values);
8684
8685   /* Set the L1 cache line size.  */
8686   if (selected_cpu->tune->cache_line_size != 0)
8687     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8688                            selected_cpu->tune->cache_line_size,
8689                            opts->x_param_values,
8690                            global_options_set.x_param_values);
8691
8692   aarch64_override_options_after_change_1 (opts);
8693 }
8694
8695 /* Print a hint with a suggestion for a core or architecture name that
8696    most closely resembles what the user passed in STR.  ARCH is true if
8697    the user is asking for an architecture name.  ARCH is false if the user
8698    is asking for a core name.  */
8699
8700 static void
8701 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8702 {
8703   auto_vec<const char *> candidates;
8704   const struct processor *entry = arch ? all_architectures : all_cores;
8705   for (; entry->name != NULL; entry++)
8706     candidates.safe_push (entry->name);
8707   char *s;
8708   const char *hint = candidates_list_and_hint (str, s, candidates);
8709   if (hint)
8710     inform (input_location, "valid arguments are: %s;"
8711                              " did you mean %qs?", s, hint);
8712   XDELETEVEC (s);
8713 }
8714
8715 /* Print a hint with a suggestion for a core name that most closely resembles
8716    what the user passed in STR.  */
8717
8718 inline static void
8719 aarch64_print_hint_for_core (const char *str)
8720 {
8721   aarch64_print_hint_for_core_or_arch (str, false);
8722 }
8723
8724 /* Print a hint with a suggestion for an architecture name that most closely
8725    resembles what the user passed in STR.  */
8726
8727 inline static void
8728 aarch64_print_hint_for_arch (const char *str)
8729 {
8730   aarch64_print_hint_for_core_or_arch (str, true);
8731 }
8732
8733 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
8734    specified in STR and throw errors if appropriate.  Put the results if
8735    they are valid in RES and ISA_FLAGS.  Return whether the option is
8736    valid.  */
8737
8738 static bool
8739 aarch64_validate_mcpu (const char *str, const struct processor **res,
8740                        unsigned long *isa_flags)
8741 {
8742   enum aarch64_parse_opt_result parse_res
8743     = aarch64_parse_cpu (str, res, isa_flags);
8744
8745   if (parse_res == AARCH64_PARSE_OK)
8746     return true;
8747
8748   switch (parse_res)
8749     {
8750       case AARCH64_PARSE_MISSING_ARG:
8751         error ("missing cpu name in -mcpu=%qs", str);
8752         break;
8753       case AARCH64_PARSE_INVALID_ARG:
8754         error ("unknown value %qs for -mcpu", str);
8755         aarch64_print_hint_for_core (str);
8756         break;
8757       case AARCH64_PARSE_INVALID_FEATURE:
8758         error ("invalid feature modifier in -mcpu=%qs", str);
8759         break;
8760       default:
8761         gcc_unreachable ();
8762     }
8763
8764   return false;
8765 }
8766
8767 /* Validate a command-line -march option.  Parse the arch and extensions
8768    (if any) specified in STR and throw errors if appropriate.  Put the
8769    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
8770    option is valid.  */
8771
8772 static bool
8773 aarch64_validate_march (const char *str, const struct processor **res,
8774                          unsigned long *isa_flags)
8775 {
8776   enum aarch64_parse_opt_result parse_res
8777     = aarch64_parse_arch (str, res, isa_flags);
8778
8779   if (parse_res == AARCH64_PARSE_OK)
8780     return true;
8781
8782   switch (parse_res)
8783     {
8784       case AARCH64_PARSE_MISSING_ARG:
8785         error ("missing arch name in -march=%qs", str);
8786         break;
8787       case AARCH64_PARSE_INVALID_ARG:
8788         error ("unknown value %qs for -march", str);
8789         aarch64_print_hint_for_arch (str);
8790         break;
8791       case AARCH64_PARSE_INVALID_FEATURE:
8792         error ("invalid feature modifier in -march=%qs", str);
8793         break;
8794       default:
8795         gcc_unreachable ();
8796     }
8797
8798   return false;
8799 }
8800
8801 /* Validate a command-line -mtune option.  Parse the cpu
8802    specified in STR and throw errors if appropriate.  Put the
8803    result, if it is valid, in RES.  Return whether the option is
8804    valid.  */
8805
8806 static bool
8807 aarch64_validate_mtune (const char *str, const struct processor **res)
8808 {
8809   enum aarch64_parse_opt_result parse_res
8810     = aarch64_parse_tune (str, res);
8811
8812   if (parse_res == AARCH64_PARSE_OK)
8813     return true;
8814
8815   switch (parse_res)
8816     {
8817       case AARCH64_PARSE_MISSING_ARG:
8818         error ("missing cpu name in -mtune=%qs", str);
8819         break;
8820       case AARCH64_PARSE_INVALID_ARG:
8821         error ("unknown value %qs for -mtune", str);
8822         aarch64_print_hint_for_core (str);
8823         break;
8824       default:
8825         gcc_unreachable ();
8826     }
8827   return false;
8828 }
8829
8830 /* Return the CPU corresponding to the enum CPU.
8831    If it doesn't specify a cpu, return the default.  */
8832
8833 static const struct processor *
8834 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8835 {
8836   if (cpu != aarch64_none)
8837     return &all_cores[cpu];
8838
8839   /* The & 0x3f is to extract the bottom 6 bits that encode the
8840      default cpu as selected by the --with-cpu GCC configure option
8841      in config.gcc.
8842      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8843      flags mechanism should be reworked to make it more sane.  */
8844   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8845 }
8846
8847 /* Return the architecture corresponding to the enum ARCH.
8848    If it doesn't specify a valid architecture, return the default.  */
8849
8850 static const struct processor *
8851 aarch64_get_arch (enum aarch64_arch arch)
8852 {
8853   if (arch != aarch64_no_arch)
8854     return &all_architectures[arch];
8855
8856   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8857
8858   return &all_architectures[cpu->arch];
8859 }
8860
8861 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
8862    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8863    tuning structs.  In particular it must set selected_tune and
8864    aarch64_isa_flags that define the available ISA features and tuning
8865    decisions.  It must also set selected_arch as this will be used to
8866    output the .arch asm tags for each function.  */
8867
8868 static void
8869 aarch64_override_options (void)
8870 {
8871   unsigned long cpu_isa = 0;
8872   unsigned long arch_isa = 0;
8873   aarch64_isa_flags = 0;
8874
8875   bool valid_cpu = true;
8876   bool valid_tune = true;
8877   bool valid_arch = true;
8878
8879   selected_cpu = NULL;
8880   selected_arch = NULL;
8881   selected_tune = NULL;
8882
8883   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8884      If either of -march or -mtune is given, they override their
8885      respective component of -mcpu.  */
8886   if (aarch64_cpu_string)
8887     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8888                                         &cpu_isa);
8889
8890   if (aarch64_arch_string)
8891     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8892                                           &arch_isa);
8893
8894   if (aarch64_tune_string)
8895     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8896
8897   /* If the user did not specify a processor, choose the default
8898      one for them.  This will be the CPU set during configuration using
8899      --with-cpu, otherwise it is "generic".  */
8900   if (!selected_cpu)
8901     {
8902       if (selected_arch)
8903         {
8904           selected_cpu = &all_cores[selected_arch->ident];
8905           aarch64_isa_flags = arch_isa;
8906           explicit_arch = selected_arch->arch;
8907         }
8908       else
8909         {
8910           /* Get default configure-time CPU.  */
8911           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8912           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8913         }
8914
8915       if (selected_tune)
8916         explicit_tune_core = selected_tune->ident;
8917     }
8918   /* If both -mcpu and -march are specified check that they are architecturally
8919      compatible, warn if they're not and prefer the -march ISA flags.  */
8920   else if (selected_arch)
8921     {
8922       if (selected_arch->arch != selected_cpu->arch)
8923         {
8924           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8925                        all_architectures[selected_cpu->arch].name,
8926                        selected_arch->name);
8927         }
8928       aarch64_isa_flags = arch_isa;
8929       explicit_arch = selected_arch->arch;
8930       explicit_tune_core = selected_tune ? selected_tune->ident
8931                                           : selected_cpu->ident;
8932     }
8933   else
8934     {
8935       /* -mcpu but no -march.  */
8936       aarch64_isa_flags = cpu_isa;
8937       explicit_tune_core = selected_tune ? selected_tune->ident
8938                                           : selected_cpu->ident;
8939       gcc_assert (selected_cpu);
8940       selected_arch = &all_architectures[selected_cpu->arch];
8941       explicit_arch = selected_arch->arch;
8942     }
8943
8944   /* Set the arch as well as we will need it when outputing
8945      the .arch directive in assembly.  */
8946   if (!selected_arch)
8947     {
8948       gcc_assert (selected_cpu);
8949       selected_arch = &all_architectures[selected_cpu->arch];
8950     }
8951
8952   if (!selected_tune)
8953     selected_tune = selected_cpu;
8954
8955 #ifndef HAVE_AS_MABI_OPTION
8956   /* The compiler may have been configured with 2.23.* binutils, which does
8957      not have support for ILP32.  */
8958   if (TARGET_ILP32)
8959     error ("Assembler does not support -mabi=ilp32");
8960 #endif
8961
8962   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
8963     sorry ("Return address signing is only supported for -mabi=lp64");
8964
8965   /* Make sure we properly set up the explicit options.  */
8966   if ((aarch64_cpu_string && valid_cpu)
8967        || (aarch64_tune_string && valid_tune))
8968     gcc_assert (explicit_tune_core != aarch64_none);
8969
8970   if ((aarch64_cpu_string && valid_cpu)
8971        || (aarch64_arch_string && valid_arch))
8972     gcc_assert (explicit_arch != aarch64_no_arch);
8973
8974   aarch64_override_options_internal (&global_options);
8975
8976   /* Save these options as the default ones in case we push and pop them later
8977      while processing functions with potential target attributes.  */
8978   target_option_default_node = target_option_current_node
8979       = build_target_option_node (&global_options);
8980 }
8981
8982 /* Implement targetm.override_options_after_change.  */
8983
8984 static void
8985 aarch64_override_options_after_change (void)
8986 {
8987   aarch64_override_options_after_change_1 (&global_options);
8988 }
8989
8990 static struct machine_function *
8991 aarch64_init_machine_status (void)
8992 {
8993   struct machine_function *machine;
8994   machine = ggc_cleared_alloc<machine_function> ();
8995   return machine;
8996 }
8997
8998 void
8999 aarch64_init_expanders (void)
9000 {
9001   init_machine_status = aarch64_init_machine_status;
9002 }
9003
9004 /* A checking mechanism for the implementation of the various code models.  */
9005 static void
9006 initialize_aarch64_code_model (struct gcc_options *opts)
9007 {
9008    if (opts->x_flag_pic)
9009      {
9010        switch (opts->x_aarch64_cmodel_var)
9011          {
9012          case AARCH64_CMODEL_TINY:
9013            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9014            break;
9015          case AARCH64_CMODEL_SMALL:
9016 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9017            aarch64_cmodel = (flag_pic == 2
9018                              ? AARCH64_CMODEL_SMALL_PIC
9019                              : AARCH64_CMODEL_SMALL_SPIC);
9020 #else
9021            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9022 #endif
9023            break;
9024          case AARCH64_CMODEL_LARGE:
9025            sorry ("code model %qs with -f%s", "large",
9026                   opts->x_flag_pic > 1 ? "PIC" : "pic");
9027            break;
9028          default:
9029            gcc_unreachable ();
9030          }
9031      }
9032    else
9033      aarch64_cmodel = opts->x_aarch64_cmodel_var;
9034 }
9035
9036 /* Implement TARGET_OPTION_SAVE.  */
9037
9038 static void
9039 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9040 {
9041   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9042 }
9043
9044 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
9045    using the information saved in PTR.  */
9046
9047 static void
9048 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9049 {
9050   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9051   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9052   opts->x_explicit_arch = ptr->x_explicit_arch;
9053   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9054   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9055
9056   aarch64_override_options_internal (opts);
9057 }
9058
9059 /* Implement TARGET_OPTION_PRINT.  */
9060
9061 static void
9062 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9063 {
9064   const struct processor *cpu
9065     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9066   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9067   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9068   std::string extension
9069     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9070
9071   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9072   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9073            arch->name, extension.c_str ());
9074 }
9075
9076 static GTY(()) tree aarch64_previous_fndecl;
9077
9078 void
9079 aarch64_reset_previous_fndecl (void)
9080 {
9081   aarch64_previous_fndecl = NULL;
9082 }
9083
9084 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9085    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9086    make sure optab availability predicates are recomputed when necessary.  */
9087
9088 void
9089 aarch64_save_restore_target_globals (tree new_tree)
9090 {
9091   if (TREE_TARGET_GLOBALS (new_tree))
9092     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9093   else if (new_tree == target_option_default_node)
9094     restore_target_globals (&default_target_globals);
9095   else
9096     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9097 }
9098
9099 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
9100    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9101    of the function, if such exists.  This function may be called multiple
9102    times on a single function so use aarch64_previous_fndecl to avoid
9103    setting up identical state.  */
9104
9105 static void
9106 aarch64_set_current_function (tree fndecl)
9107 {
9108   if (!fndecl || fndecl == aarch64_previous_fndecl)
9109     return;
9110
9111   tree old_tree = (aarch64_previous_fndecl
9112                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9113                    : NULL_TREE);
9114
9115   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9116
9117   /* If current function has no attributes but the previous one did,
9118      use the default node.  */
9119   if (!new_tree && old_tree)
9120     new_tree = target_option_default_node;
9121
9122   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
9123      the default have been handled by aarch64_save_restore_target_globals from
9124      aarch64_pragma_target_parse.  */
9125   if (old_tree == new_tree)
9126     return;
9127
9128   aarch64_previous_fndecl = fndecl;
9129
9130   /* First set the target options.  */
9131   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9132
9133   aarch64_save_restore_target_globals (new_tree);
9134 }
9135
9136 /* Enum describing the various ways we can handle attributes.
9137    In many cases we can reuse the generic option handling machinery.  */
9138
9139 enum aarch64_attr_opt_type
9140 {
9141   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
9142   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
9143   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
9144   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
9145 };
9146
9147 /* All the information needed to handle a target attribute.
9148    NAME is the name of the attribute.
9149    ATTR_TYPE specifies the type of behavior of the attribute as described
9150    in the definition of enum aarch64_attr_opt_type.
9151    ALLOW_NEG is true if the attribute supports a "no-" form.
9152    HANDLER is the function that takes the attribute string and whether
9153    it is a pragma or attribute and handles the option.  It is needed only
9154    when the ATTR_TYPE is aarch64_attr_custom.
9155    OPT_NUM is the enum specifying the option that the attribute modifies.
9156    This is needed for attributes that mirror the behavior of a command-line
9157    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9158    aarch64_attr_enum.  */
9159
9160 struct aarch64_attribute_info
9161 {
9162   const char *name;
9163   enum aarch64_attr_opt_type attr_type;
9164   bool allow_neg;
9165   bool (*handler) (const char *, const char *);
9166   enum opt_code opt_num;
9167 };
9168
9169 /* Handle the ARCH_STR argument to the arch= target attribute.
9170    PRAGMA_OR_ATTR is used in potential error messages.  */
9171
9172 static bool
9173 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9174 {
9175   const struct processor *tmp_arch = NULL;
9176   enum aarch64_parse_opt_result parse_res
9177     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9178
9179   if (parse_res == AARCH64_PARSE_OK)
9180     {
9181       gcc_assert (tmp_arch);
9182       selected_arch = tmp_arch;
9183       explicit_arch = selected_arch->arch;
9184       return true;
9185     }
9186
9187   switch (parse_res)
9188     {
9189       case AARCH64_PARSE_MISSING_ARG:
9190         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9191         break;
9192       case AARCH64_PARSE_INVALID_ARG:
9193         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9194         aarch64_print_hint_for_arch (str);
9195         break;
9196       case AARCH64_PARSE_INVALID_FEATURE:
9197         error ("invalid feature modifier %qs for 'arch' target %s",
9198                str, pragma_or_attr);
9199         break;
9200       default:
9201         gcc_unreachable ();
9202     }
9203
9204   return false;
9205 }
9206
9207 /* Handle the argument CPU_STR to the cpu= target attribute.
9208    PRAGMA_OR_ATTR is used in potential error messages.  */
9209
9210 static bool
9211 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9212 {
9213   const struct processor *tmp_cpu = NULL;
9214   enum aarch64_parse_opt_result parse_res
9215     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9216
9217   if (parse_res == AARCH64_PARSE_OK)
9218     {
9219       gcc_assert (tmp_cpu);
9220       selected_tune = tmp_cpu;
9221       explicit_tune_core = selected_tune->ident;
9222
9223       selected_arch = &all_architectures[tmp_cpu->arch];
9224       explicit_arch = selected_arch->arch;
9225       return true;
9226     }
9227
9228   switch (parse_res)
9229     {
9230       case AARCH64_PARSE_MISSING_ARG:
9231         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9232         break;
9233       case AARCH64_PARSE_INVALID_ARG:
9234         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9235         aarch64_print_hint_for_core (str);
9236         break;
9237       case AARCH64_PARSE_INVALID_FEATURE:
9238         error ("invalid feature modifier %qs for 'cpu' target %s",
9239                str, pragma_or_attr);
9240         break;
9241       default:
9242         gcc_unreachable ();
9243     }
9244
9245   return false;
9246 }
9247
9248 /* Handle the argument STR to the tune= target attribute.
9249    PRAGMA_OR_ATTR is used in potential error messages.  */
9250
9251 static bool
9252 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9253 {
9254   const struct processor *tmp_tune = NULL;
9255   enum aarch64_parse_opt_result parse_res
9256     = aarch64_parse_tune (str, &tmp_tune);
9257
9258   if (parse_res == AARCH64_PARSE_OK)
9259     {
9260       gcc_assert (tmp_tune);
9261       selected_tune = tmp_tune;
9262       explicit_tune_core = selected_tune->ident;
9263       return true;
9264     }
9265
9266   switch (parse_res)
9267     {
9268       case AARCH64_PARSE_INVALID_ARG:
9269         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9270         aarch64_print_hint_for_core (str);
9271         break;
9272       default:
9273         gcc_unreachable ();
9274     }
9275
9276   return false;
9277 }
9278
9279 /* Parse an architecture extensions target attribute string specified in STR.
9280    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
9281    if successful.  Update aarch64_isa_flags to reflect the ISA features
9282    modified.
9283    PRAGMA_OR_ATTR is used in potential error messages.  */
9284
9285 static bool
9286 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9287 {
9288   enum aarch64_parse_opt_result parse_res;
9289   unsigned long isa_flags = aarch64_isa_flags;
9290
9291   /* We allow "+nothing" in the beginning to clear out all architectural
9292      features if the user wants to handpick specific features.  */
9293   if (strncmp ("+nothing", str, 8) == 0)
9294     {
9295       isa_flags = 0;
9296       str += 8;
9297     }
9298
9299   parse_res = aarch64_parse_extension (str, &isa_flags);
9300
9301   if (parse_res == AARCH64_PARSE_OK)
9302     {
9303       aarch64_isa_flags = isa_flags;
9304       return true;
9305     }
9306
9307   switch (parse_res)
9308     {
9309       case AARCH64_PARSE_MISSING_ARG:
9310         error ("missing feature modifier in target %s %qs",
9311                pragma_or_attr, str);
9312         break;
9313
9314       case AARCH64_PARSE_INVALID_FEATURE:
9315         error ("invalid feature modifier in target %s %qs",
9316                pragma_or_attr, str);
9317         break;
9318
9319       default:
9320         gcc_unreachable ();
9321     }
9322
9323  return false;
9324 }
9325
9326 /* The target attributes that we support.  On top of these we also support just
9327    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
9328    handled explicitly in aarch64_process_one_target_attr.  */
9329
9330 static const struct aarch64_attribute_info aarch64_attributes[] =
9331 {
9332   { "general-regs-only", aarch64_attr_mask, false, NULL,
9333      OPT_mgeneral_regs_only },
9334   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9335      OPT_mfix_cortex_a53_835769 },
9336   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9337      OPT_mfix_cortex_a53_843419 },
9338   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9339   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9340   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9341      OPT_momit_leaf_frame_pointer },
9342   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9343   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9344      OPT_march_ },
9345   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9346   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9347      OPT_mtune_ },
9348   { "sign-return-address", aarch64_attr_enum, false, NULL,
9349      OPT_msign_return_address_ },
9350   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9351 };
9352
9353 /* Parse ARG_STR which contains the definition of one target attribute.
9354    Show appropriate errors if any or return true if the attribute is valid.
9355    PRAGMA_OR_ATTR holds the string to use in error messages about whether
9356    we're processing a target attribute or pragma.  */
9357
9358 static bool
9359 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9360 {
9361   bool invert = false;
9362
9363   size_t len = strlen (arg_str);
9364
9365   if (len == 0)
9366     {
9367       error ("malformed target %s", pragma_or_attr);
9368       return false;
9369     }
9370
9371   char *str_to_check = (char *) alloca (len + 1);
9372   strcpy (str_to_check, arg_str);
9373
9374   /* Skip leading whitespace.  */
9375   while (*str_to_check == ' ' || *str_to_check == '\t')
9376     str_to_check++;
9377
9378   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9379      It is easier to detect and handle it explicitly here rather than going
9380      through the machinery for the rest of the target attributes in this
9381      function.  */
9382   if (*str_to_check == '+')
9383     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9384
9385   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9386     {
9387       invert = true;
9388       str_to_check += 3;
9389     }
9390   char *arg = strchr (str_to_check, '=');
9391
9392   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9393      and point ARG to "foo".  */
9394   if (arg)
9395     {
9396       *arg = '\0';
9397       arg++;
9398     }
9399   const struct aarch64_attribute_info *p_attr;
9400   bool found = false;
9401   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9402     {
9403       /* If the names don't match up, or the user has given an argument
9404          to an attribute that doesn't accept one, or didn't give an argument
9405          to an attribute that expects one, fail to match.  */
9406       if (strcmp (str_to_check, p_attr->name) != 0)
9407         continue;
9408
9409       found = true;
9410       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9411                               || p_attr->attr_type == aarch64_attr_enum;
9412
9413       if (attr_need_arg_p ^ (arg != NULL))
9414         {
9415           error ("target %s %qs does not accept an argument",
9416                   pragma_or_attr, str_to_check);
9417           return false;
9418         }
9419
9420       /* If the name matches but the attribute does not allow "no-" versions
9421          then we can't match.  */
9422       if (invert && !p_attr->allow_neg)
9423         {
9424           error ("target %s %qs does not allow a negated form",
9425                   pragma_or_attr, str_to_check);
9426           return false;
9427         }
9428
9429       switch (p_attr->attr_type)
9430         {
9431         /* Has a custom handler registered.
9432            For example, cpu=, arch=, tune=.  */
9433           case aarch64_attr_custom:
9434             gcc_assert (p_attr->handler);
9435             if (!p_attr->handler (arg, pragma_or_attr))
9436               return false;
9437             break;
9438
9439           /* Either set or unset a boolean option.  */
9440           case aarch64_attr_bool:
9441             {
9442               struct cl_decoded_option decoded;
9443
9444               generate_option (p_attr->opt_num, NULL, !invert,
9445                                CL_TARGET, &decoded);
9446               aarch64_handle_option (&global_options, &global_options_set,
9447                                       &decoded, input_location);
9448               break;
9449             }
9450           /* Set or unset a bit in the target_flags.  aarch64_handle_option
9451              should know what mask to apply given the option number.  */
9452           case aarch64_attr_mask:
9453             {
9454               struct cl_decoded_option decoded;
9455               /* We only need to specify the option number.
9456                  aarch64_handle_option will know which mask to apply.  */
9457               decoded.opt_index = p_attr->opt_num;
9458               decoded.value = !invert;
9459               aarch64_handle_option (&global_options, &global_options_set,
9460                                       &decoded, input_location);
9461               break;
9462             }
9463           /* Use the option setting machinery to set an option to an enum.  */
9464           case aarch64_attr_enum:
9465             {
9466               gcc_assert (arg);
9467               bool valid;
9468               int value;
9469               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9470                                               &value, CL_TARGET);
9471               if (valid)
9472                 {
9473                   set_option (&global_options, NULL, p_attr->opt_num, value,
9474                               NULL, DK_UNSPECIFIED, input_location,
9475                               global_dc);
9476                 }
9477               else
9478                 {
9479                   error ("target %s %s=%s is not valid",
9480                          pragma_or_attr, str_to_check, arg);
9481                 }
9482               break;
9483             }
9484           default:
9485             gcc_unreachable ();
9486         }
9487     }
9488
9489   /* If we reached here we either have found an attribute and validated
9490      it or didn't match any.  If we matched an attribute but its arguments
9491      were malformed we will have returned false already.  */
9492   return found;
9493 }
9494
9495 /* Count how many times the character C appears in
9496    NULL-terminated string STR.  */
9497
9498 static unsigned int
9499 num_occurences_in_str (char c, char *str)
9500 {
9501   unsigned int res = 0;
9502   while (*str != '\0')
9503     {
9504       if (*str == c)
9505         res++;
9506
9507       str++;
9508     }
9509
9510   return res;
9511 }
9512
9513 /* Parse the tree in ARGS that contains the target attribute information
9514    and update the global target options space.  PRAGMA_OR_ATTR is a string
9515    to be used in error messages, specifying whether this is processing
9516    a target attribute or a target pragma.  */
9517
9518 bool
9519 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9520 {
9521   if (TREE_CODE (args) == TREE_LIST)
9522     {
9523       do
9524         {
9525           tree head = TREE_VALUE (args);
9526           if (head)
9527             {
9528               if (!aarch64_process_target_attr (head, pragma_or_attr))
9529                 return false;
9530             }
9531           args = TREE_CHAIN (args);
9532         } while (args);
9533
9534       return true;
9535     }
9536   /* We expect to find a string to parse.  */
9537   gcc_assert (TREE_CODE (args) == STRING_CST);
9538
9539   size_t len = strlen (TREE_STRING_POINTER (args));
9540   char *str_to_check = (char *) alloca (len + 1);
9541   strcpy (str_to_check, TREE_STRING_POINTER (args));
9542
9543   if (len == 0)
9544     {
9545       error ("malformed target %s value", pragma_or_attr);
9546       return false;
9547     }
9548
9549   /* Used to catch empty spaces between commas i.e.
9550      attribute ((target ("attr1,,attr2"))).  */
9551   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9552
9553   /* Handle multiple target attributes separated by ','.  */
9554   char *token = strtok (str_to_check, ",");
9555
9556   unsigned int num_attrs = 0;
9557   while (token)
9558     {
9559       num_attrs++;
9560       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9561         {
9562           error ("target %s %qs is invalid", pragma_or_attr, token);
9563           return false;
9564         }
9565
9566       token = strtok (NULL, ",");
9567     }
9568
9569   if (num_attrs != num_commas + 1)
9570     {
9571       error ("malformed target %s list %qs",
9572               pragma_or_attr, TREE_STRING_POINTER (args));
9573       return false;
9574     }
9575
9576   return true;
9577 }
9578
9579 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9580    process attribute ((target ("..."))).  */
9581
9582 static bool
9583 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9584 {
9585   struct cl_target_option cur_target;
9586   bool ret;
9587   tree old_optimize;
9588   tree new_target, new_optimize;
9589   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9590
9591   /* If what we're processing is the current pragma string then the
9592      target option node is already stored in target_option_current_node
9593      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
9594      having to re-parse the string.  This is especially useful to keep
9595      arm_neon.h compile times down since that header contains a lot
9596      of intrinsics enclosed in pragmas.  */
9597   if (!existing_target && args == current_target_pragma)
9598     {
9599       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9600       return true;
9601     }
9602   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9603
9604   old_optimize = build_optimization_node (&global_options);
9605   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9606
9607   /* If the function changed the optimization levels as well as setting
9608      target options, start with the optimizations specified.  */
9609   if (func_optimize && func_optimize != old_optimize)
9610     cl_optimization_restore (&global_options,
9611                              TREE_OPTIMIZATION (func_optimize));
9612
9613   /* Save the current target options to restore at the end.  */
9614   cl_target_option_save (&cur_target, &global_options);
9615
9616   /* If fndecl already has some target attributes applied to it, unpack
9617      them so that we add this attribute on top of them, rather than
9618      overwriting them.  */
9619   if (existing_target)
9620     {
9621       struct cl_target_option *existing_options
9622         = TREE_TARGET_OPTION (existing_target);
9623
9624       if (existing_options)
9625         cl_target_option_restore (&global_options, existing_options);
9626     }
9627   else
9628     cl_target_option_restore (&global_options,
9629                         TREE_TARGET_OPTION (target_option_current_node));
9630
9631
9632   ret = aarch64_process_target_attr (args, "attribute");
9633
9634   /* Set up any additional state.  */
9635   if (ret)
9636     {
9637       aarch64_override_options_internal (&global_options);
9638       /* Initialize SIMD builtins if we haven't already.
9639          Set current_target_pragma to NULL for the duration so that
9640          the builtin initialization code doesn't try to tag the functions
9641          being built with the attributes specified by any current pragma, thus
9642          going into an infinite recursion.  */
9643       if (TARGET_SIMD)
9644         {
9645           tree saved_current_target_pragma = current_target_pragma;
9646           current_target_pragma = NULL;
9647           aarch64_init_simd_builtins ();
9648           current_target_pragma = saved_current_target_pragma;
9649         }
9650       new_target = build_target_option_node (&global_options);
9651     }
9652   else
9653     new_target = NULL;
9654
9655   new_optimize = build_optimization_node (&global_options);
9656
9657   if (fndecl && ret)
9658     {
9659       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9660
9661       if (old_optimize != new_optimize)
9662         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9663     }
9664
9665   cl_target_option_restore (&global_options, &cur_target);
9666
9667   if (old_optimize != new_optimize)
9668     cl_optimization_restore (&global_options,
9669                              TREE_OPTIMIZATION (old_optimize));
9670   return ret;
9671 }
9672
9673 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
9674    tri-bool options (yes, no, don't care) and the default value is
9675    DEF, determine whether to reject inlining.  */
9676
9677 static bool
9678 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9679                                      int dont_care, int def)
9680 {
9681   /* If the callee doesn't care, always allow inlining.  */
9682   if (callee == dont_care)
9683     return true;
9684
9685   /* If the caller doesn't care, always allow inlining.  */
9686   if (caller == dont_care)
9687     return true;
9688
9689   /* Otherwise, allow inlining if either the callee and caller values
9690      agree, or if the callee is using the default value.  */
9691   return (callee == caller || callee == def);
9692 }
9693
9694 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
9695    to inline CALLEE into CALLER based on target-specific info.
9696    Make sure that the caller and callee have compatible architectural
9697    features.  Then go through the other possible target attributes
9698    and see if they can block inlining.  Try not to reject always_inline
9699    callees unless they are incompatible architecturally.  */
9700
9701 static bool
9702 aarch64_can_inline_p (tree caller, tree callee)
9703 {
9704   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9705   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9706
9707   /* If callee has no option attributes, then it is ok to inline.  */
9708   if (!callee_tree)
9709     return true;
9710
9711   struct cl_target_option *caller_opts
9712         = TREE_TARGET_OPTION (caller_tree ? caller_tree
9713                                            : target_option_default_node);
9714
9715   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9716
9717
9718   /* Callee's ISA flags should be a subset of the caller's.  */
9719   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9720        != callee_opts->x_aarch64_isa_flags)
9721     return false;
9722
9723   /* Allow non-strict aligned functions inlining into strict
9724      aligned ones.  */
9725   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9726        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9727       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9728            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9729     return false;
9730
9731   bool always_inline = lookup_attribute ("always_inline",
9732                                           DECL_ATTRIBUTES (callee));
9733
9734   /* If the architectural features match up and the callee is always_inline
9735      then the other attributes don't matter.  */
9736   if (always_inline)
9737     return true;
9738
9739   if (caller_opts->x_aarch64_cmodel_var
9740       != callee_opts->x_aarch64_cmodel_var)
9741     return false;
9742
9743   if (caller_opts->x_aarch64_tls_dialect
9744       != callee_opts->x_aarch64_tls_dialect)
9745     return false;
9746
9747   /* Honour explicit requests to workaround errata.  */
9748   if (!aarch64_tribools_ok_for_inlining_p (
9749           caller_opts->x_aarch64_fix_a53_err835769,
9750           callee_opts->x_aarch64_fix_a53_err835769,
9751           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9752     return false;
9753
9754   if (!aarch64_tribools_ok_for_inlining_p (
9755           caller_opts->x_aarch64_fix_a53_err843419,
9756           callee_opts->x_aarch64_fix_a53_err843419,
9757           2, TARGET_FIX_ERR_A53_843419))
9758     return false;
9759
9760   /* If the user explicitly specified -momit-leaf-frame-pointer for the
9761      caller and calle and they don't match up, reject inlining.  */
9762   if (!aarch64_tribools_ok_for_inlining_p (
9763           caller_opts->x_flag_omit_leaf_frame_pointer,
9764           callee_opts->x_flag_omit_leaf_frame_pointer,
9765           2, 1))
9766     return false;
9767
9768   /* If the callee has specific tuning overrides, respect them.  */
9769   if (callee_opts->x_aarch64_override_tune_string != NULL
9770       && caller_opts->x_aarch64_override_tune_string == NULL)
9771     return false;
9772
9773   /* If the user specified tuning override strings for the
9774      caller and callee and they don't match up, reject inlining.
9775      We just do a string compare here, we don't analyze the meaning
9776      of the string, as it would be too costly for little gain.  */
9777   if (callee_opts->x_aarch64_override_tune_string
9778       && caller_opts->x_aarch64_override_tune_string
9779       && (strcmp (callee_opts->x_aarch64_override_tune_string,
9780                   caller_opts->x_aarch64_override_tune_string) != 0))
9781     return false;
9782
9783   return true;
9784 }
9785
9786 /* Return true if SYMBOL_REF X binds locally.  */
9787
9788 static bool
9789 aarch64_symbol_binds_local_p (const_rtx x)
9790 {
9791   return (SYMBOL_REF_DECL (x)
9792           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9793           : SYMBOL_REF_LOCAL_P (x));
9794 }
9795
9796 /* Return true if SYMBOL_REF X is thread local */
9797 static bool
9798 aarch64_tls_symbol_p (rtx x)
9799 {
9800   if (! TARGET_HAVE_TLS)
9801     return false;
9802
9803   if (GET_CODE (x) != SYMBOL_REF)
9804     return false;
9805
9806   return SYMBOL_REF_TLS_MODEL (x) != 0;
9807 }
9808
9809 /* Classify a TLS symbol into one of the TLS kinds.  */
9810 enum aarch64_symbol_type
9811 aarch64_classify_tls_symbol (rtx x)
9812 {
9813   enum tls_model tls_kind = tls_symbolic_operand_type (x);
9814
9815   switch (tls_kind)
9816     {
9817     case TLS_MODEL_GLOBAL_DYNAMIC:
9818     case TLS_MODEL_LOCAL_DYNAMIC:
9819       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9820
9821     case TLS_MODEL_INITIAL_EXEC:
9822       switch (aarch64_cmodel)
9823         {
9824         case AARCH64_CMODEL_TINY:
9825         case AARCH64_CMODEL_TINY_PIC:
9826           return SYMBOL_TINY_TLSIE;
9827         default:
9828           return SYMBOL_SMALL_TLSIE;
9829         }
9830
9831     case TLS_MODEL_LOCAL_EXEC:
9832       if (aarch64_tls_size == 12)
9833         return SYMBOL_TLSLE12;
9834       else if (aarch64_tls_size == 24)
9835         return SYMBOL_TLSLE24;
9836       else if (aarch64_tls_size == 32)
9837         return SYMBOL_TLSLE32;
9838       else if (aarch64_tls_size == 48)
9839         return SYMBOL_TLSLE48;
9840       else
9841         gcc_unreachable ();
9842
9843     case TLS_MODEL_EMULATED:
9844     case TLS_MODEL_NONE:
9845       return SYMBOL_FORCE_TO_MEM;
9846
9847     default:
9848       gcc_unreachable ();
9849     }
9850 }
9851
9852 /* Return the method that should be used to access SYMBOL_REF or
9853    LABEL_REF X.  */
9854
9855 enum aarch64_symbol_type
9856 aarch64_classify_symbol (rtx x, rtx offset)
9857 {
9858   if (GET_CODE (x) == LABEL_REF)
9859     {
9860       switch (aarch64_cmodel)
9861         {
9862         case AARCH64_CMODEL_LARGE:
9863           return SYMBOL_FORCE_TO_MEM;
9864
9865         case AARCH64_CMODEL_TINY_PIC:
9866         case AARCH64_CMODEL_TINY:
9867           return SYMBOL_TINY_ABSOLUTE;
9868
9869         case AARCH64_CMODEL_SMALL_SPIC:
9870         case AARCH64_CMODEL_SMALL_PIC:
9871         case AARCH64_CMODEL_SMALL:
9872           return SYMBOL_SMALL_ABSOLUTE;
9873
9874         default:
9875           gcc_unreachable ();
9876         }
9877     }
9878
9879   if (GET_CODE (x) == SYMBOL_REF)
9880     {
9881       if (aarch64_tls_symbol_p (x))
9882         return aarch64_classify_tls_symbol (x);
9883
9884       switch (aarch64_cmodel)
9885         {
9886         case AARCH64_CMODEL_TINY:
9887           /* When we retrieve symbol + offset address, we have to make sure
9888              the offset does not cause overflow of the final address.  But
9889              we have no way of knowing the address of symbol at compile time
9890              so we can't accurately say if the distance between the PC and
9891              symbol + offset is outside the addressible range of +/-1M in the
9892              TINY code model.  So we rely on images not being greater than
9893              1M and cap the offset at 1M and anything beyond 1M will have to
9894              be loaded using an alternative mechanism.  Furthermore if the
9895              symbol is a weak reference to something that isn't known to
9896              resolve to a symbol in this module, then force to memory.  */
9897           if ((SYMBOL_REF_WEAK (x)
9898                && !aarch64_symbol_binds_local_p (x))
9899               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9900             return SYMBOL_FORCE_TO_MEM;
9901           return SYMBOL_TINY_ABSOLUTE;
9902
9903         case AARCH64_CMODEL_SMALL:
9904           /* Same reasoning as the tiny code model, but the offset cap here is
9905              4G.  */
9906           if ((SYMBOL_REF_WEAK (x)
9907                && !aarch64_symbol_binds_local_p (x))
9908               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9909                             HOST_WIDE_INT_C (4294967264)))
9910             return SYMBOL_FORCE_TO_MEM;
9911           return SYMBOL_SMALL_ABSOLUTE;
9912
9913         case AARCH64_CMODEL_TINY_PIC:
9914           if (!aarch64_symbol_binds_local_p (x))
9915             return SYMBOL_TINY_GOT;
9916           return SYMBOL_TINY_ABSOLUTE;
9917
9918         case AARCH64_CMODEL_SMALL_SPIC:
9919         case AARCH64_CMODEL_SMALL_PIC:
9920           if (!aarch64_symbol_binds_local_p (x))
9921             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9922                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9923           return SYMBOL_SMALL_ABSOLUTE;
9924
9925         case AARCH64_CMODEL_LARGE:
9926           /* This is alright even in PIC code as the constant
9927              pool reference is always PC relative and within
9928              the same translation unit.  */
9929           if (CONSTANT_POOL_ADDRESS_P (x))
9930             return SYMBOL_SMALL_ABSOLUTE;
9931           else
9932             return SYMBOL_FORCE_TO_MEM;
9933
9934         default:
9935           gcc_unreachable ();
9936         }
9937     }
9938
9939   /* By default push everything into the constant pool.  */
9940   return SYMBOL_FORCE_TO_MEM;
9941 }
9942
9943 bool
9944 aarch64_constant_address_p (rtx x)
9945 {
9946   return (CONSTANT_P (x) && memory_address_p (DImode, x));
9947 }
9948
9949 bool
9950 aarch64_legitimate_pic_operand_p (rtx x)
9951 {
9952   if (GET_CODE (x) == SYMBOL_REF
9953       || (GET_CODE (x) == CONST
9954           && GET_CODE (XEXP (x, 0)) == PLUS
9955           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9956      return false;
9957
9958   return true;
9959 }
9960
9961 /* Return true if X holds either a quarter-precision or
9962      floating-point +0.0 constant.  */
9963 static bool
9964 aarch64_valid_floating_const (machine_mode mode, rtx x)
9965 {
9966   if (!CONST_DOUBLE_P (x))
9967     return false;
9968
9969   if (aarch64_float_const_zero_rtx_p (x))
9970     return true;
9971
9972   /* We only handle moving 0.0 to a TFmode register.  */
9973   if (!(mode == SFmode || mode == DFmode))
9974     return false;
9975
9976   return aarch64_float_const_representable_p (x);
9977 }
9978
9979 static bool
9980 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9981 {
9982   /* Do not allow vector struct mode constants.  We could support
9983      0 and -1 easily, but they need support in aarch64-simd.md.  */
9984   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9985     return false;
9986
9987   /* This could probably go away because
9988      we now decompose CONST_INTs according to expand_mov_immediate.  */
9989   if ((GET_CODE (x) == CONST_VECTOR
9990        && aarch64_simd_valid_immediate (x, mode, false, NULL))
9991       || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9992         return !targetm.cannot_force_const_mem (mode, x);
9993
9994   if (GET_CODE (x) == HIGH
9995       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9996     return true;
9997
9998   return aarch64_constant_address_p (x);
9999 }
10000
10001 rtx
10002 aarch64_load_tp (rtx target)
10003 {
10004   if (!target
10005       || GET_MODE (target) != Pmode
10006       || !register_operand (target, Pmode))
10007     target = gen_reg_rtx (Pmode);
10008
10009   /* Can return in any reg.  */
10010   emit_insn (gen_aarch64_load_tp_hard (target));
10011   return target;
10012 }
10013
10014 /* On AAPCS systems, this is the "struct __va_list".  */
10015 static GTY(()) tree va_list_type;
10016
10017 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10018    Return the type to use as __builtin_va_list.
10019
10020    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10021
10022    struct __va_list
10023    {
10024      void *__stack;
10025      void *__gr_top;
10026      void *__vr_top;
10027      int   __gr_offs;
10028      int   __vr_offs;
10029    };  */
10030
10031 static tree
10032 aarch64_build_builtin_va_list (void)
10033 {
10034   tree va_list_name;
10035   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10036
10037   /* Create the type.  */
10038   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10039   /* Give it the required name.  */
10040   va_list_name = build_decl (BUILTINS_LOCATION,
10041                              TYPE_DECL,
10042                              get_identifier ("__va_list"),
10043                              va_list_type);
10044   DECL_ARTIFICIAL (va_list_name) = 1;
10045   TYPE_NAME (va_list_type) = va_list_name;
10046   TYPE_STUB_DECL (va_list_type) = va_list_name;
10047
10048   /* Create the fields.  */
10049   f_stack = build_decl (BUILTINS_LOCATION,
10050                         FIELD_DECL, get_identifier ("__stack"),
10051                         ptr_type_node);
10052   f_grtop = build_decl (BUILTINS_LOCATION,
10053                         FIELD_DECL, get_identifier ("__gr_top"),
10054                         ptr_type_node);
10055   f_vrtop = build_decl (BUILTINS_LOCATION,
10056                         FIELD_DECL, get_identifier ("__vr_top"),
10057                         ptr_type_node);
10058   f_groff = build_decl (BUILTINS_LOCATION,
10059                         FIELD_DECL, get_identifier ("__gr_offs"),
10060                         integer_type_node);
10061   f_vroff = build_decl (BUILTINS_LOCATION,
10062                         FIELD_DECL, get_identifier ("__vr_offs"),
10063                         integer_type_node);
10064
10065   /* Tell tree-stdarg pass about our internal offset fields.
10066      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10067      purpose to identify whether the code is updating va_list internal
10068      offset fields through irregular way.  */
10069   va_list_gpr_counter_field = f_groff;
10070   va_list_fpr_counter_field = f_vroff;
10071
10072   DECL_ARTIFICIAL (f_stack) = 1;
10073   DECL_ARTIFICIAL (f_grtop) = 1;
10074   DECL_ARTIFICIAL (f_vrtop) = 1;
10075   DECL_ARTIFICIAL (f_groff) = 1;
10076   DECL_ARTIFICIAL (f_vroff) = 1;
10077
10078   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10079   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10080   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10081   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10082   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10083
10084   TYPE_FIELDS (va_list_type) = f_stack;
10085   DECL_CHAIN (f_stack) = f_grtop;
10086   DECL_CHAIN (f_grtop) = f_vrtop;
10087   DECL_CHAIN (f_vrtop) = f_groff;
10088   DECL_CHAIN (f_groff) = f_vroff;
10089
10090   /* Compute its layout.  */
10091   layout_type (va_list_type);
10092
10093   return va_list_type;
10094 }
10095
10096 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
10097 static void
10098 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10099 {
10100   const CUMULATIVE_ARGS *cum;
10101   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10102   tree stack, grtop, vrtop, groff, vroff;
10103   tree t;
10104   int gr_save_area_size = cfun->va_list_gpr_size;
10105   int vr_save_area_size = cfun->va_list_fpr_size;
10106   int vr_offset;
10107
10108   cum = &crtl->args.info;
10109   if (cfun->va_list_gpr_size)
10110     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10111                              cfun->va_list_gpr_size);
10112   if (cfun->va_list_fpr_size)
10113     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10114                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
10115
10116   if (!TARGET_FLOAT)
10117     {
10118       gcc_assert (cum->aapcs_nvrn == 0);
10119       vr_save_area_size = 0;
10120     }
10121
10122   f_stack = TYPE_FIELDS (va_list_type_node);
10123   f_grtop = DECL_CHAIN (f_stack);
10124   f_vrtop = DECL_CHAIN (f_grtop);
10125   f_groff = DECL_CHAIN (f_vrtop);
10126   f_vroff = DECL_CHAIN (f_groff);
10127
10128   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10129                   NULL_TREE);
10130   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10131                   NULL_TREE);
10132   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10133                   NULL_TREE);
10134   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10135                   NULL_TREE);
10136   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10137                   NULL_TREE);
10138
10139   /* Emit code to initialize STACK, which points to the next varargs stack
10140      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
10141      by named arguments.  STACK is 8-byte aligned.  */
10142   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10143   if (cum->aapcs_stack_size > 0)
10144     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10145   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10146   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10147
10148   /* Emit code to initialize GRTOP, the top of the GR save area.
10149      virtual_incoming_args_rtx should have been 16 byte aligned.  */
10150   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10151   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10152   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10153
10154   /* Emit code to initialize VRTOP, the top of the VR save area.
10155      This address is gr_save_area_bytes below GRTOP, rounded
10156      down to the next 16-byte boundary.  */
10157   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10158   vr_offset = ROUND_UP (gr_save_area_size,
10159                         STACK_BOUNDARY / BITS_PER_UNIT);
10160
10161   if (vr_offset)
10162     t = fold_build_pointer_plus_hwi (t, -vr_offset);
10163   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10164   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10165
10166   /* Emit code to initialize GROFF, the offset from GRTOP of the
10167      next GPR argument.  */
10168   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10169               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10170   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10171
10172   /* Likewise emit code to initialize VROFF, the offset from FTOP
10173      of the next VR argument.  */
10174   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10175               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10176   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10177 }
10178
10179 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
10180
10181 static tree
10182 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10183                               gimple_seq *post_p ATTRIBUTE_UNUSED)
10184 {
10185   tree addr;
10186   bool indirect_p;
10187   bool is_ha;           /* is HFA or HVA.  */
10188   bool dw_align;        /* double-word align.  */
10189   machine_mode ag_mode = VOIDmode;
10190   int nregs;
10191   machine_mode mode;
10192
10193   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10194   tree stack, f_top, f_off, off, arg, roundup, on_stack;
10195   HOST_WIDE_INT size, rsize, adjust, align;
10196   tree t, u, cond1, cond2;
10197
10198   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10199   if (indirect_p)
10200     type = build_pointer_type (type);
10201
10202   mode = TYPE_MODE (type);
10203
10204   f_stack = TYPE_FIELDS (va_list_type_node);
10205   f_grtop = DECL_CHAIN (f_stack);
10206   f_vrtop = DECL_CHAIN (f_grtop);
10207   f_groff = DECL_CHAIN (f_vrtop);
10208   f_vroff = DECL_CHAIN (f_groff);
10209
10210   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10211                   f_stack, NULL_TREE);
10212   size = int_size_in_bytes (type);
10213   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10214
10215   dw_align = false;
10216   adjust = 0;
10217   if (aarch64_vfp_is_call_or_return_candidate (mode,
10218                                                type,
10219                                                &ag_mode,
10220                                                &nregs,
10221                                                &is_ha))
10222     {
10223       /* TYPE passed in fp/simd registers.  */
10224       if (!TARGET_FLOAT)
10225         aarch64_err_no_fpadvsimd (mode, "varargs");
10226
10227       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10228                       unshare_expr (valist), f_vrtop, NULL_TREE);
10229       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10230                       unshare_expr (valist), f_vroff, NULL_TREE);
10231
10232       rsize = nregs * UNITS_PER_VREG;
10233
10234       if (is_ha)
10235         {
10236           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10237             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10238         }
10239       else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10240                && size < UNITS_PER_VREG)
10241         {
10242           adjust = UNITS_PER_VREG - size;
10243         }
10244     }
10245   else
10246     {
10247       /* TYPE passed in general registers.  */
10248       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10249                       unshare_expr (valist), f_grtop, NULL_TREE);
10250       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10251                       unshare_expr (valist), f_groff, NULL_TREE);
10252       rsize = ROUND_UP (size, UNITS_PER_WORD);
10253       nregs = rsize / UNITS_PER_WORD;
10254
10255       if (align > 8)
10256         dw_align = true;
10257
10258       if (BLOCK_REG_PADDING (mode, type, 1) == downward
10259           && size < UNITS_PER_WORD)
10260         {
10261           adjust = UNITS_PER_WORD  - size;
10262         }
10263     }
10264
10265   /* Get a local temporary for the field value.  */
10266   off = get_initialized_tmp_var (f_off, pre_p, NULL);
10267
10268   /* Emit code to branch if off >= 0.  */
10269   t = build2 (GE_EXPR, boolean_type_node, off,
10270               build_int_cst (TREE_TYPE (off), 0));
10271   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10272
10273   if (dw_align)
10274     {
10275       /* Emit: offs = (offs + 15) & -16.  */
10276       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10277                   build_int_cst (TREE_TYPE (off), 15));
10278       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10279                   build_int_cst (TREE_TYPE (off), -16));
10280       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10281     }
10282   else
10283     roundup = NULL;
10284
10285   /* Update ap.__[g|v]r_offs  */
10286   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10287               build_int_cst (TREE_TYPE (off), rsize));
10288   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10289
10290   /* String up.  */
10291   if (roundup)
10292     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10293
10294   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
10295   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10296               build_int_cst (TREE_TYPE (f_off), 0));
10297   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10298
10299   /* String up: make sure the assignment happens before the use.  */
10300   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10301   COND_EXPR_ELSE (cond1) = t;
10302
10303   /* Prepare the trees handling the argument that is passed on the stack;
10304      the top level node will store in ON_STACK.  */
10305   arg = get_initialized_tmp_var (stack, pre_p, NULL);
10306   if (align > 8)
10307     {
10308       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
10309       t = fold_convert (intDI_type_node, arg);
10310       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10311                   build_int_cst (TREE_TYPE (t), 15));
10312       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10313                   build_int_cst (TREE_TYPE (t), -16));
10314       t = fold_convert (TREE_TYPE (arg), t);
10315       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10316     }
10317   else
10318     roundup = NULL;
10319   /* Advance ap.__stack  */
10320   t = fold_convert (intDI_type_node, arg);
10321   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10322               build_int_cst (TREE_TYPE (t), size + 7));
10323   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10324               build_int_cst (TREE_TYPE (t), -8));
10325   t = fold_convert (TREE_TYPE (arg), t);
10326   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10327   /* String up roundup and advance.  */
10328   if (roundup)
10329     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10330   /* String up with arg */
10331   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10332   /* Big-endianness related address adjustment.  */
10333   if (BLOCK_REG_PADDING (mode, type, 1) == downward
10334       && size < UNITS_PER_WORD)
10335   {
10336     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10337                 size_int (UNITS_PER_WORD - size));
10338     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10339   }
10340
10341   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10342   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10343
10344   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
10345   t = off;
10346   if (adjust)
10347     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10348                 build_int_cst (TREE_TYPE (off), adjust));
10349
10350   t = fold_convert (sizetype, t);
10351   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10352
10353   if (is_ha)
10354     {
10355       /* type ha; // treat as "struct {ftype field[n];}"
10356          ... [computing offs]
10357          for (i = 0; i <nregs; ++i, offs += 16)
10358            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10359          return ha;  */
10360       int i;
10361       tree tmp_ha, field_t, field_ptr_t;
10362
10363       /* Declare a local variable.  */
10364       tmp_ha = create_tmp_var_raw (type, "ha");
10365       gimple_add_tmp_var (tmp_ha);
10366
10367       /* Establish the base type.  */
10368       switch (ag_mode)
10369         {
10370         case SFmode:
10371           field_t = float_type_node;
10372           field_ptr_t = float_ptr_type_node;
10373           break;
10374         case DFmode:
10375           field_t = double_type_node;
10376           field_ptr_t = double_ptr_type_node;
10377           break;
10378         case TFmode:
10379           field_t = long_double_type_node;
10380           field_ptr_t = long_double_ptr_type_node;
10381           break;
10382         case HFmode:
10383           field_t = aarch64_fp16_type_node;
10384           field_ptr_t = aarch64_fp16_ptr_type_node;
10385           break;
10386         case V2SImode:
10387         case V4SImode:
10388             {
10389               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10390               field_t = build_vector_type_for_mode (innertype, ag_mode);
10391               field_ptr_t = build_pointer_type (field_t);
10392             }
10393           break;
10394         default:
10395           gcc_assert (0);
10396         }
10397
10398       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
10399       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10400       addr = t;
10401       t = fold_convert (field_ptr_t, addr);
10402       t = build2 (MODIFY_EXPR, field_t,
10403                   build1 (INDIRECT_REF, field_t, tmp_ha),
10404                   build1 (INDIRECT_REF, field_t, t));
10405
10406       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
10407       for (i = 1; i < nregs; ++i)
10408         {
10409           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10410           u = fold_convert (field_ptr_t, addr);
10411           u = build2 (MODIFY_EXPR, field_t,
10412                       build2 (MEM_REF, field_t, tmp_ha,
10413                               build_int_cst (field_ptr_t,
10414                                              (i *
10415                                               int_size_in_bytes (field_t)))),
10416                       build1 (INDIRECT_REF, field_t, u));
10417           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10418         }
10419
10420       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10421       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10422     }
10423
10424   COND_EXPR_ELSE (cond2) = t;
10425   addr = fold_convert (build_pointer_type (type), cond1);
10426   addr = build_va_arg_indirect_ref (addr);
10427
10428   if (indirect_p)
10429     addr = build_va_arg_indirect_ref (addr);
10430
10431   return addr;
10432 }
10433
10434 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
10435
10436 static void
10437 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10438                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10439                                 int no_rtl)
10440 {
10441   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10442   CUMULATIVE_ARGS local_cum;
10443   int gr_saved = cfun->va_list_gpr_size;
10444   int vr_saved = cfun->va_list_fpr_size;
10445
10446   /* The caller has advanced CUM up to, but not beyond, the last named
10447      argument.  Advance a local copy of CUM past the last "real" named
10448      argument, to find out how many registers are left over.  */
10449   local_cum = *cum;
10450   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10451
10452   /* Found out how many registers we need to save.
10453      Honor tree-stdvar analysis results.  */
10454   if (cfun->va_list_gpr_size)
10455     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10456                     cfun->va_list_gpr_size / UNITS_PER_WORD);
10457   if (cfun->va_list_fpr_size)
10458     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10459                     cfun->va_list_fpr_size / UNITS_PER_VREG);
10460
10461   if (!TARGET_FLOAT)
10462     {
10463       gcc_assert (local_cum.aapcs_nvrn == 0);
10464       vr_saved = 0;
10465     }
10466
10467   if (!no_rtl)
10468     {
10469       if (gr_saved > 0)
10470         {
10471           rtx ptr, mem;
10472
10473           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
10474           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10475                                - gr_saved * UNITS_PER_WORD);
10476           mem = gen_frame_mem (BLKmode, ptr);
10477           set_mem_alias_set (mem, get_varargs_alias_set ());
10478
10479           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10480                                mem, gr_saved);
10481         }
10482       if (vr_saved > 0)
10483         {
10484           /* We can't use move_block_from_reg, because it will use
10485              the wrong mode, storing D regs only.  */
10486           machine_mode mode = TImode;
10487           int off, i, vr_start;
10488
10489           /* Set OFF to the offset from virtual_incoming_args_rtx of
10490              the first vector register.  The VR save area lies below
10491              the GR one, and is aligned to 16 bytes.  */
10492           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10493                            STACK_BOUNDARY / BITS_PER_UNIT);
10494           off -= vr_saved * UNITS_PER_VREG;
10495
10496           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10497           for (i = 0; i < vr_saved; ++i)
10498             {
10499               rtx ptr, mem;
10500
10501               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10502               mem = gen_frame_mem (mode, ptr);
10503               set_mem_alias_set (mem, get_varargs_alias_set ());
10504               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10505               off += UNITS_PER_VREG;
10506             }
10507         }
10508     }
10509
10510   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10511      any complication of having crtl->args.pretend_args_size changed.  */
10512   cfun->machine->frame.saved_varargs_size
10513     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10514                  STACK_BOUNDARY / BITS_PER_UNIT)
10515        + vr_saved * UNITS_PER_VREG);
10516 }
10517
10518 static void
10519 aarch64_conditional_register_usage (void)
10520 {
10521   int i;
10522   if (!TARGET_FLOAT)
10523     {
10524       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10525         {
10526           fixed_regs[i] = 1;
10527           call_used_regs[i] = 1;
10528         }
10529     }
10530 }
10531
10532 /* Walk down the type tree of TYPE counting consecutive base elements.
10533    If *MODEP is VOIDmode, then set it to the first valid floating point
10534    type.  If a non-floating point type is found, or if a floating point
10535    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10536    otherwise return the count in the sub-tree.  */
10537 static int
10538 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10539 {
10540   machine_mode mode;
10541   HOST_WIDE_INT size;
10542
10543   switch (TREE_CODE (type))
10544     {
10545     case REAL_TYPE:
10546       mode = TYPE_MODE (type);
10547       if (mode != DFmode && mode != SFmode
10548           && mode != TFmode && mode != HFmode)
10549         return -1;
10550
10551       if (*modep == VOIDmode)
10552         *modep = mode;
10553
10554       if (*modep == mode)
10555         return 1;
10556
10557       break;
10558
10559     case COMPLEX_TYPE:
10560       mode = TYPE_MODE (TREE_TYPE (type));
10561       if (mode != DFmode && mode != SFmode
10562           && mode != TFmode && mode != HFmode)
10563         return -1;
10564
10565       if (*modep == VOIDmode)
10566         *modep = mode;
10567
10568       if (*modep == mode)
10569         return 2;
10570
10571       break;
10572
10573     case VECTOR_TYPE:
10574       /* Use V2SImode and V4SImode as representatives of all 64-bit
10575          and 128-bit vector types.  */
10576       size = int_size_in_bytes (type);
10577       switch (size)
10578         {
10579         case 8:
10580           mode = V2SImode;
10581           break;
10582         case 16:
10583           mode = V4SImode;
10584           break;
10585         default:
10586           return -1;
10587         }
10588
10589       if (*modep == VOIDmode)
10590         *modep = mode;
10591
10592       /* Vector modes are considered to be opaque: two vectors are
10593          equivalent for the purposes of being homogeneous aggregates
10594          if they are the same size.  */
10595       if (*modep == mode)
10596         return 1;
10597
10598       break;
10599
10600     case ARRAY_TYPE:
10601       {
10602         int count;
10603         tree index = TYPE_DOMAIN (type);
10604
10605         /* Can't handle incomplete types nor sizes that are not
10606            fixed.  */
10607         if (!COMPLETE_TYPE_P (type)
10608             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10609           return -1;
10610
10611         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10612         if (count == -1
10613             || !index
10614             || !TYPE_MAX_VALUE (index)
10615             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10616             || !TYPE_MIN_VALUE (index)
10617             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10618             || count < 0)
10619           return -1;
10620
10621         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10622                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10623
10624         /* There must be no padding.  */
10625         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10626           return -1;
10627
10628         return count;
10629       }
10630
10631     case RECORD_TYPE:
10632       {
10633         int count = 0;
10634         int sub_count;
10635         tree field;
10636
10637         /* Can't handle incomplete types nor sizes that are not
10638            fixed.  */
10639         if (!COMPLETE_TYPE_P (type)
10640             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10641           return -1;
10642
10643         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10644           {
10645             if (TREE_CODE (field) != FIELD_DECL)
10646               continue;
10647
10648             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10649             if (sub_count < 0)
10650               return -1;
10651             count += sub_count;
10652           }
10653
10654         /* There must be no padding.  */
10655         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10656           return -1;
10657
10658         return count;
10659       }
10660
10661     case UNION_TYPE:
10662     case QUAL_UNION_TYPE:
10663       {
10664         /* These aren't very interesting except in a degenerate case.  */
10665         int count = 0;
10666         int sub_count;
10667         tree field;
10668
10669         /* Can't handle incomplete types nor sizes that are not
10670            fixed.  */
10671         if (!COMPLETE_TYPE_P (type)
10672             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10673           return -1;
10674
10675         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10676           {
10677             if (TREE_CODE (field) != FIELD_DECL)
10678               continue;
10679
10680             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10681             if (sub_count < 0)
10682               return -1;
10683             count = count > sub_count ? count : sub_count;
10684           }
10685
10686         /* There must be no padding.  */
10687         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10688           return -1;
10689
10690         return count;
10691       }
10692
10693     default:
10694       break;
10695     }
10696
10697   return -1;
10698 }
10699
10700 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10701    type as described in AAPCS64 \S 4.1.2.
10702
10703    See the comment above aarch64_composite_type_p for the notes on MODE.  */
10704
10705 static bool
10706 aarch64_short_vector_p (const_tree type,
10707                         machine_mode mode)
10708 {
10709   HOST_WIDE_INT size = -1;
10710
10711   if (type && TREE_CODE (type) == VECTOR_TYPE)
10712     size = int_size_in_bytes (type);
10713   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10714             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10715     size = GET_MODE_SIZE (mode);
10716
10717   return (size == 8 || size == 16);
10718 }
10719
10720 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10721    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
10722    array types.  The C99 floating-point complex types are also considered
10723    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
10724    types, which are GCC extensions and out of the scope of AAPCS64, are
10725    treated as composite types here as well.
10726
10727    Note that MODE itself is not sufficient in determining whether a type
10728    is such a composite type or not.  This is because
10729    stor-layout.c:compute_record_mode may have already changed the MODE
10730    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
10731    structure with only one field may have its MODE set to the mode of the
10732    field.  Also an integer mode whose size matches the size of the
10733    RECORD_TYPE type may be used to substitute the original mode
10734    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
10735    solely relied on.  */
10736
10737 static bool
10738 aarch64_composite_type_p (const_tree type,
10739                           machine_mode mode)
10740 {
10741   if (aarch64_short_vector_p (type, mode))
10742     return false;
10743
10744   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10745     return true;
10746
10747   if (mode == BLKmode
10748       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10749       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10750     return true;
10751
10752   return false;
10753 }
10754
10755 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10756    shall be passed or returned in simd/fp register(s) (providing these
10757    parameter passing registers are available).
10758
10759    Upon successful return, *COUNT returns the number of needed registers,
10760    *BASE_MODE returns the mode of the individual register and when IS_HAF
10761    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10762    floating-point aggregate or a homogeneous short-vector aggregate.  */
10763
10764 static bool
10765 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10766                                          const_tree type,
10767                                          machine_mode *base_mode,
10768                                          int *count,
10769                                          bool *is_ha)
10770 {
10771   machine_mode new_mode = VOIDmode;
10772   bool composite_p = aarch64_composite_type_p (type, mode);
10773
10774   if (is_ha != NULL) *is_ha = false;
10775
10776   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10777       || aarch64_short_vector_p (type, mode))
10778     {
10779       *count = 1;
10780       new_mode = mode;
10781     }
10782   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10783     {
10784       if (is_ha != NULL) *is_ha = true;
10785       *count = 2;
10786       new_mode = GET_MODE_INNER (mode);
10787     }
10788   else if (type && composite_p)
10789     {
10790       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10791
10792       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10793         {
10794           if (is_ha != NULL) *is_ha = true;
10795           *count = ag_count;
10796         }
10797       else
10798         return false;
10799     }
10800   else
10801     return false;
10802
10803   *base_mode = new_mode;
10804   return true;
10805 }
10806
10807 /* Implement TARGET_STRUCT_VALUE_RTX.  */
10808
10809 static rtx
10810 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10811                           int incoming ATTRIBUTE_UNUSED)
10812 {
10813   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10814 }
10815
10816 /* Implements target hook vector_mode_supported_p.  */
10817 static bool
10818 aarch64_vector_mode_supported_p (machine_mode mode)
10819 {
10820   if (TARGET_SIMD
10821       && (mode == V4SImode  || mode == V8HImode
10822           || mode == V16QImode || mode == V2DImode
10823           || mode == V2SImode  || mode == V4HImode
10824           || mode == V8QImode || mode == V2SFmode
10825           || mode == V4SFmode || mode == V2DFmode
10826           || mode == V4HFmode || mode == V8HFmode
10827           || mode == V1DFmode))
10828     return true;
10829
10830   return false;
10831 }
10832
10833 /* Return appropriate SIMD container
10834    for MODE within a vector of WIDTH bits.  */
10835 static machine_mode
10836 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10837 {
10838   gcc_assert (width == 64 || width == 128);
10839   if (TARGET_SIMD)
10840     {
10841       if (width == 128)
10842         switch (mode)
10843           {
10844           case DFmode:
10845             return V2DFmode;
10846           case SFmode:
10847             return V4SFmode;
10848           case SImode:
10849             return V4SImode;
10850           case HImode:
10851             return V8HImode;
10852           case QImode:
10853             return V16QImode;
10854           case DImode:
10855             return V2DImode;
10856           default:
10857             break;
10858           }
10859       else
10860         switch (mode)
10861           {
10862           case SFmode:
10863             return V2SFmode;
10864           case SImode:
10865             return V2SImode;
10866           case HImode:
10867             return V4HImode;
10868           case QImode:
10869             return V8QImode;
10870           default:
10871             break;
10872           }
10873     }
10874   return word_mode;
10875 }
10876
10877 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
10878 static machine_mode
10879 aarch64_preferred_simd_mode (machine_mode mode)
10880 {
10881   return aarch64_simd_container_mode (mode, 128);
10882 }
10883
10884 /* Return the bitmask of possible vector sizes for the vectorizer
10885    to iterate over.  */
10886 static unsigned int
10887 aarch64_autovectorize_vector_sizes (void)
10888 {
10889   return (16 | 8);
10890 }
10891
10892 /* Implement TARGET_MANGLE_TYPE.  */
10893
10894 static const char *
10895 aarch64_mangle_type (const_tree type)
10896 {
10897   /* The AArch64 ABI documents say that "__va_list" has to be
10898      managled as if it is in the "std" namespace.  */
10899   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10900     return "St9__va_list";
10901
10902   /* Half-precision float.  */
10903   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10904     return "Dh";
10905
10906   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
10907      builtin types.  */
10908   if (TYPE_NAME (type) != NULL)
10909     return aarch64_mangle_builtin_type (type);
10910
10911   /* Use the default mangling.  */
10912   return NULL;
10913 }
10914
10915 /* Find the first rtx_insn before insn that will generate an assembly
10916    instruction.  */
10917
10918 static rtx_insn *
10919 aarch64_prev_real_insn (rtx_insn *insn)
10920 {
10921   if (!insn)
10922     return NULL;
10923
10924   do
10925     {
10926       insn = prev_real_insn (insn);
10927     }
10928   while (insn && recog_memoized (insn) < 0);
10929
10930   return insn;
10931 }
10932
10933 static bool
10934 is_madd_op (enum attr_type t1)
10935 {
10936   unsigned int i;
10937   /* A number of these may be AArch32 only.  */
10938   enum attr_type mlatypes[] = {
10939     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10940     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10941     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10942   };
10943
10944   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10945     {
10946       if (t1 == mlatypes[i])
10947         return true;
10948     }
10949
10950   return false;
10951 }
10952
10953 /* Check if there is a register dependency between a load and the insn
10954    for which we hold recog_data.  */
10955
10956 static bool
10957 dep_between_memop_and_curr (rtx memop)
10958 {
10959   rtx load_reg;
10960   int opno;
10961
10962   gcc_assert (GET_CODE (memop) == SET);
10963
10964   if (!REG_P (SET_DEST (memop)))
10965     return false;
10966
10967   load_reg = SET_DEST (memop);
10968   for (opno = 1; opno < recog_data.n_operands; opno++)
10969     {
10970       rtx operand = recog_data.operand[opno];
10971       if (REG_P (operand)
10972           && reg_overlap_mentioned_p (load_reg, operand))
10973         return true;
10974
10975     }
10976   return false;
10977 }
10978
10979
10980 /* When working around the Cortex-A53 erratum 835769,
10981    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10982    instruction and has a preceding memory instruction such that a NOP
10983    should be inserted between them.  */
10984
10985 bool
10986 aarch64_madd_needs_nop (rtx_insn* insn)
10987 {
10988   enum attr_type attr_type;
10989   rtx_insn *prev;
10990   rtx body;
10991
10992   if (!TARGET_FIX_ERR_A53_835769)
10993     return false;
10994
10995   if (!INSN_P (insn) || recog_memoized (insn) < 0)
10996     return false;
10997
10998   attr_type = get_attr_type (insn);
10999   if (!is_madd_op (attr_type))
11000     return false;
11001
11002   prev = aarch64_prev_real_insn (insn);
11003   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11004      Restore recog state to INSN to avoid state corruption.  */
11005   extract_constrain_insn_cached (insn);
11006
11007   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11008     return false;
11009
11010   body = single_set (prev);
11011
11012   /* If the previous insn is a memory op and there is no dependency between
11013      it and the DImode madd, emit a NOP between them.  If body is NULL then we
11014      have a complex memory operation, probably a load/store pair.
11015      Be conservative for now and emit a NOP.  */
11016   if (GET_MODE (recog_data.operand[0]) == DImode
11017       && (!body || !dep_between_memop_and_curr (body)))
11018     return true;
11019
11020   return false;
11021
11022 }
11023
11024
11025 /* Implement FINAL_PRESCAN_INSN.  */
11026
11027 void
11028 aarch64_final_prescan_insn (rtx_insn *insn)
11029 {
11030   if (aarch64_madd_needs_nop (insn))
11031     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11032 }
11033
11034
11035 /* Return the equivalent letter for size.  */
11036 static char
11037 sizetochar (int size)
11038 {
11039   switch (size)
11040     {
11041     case 64: return 'd';
11042     case 32: return 's';
11043     case 16: return 'h';
11044     case 8 : return 'b';
11045     default: gcc_unreachable ();
11046     }
11047 }
11048
11049 /* Return true iff x is a uniform vector of floating-point
11050    constants, and the constant can be represented in
11051    quarter-precision form.  Note, as aarch64_float_const_representable
11052    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11053 static bool
11054 aarch64_vect_float_const_representable_p (rtx x)
11055 {
11056   rtx elt;
11057   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11058           && const_vec_duplicate_p (x, &elt)
11059           && aarch64_float_const_representable_p (elt));
11060 }
11061
11062 /* Return true for valid and false for invalid.  */
11063 bool
11064 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11065                               struct simd_immediate_info *info)
11066 {
11067 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
11068   matches = 1;                                          \
11069   for (i = 0; i < idx; i += (STRIDE))                   \
11070     if (!(TEST))                                        \
11071       matches = 0;                                      \
11072   if (matches)                                          \
11073     {                                                   \
11074       immtype = (CLASS);                                \
11075       elsize = (ELSIZE);                                \
11076       eshift = (SHIFT);                                 \
11077       emvn = (NEG);                                     \
11078       break;                                            \
11079     }
11080
11081   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11082   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11083   unsigned char bytes[16];
11084   int immtype = -1, matches;
11085   unsigned int invmask = inverse ? 0xff : 0;
11086   int eshift, emvn;
11087
11088   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11089     {
11090       if (! (aarch64_simd_imm_zero_p (op, mode)
11091              || aarch64_vect_float_const_representable_p (op)))
11092         return false;
11093
11094       if (info)
11095         {
11096           info->value = CONST_VECTOR_ELT (op, 0);
11097           info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11098           info->mvn = false;
11099           info->shift = 0;
11100         }
11101
11102       return true;
11103     }
11104
11105   /* Splat vector constant out into a byte vector.  */
11106   for (i = 0; i < n_elts; i++)
11107     {
11108       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
11109          it must be laid out in the vector register in reverse order.  */
11110       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11111       unsigned HOST_WIDE_INT elpart;
11112
11113       gcc_assert (CONST_INT_P (el));
11114       elpart = INTVAL (el);
11115
11116       for (unsigned int byte = 0; byte < innersize; byte++)
11117         {
11118           bytes[idx++] = (elpart & 0xff) ^ invmask;
11119           elpart >>= BITS_PER_UNIT;
11120         }
11121
11122     }
11123
11124   /* Sanity check.  */
11125   gcc_assert (idx == GET_MODE_SIZE (mode));
11126
11127   do
11128     {
11129       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11130              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11131
11132       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11133              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11134
11135       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11136              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11137
11138       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11139              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11140
11141       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11142
11143       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11144
11145       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11146              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11147
11148       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11149              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11150
11151       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11152              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11153
11154       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11155              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11156
11157       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11158
11159       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11160
11161       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11162              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11163
11164       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11165              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11166
11167       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11168              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11169
11170       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11171              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11172
11173       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11174
11175       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11176              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11177     }
11178   while (0);
11179
11180   if (immtype == -1)
11181     return false;
11182
11183   if (info)
11184     {
11185       info->element_width = elsize;
11186       info->mvn = emvn != 0;
11187       info->shift = eshift;
11188
11189       unsigned HOST_WIDE_INT imm = 0;
11190
11191       if (immtype >= 12 && immtype <= 15)
11192         info->msl = true;
11193
11194       /* Un-invert bytes of recognized vector, if necessary.  */
11195       if (invmask != 0)
11196         for (i = 0; i < idx; i++)
11197           bytes[i] ^= invmask;
11198
11199       if (immtype == 17)
11200         {
11201           /* FIXME: Broken on 32-bit H_W_I hosts.  */
11202           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11203
11204           for (i = 0; i < 8; i++)
11205             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11206               << (i * BITS_PER_UNIT);
11207
11208
11209           info->value = GEN_INT (imm);
11210         }
11211       else
11212         {
11213           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11214             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11215
11216           /* Construct 'abcdefgh' because the assembler cannot handle
11217              generic constants.  */
11218           if (info->mvn)
11219             imm = ~imm;
11220           imm = (imm >> info->shift) & 0xff;
11221           info->value = GEN_INT (imm);
11222         }
11223     }
11224
11225   return true;
11226 #undef CHECK
11227 }
11228
11229 /* Check of immediate shift constants are within range.  */
11230 bool
11231 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11232 {
11233   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11234   if (left)
11235     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11236   else
11237     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11238 }
11239
11240 /* Return true if X is a uniform vector where all elements
11241    are either the floating-point constant 0.0 or the
11242    integer constant 0.  */
11243 bool
11244 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11245 {
11246   return x == CONST0_RTX (mode);
11247 }
11248
11249
11250 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11251    operation of width WIDTH at bit position POS.  */
11252
11253 rtx
11254 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11255 {
11256   gcc_assert (CONST_INT_P (width));
11257   gcc_assert (CONST_INT_P (pos));
11258
11259   unsigned HOST_WIDE_INT mask
11260     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11261   return GEN_INT (mask << UINTVAL (pos));
11262 }
11263
11264 bool
11265 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11266 {
11267   HOST_WIDE_INT imm = INTVAL (x);
11268   int i;
11269
11270   for (i = 0; i < 8; i++)
11271     {
11272       unsigned int byte = imm & 0xff;
11273       if (byte != 0xff && byte != 0)
11274        return false;
11275       imm >>= 8;
11276     }
11277
11278   return true;
11279 }
11280
11281 bool
11282 aarch64_mov_operand_p (rtx x, machine_mode mode)
11283 {
11284   if (GET_CODE (x) == HIGH
11285       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11286     return true;
11287
11288   if (CONST_INT_P (x))
11289     return true;
11290
11291   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11292     return true;
11293
11294   return aarch64_classify_symbolic_expression (x)
11295     == SYMBOL_TINY_ABSOLUTE;
11296 }
11297
11298 /* Return a const_int vector of VAL.  */
11299 rtx
11300 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11301 {
11302   int nunits = GET_MODE_NUNITS (mode);
11303   rtvec v = rtvec_alloc (nunits);
11304   int i;
11305
11306   rtx cache = GEN_INT (val);
11307
11308   for (i=0; i < nunits; i++)
11309     RTVEC_ELT (v, i) = cache;
11310
11311   return gen_rtx_CONST_VECTOR (mode, v);
11312 }
11313
11314 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
11315
11316 bool
11317 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11318 {
11319   machine_mode vmode;
11320
11321   gcc_assert (!VECTOR_MODE_P (mode));
11322   vmode = aarch64_preferred_simd_mode (mode);
11323   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11324   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11325 }
11326
11327 /* Construct and return a PARALLEL RTX vector with elements numbering the
11328    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11329    the vector - from the perspective of the architecture.  This does not
11330    line up with GCC's perspective on lane numbers, so we end up with
11331    different masks depending on our target endian-ness.  The diagram
11332    below may help.  We must draw the distinction when building masks
11333    which select one half of the vector.  An instruction selecting
11334    architectural low-lanes for a big-endian target, must be described using
11335    a mask selecting GCC high-lanes.
11336
11337                  Big-Endian             Little-Endian
11338
11339 GCC             0   1   2   3           3   2   1   0
11340               | x | x | x | x |       | x | x | x | x |
11341 Architecture    3   2   1   0           3   2   1   0
11342
11343 Low Mask:         { 2, 3 }                { 0, 1 }
11344 High Mask:        { 0, 1 }                { 2, 3 }
11345 */
11346
11347 rtx
11348 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11349 {
11350   int nunits = GET_MODE_NUNITS (mode);
11351   rtvec v = rtvec_alloc (nunits / 2);
11352   int high_base = nunits / 2;
11353   int low_base = 0;
11354   int base;
11355   rtx t1;
11356   int i;
11357
11358   if (BYTES_BIG_ENDIAN)
11359     base = high ? low_base : high_base;
11360   else
11361     base = high ? high_base : low_base;
11362
11363   for (i = 0; i < nunits / 2; i++)
11364     RTVEC_ELT (v, i) = GEN_INT (base + i);
11365
11366   t1 = gen_rtx_PARALLEL (mode, v);
11367   return t1;
11368 }
11369
11370 /* Check OP for validity as a PARALLEL RTX vector with elements
11371    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11372    from the perspective of the architecture.  See the diagram above
11373    aarch64_simd_vect_par_cnst_half for more details.  */
11374
11375 bool
11376 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11377                                        bool high)
11378 {
11379   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11380   HOST_WIDE_INT count_op = XVECLEN (op, 0);
11381   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11382   int i = 0;
11383
11384   if (!VECTOR_MODE_P (mode))
11385     return false;
11386
11387   if (count_op != count_ideal)
11388     return false;
11389
11390   for (i = 0; i < count_ideal; i++)
11391     {
11392       rtx elt_op = XVECEXP (op, 0, i);
11393       rtx elt_ideal = XVECEXP (ideal, 0, i);
11394
11395       if (!CONST_INT_P (elt_op)
11396           || INTVAL (elt_ideal) != INTVAL (elt_op))
11397         return false;
11398     }
11399   return true;
11400 }
11401
11402 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
11403    HIGH (exclusive).  */
11404 void
11405 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11406                           const_tree exp)
11407 {
11408   HOST_WIDE_INT lane;
11409   gcc_assert (CONST_INT_P (operand));
11410   lane = INTVAL (operand);
11411
11412   if (lane < low || lane >= high)
11413   {
11414     if (exp)
11415       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11416     else
11417       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11418   }
11419 }
11420
11421 /* Return TRUE if OP is a valid vector addressing mode.  */
11422 bool
11423 aarch64_simd_mem_operand_p (rtx op)
11424 {
11425   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11426                         || REG_P (XEXP (op, 0)));
11427 }
11428
11429 /* Emit a register copy from operand to operand, taking care not to
11430    early-clobber source registers in the process.
11431
11432    COUNT is the number of components into which the copy needs to be
11433    decomposed.  */
11434 void
11435 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11436                                 unsigned int count)
11437 {
11438   unsigned int i;
11439   int rdest = REGNO (operands[0]);
11440   int rsrc = REGNO (operands[1]);
11441
11442   if (!reg_overlap_mentioned_p (operands[0], operands[1])
11443       || rdest < rsrc)
11444     for (i = 0; i < count; i++)
11445       emit_move_insn (gen_rtx_REG (mode, rdest + i),
11446                       gen_rtx_REG (mode, rsrc + i));
11447   else
11448     for (i = 0; i < count; i++)
11449       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11450                       gen_rtx_REG (mode, rsrc + count - i - 1));
11451 }
11452
11453 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11454    one of VSTRUCT modes: OI, CI, or XI.  */
11455 int
11456 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11457 {
11458   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11459 }
11460
11461 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11462    alignment of a vector to 128 bits.  */
11463 static HOST_WIDE_INT
11464 aarch64_simd_vector_alignment (const_tree type)
11465 {
11466   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11467   return MIN (align, 128);
11468 }
11469
11470 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11471 static bool
11472 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11473 {
11474   if (is_packed)
11475     return false;
11476
11477   /* We guarantee alignment for vectors up to 128-bits.  */
11478   if (tree_int_cst_compare (TYPE_SIZE (type),
11479                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11480     return false;
11481
11482   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11483   return true;
11484 }
11485
11486 /* Return true if the vector misalignment factor is supported by the
11487    target.  */
11488 static bool
11489 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11490                                              const_tree type, int misalignment,
11491                                              bool is_packed)
11492 {
11493   if (TARGET_SIMD && STRICT_ALIGNMENT)
11494     {
11495       /* Return if movmisalign pattern is not supported for this mode.  */
11496       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11497         return false;
11498
11499       if (misalignment == -1)
11500         {
11501           /* Misalignment factor is unknown at compile time but we know
11502              it's word aligned.  */
11503           if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11504             {
11505               int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11506
11507               if (element_size != 64)
11508                 return true;
11509             }
11510           return false;
11511         }
11512     }
11513   return default_builtin_support_vector_misalignment (mode, type, misalignment,
11514                                                       is_packed);
11515 }
11516
11517 /* If VALS is a vector constant that can be loaded into a register
11518    using DUP, generate instructions to do so and return an RTX to
11519    assign to the register.  Otherwise return NULL_RTX.  */
11520 static rtx
11521 aarch64_simd_dup_constant (rtx vals)
11522 {
11523   machine_mode mode = GET_MODE (vals);
11524   machine_mode inner_mode = GET_MODE_INNER (mode);
11525   rtx x;
11526
11527   if (!const_vec_duplicate_p (vals, &x))
11528     return NULL_RTX;
11529
11530   /* We can load this constant by using DUP and a constant in a
11531      single ARM register.  This will be cheaper than a vector
11532      load.  */
11533   x = copy_to_mode_reg (inner_mode, x);
11534   return gen_rtx_VEC_DUPLICATE (mode, x);
11535 }
11536
11537
11538 /* Generate code to load VALS, which is a PARALLEL containing only
11539    constants (for vec_init) or CONST_VECTOR, efficiently into a
11540    register.  Returns an RTX to copy into the register, or NULL_RTX
11541    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11542 static rtx
11543 aarch64_simd_make_constant (rtx vals)
11544 {
11545   machine_mode mode = GET_MODE (vals);
11546   rtx const_dup;
11547   rtx const_vec = NULL_RTX;
11548   int n_elts = GET_MODE_NUNITS (mode);
11549   int n_const = 0;
11550   int i;
11551
11552   if (GET_CODE (vals) == CONST_VECTOR)
11553     const_vec = vals;
11554   else if (GET_CODE (vals) == PARALLEL)
11555     {
11556       /* A CONST_VECTOR must contain only CONST_INTs and
11557          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11558          Only store valid constants in a CONST_VECTOR.  */
11559       for (i = 0; i < n_elts; ++i)
11560         {
11561           rtx x = XVECEXP (vals, 0, i);
11562           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11563             n_const++;
11564         }
11565       if (n_const == n_elts)
11566         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11567     }
11568   else
11569     gcc_unreachable ();
11570
11571   if (const_vec != NULL_RTX
11572       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11573     /* Load using MOVI/MVNI.  */
11574     return const_vec;
11575   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11576     /* Loaded using DUP.  */
11577     return const_dup;
11578   else if (const_vec != NULL_RTX)
11579     /* Load from constant pool. We can not take advantage of single-cycle
11580        LD1 because we need a PC-relative addressing mode.  */
11581     return const_vec;
11582   else
11583     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11584        We can not construct an initializer.  */
11585     return NULL_RTX;
11586 }
11587
11588 /* Expand a vector initialisation sequence, such that TARGET is
11589    initialised to contain VALS.  */
11590
11591 void
11592 aarch64_expand_vector_init (rtx target, rtx vals)
11593 {
11594   machine_mode mode = GET_MODE (target);
11595   machine_mode inner_mode = GET_MODE_INNER (mode);
11596   /* The number of vector elements.  */
11597   int n_elts = GET_MODE_NUNITS (mode);
11598   /* The number of vector elements which are not constant.  */
11599   int n_var = 0;
11600   rtx any_const = NULL_RTX;
11601   /* The first element of vals.  */
11602   rtx v0 = XVECEXP (vals, 0, 0);
11603   bool all_same = true;
11604
11605   /* Count the number of variable elements to initialise.  */
11606   for (int i = 0; i < n_elts; ++i)
11607     {
11608       rtx x = XVECEXP (vals, 0, i);
11609       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11610         ++n_var;
11611       else
11612         any_const = x;
11613
11614       all_same &= rtx_equal_p (x, v0);
11615     }
11616
11617   /* No variable elements, hand off to aarch64_simd_make_constant which knows
11618      how best to handle this.  */
11619   if (n_var == 0)
11620     {
11621       rtx constant = aarch64_simd_make_constant (vals);
11622       if (constant != NULL_RTX)
11623         {
11624           emit_move_insn (target, constant);
11625           return;
11626         }
11627     }
11628
11629   /* Splat a single non-constant element if we can.  */
11630   if (all_same)
11631     {
11632       rtx x = copy_to_mode_reg (inner_mode, v0);
11633       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11634       return;
11635     }
11636
11637   /* Initialise a vector which is part-variable.  We want to first try
11638      to build those lanes which are constant in the most efficient way we
11639      can.  */
11640   if (n_var != n_elts)
11641     {
11642       rtx copy = copy_rtx (vals);
11643
11644       /* Load constant part of vector.  We really don't care what goes into the
11645          parts we will overwrite, but we're more likely to be able to load the
11646          constant efficiently if it has fewer, larger, repeating parts
11647          (see aarch64_simd_valid_immediate).  */
11648       for (int i = 0; i < n_elts; i++)
11649         {
11650           rtx x = XVECEXP (vals, 0, i);
11651           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11652             continue;
11653           rtx subst = any_const;
11654           for (int bit = n_elts / 2; bit > 0; bit /= 2)
11655             {
11656               /* Look in the copied vector, as more elements are const.  */
11657               rtx test = XVECEXP (copy, 0, i ^ bit);
11658               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11659                 {
11660                   subst = test;
11661                   break;
11662                 }
11663             }
11664           XVECEXP (copy, 0, i) = subst;
11665         }
11666       aarch64_expand_vector_init (target, copy);
11667     }
11668
11669   /* Insert the variable lanes directly.  */
11670
11671   enum insn_code icode = optab_handler (vec_set_optab, mode);
11672   gcc_assert (icode != CODE_FOR_nothing);
11673
11674   for (int i = 0; i < n_elts; i++)
11675     {
11676       rtx x = XVECEXP (vals, 0, i);
11677       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11678         continue;
11679       x = copy_to_mode_reg (inner_mode, x);
11680       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11681     }
11682 }
11683
11684 static unsigned HOST_WIDE_INT
11685 aarch64_shift_truncation_mask (machine_mode mode)
11686 {
11687   return
11688     (!SHIFT_COUNT_TRUNCATED
11689      || aarch64_vector_mode_supported_p (mode)
11690      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11691 }
11692
11693 /* Select a format to encode pointers in exception handling data.  */
11694 int
11695 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11696 {
11697    int type;
11698    switch (aarch64_cmodel)
11699      {
11700      case AARCH64_CMODEL_TINY:
11701      case AARCH64_CMODEL_TINY_PIC:
11702      case AARCH64_CMODEL_SMALL:
11703      case AARCH64_CMODEL_SMALL_PIC:
11704      case AARCH64_CMODEL_SMALL_SPIC:
11705        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
11706           for everything.  */
11707        type = DW_EH_PE_sdata4;
11708        break;
11709      default:
11710        /* No assumptions here.  8-byte relocs required.  */
11711        type = DW_EH_PE_sdata8;
11712        break;
11713      }
11714    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11715 }
11716
11717 /* The last .arch and .tune assembly strings that we printed.  */
11718 static std::string aarch64_last_printed_arch_string;
11719 static std::string aarch64_last_printed_tune_string;
11720
11721 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
11722    by the function fndecl.  */
11723
11724 void
11725 aarch64_declare_function_name (FILE *stream, const char* name,
11726                                 tree fndecl)
11727 {
11728   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11729
11730   struct cl_target_option *targ_options;
11731   if (target_parts)
11732     targ_options = TREE_TARGET_OPTION (target_parts);
11733   else
11734     targ_options = TREE_TARGET_OPTION (target_option_current_node);
11735   gcc_assert (targ_options);
11736
11737   const struct processor *this_arch
11738     = aarch64_get_arch (targ_options->x_explicit_arch);
11739
11740   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11741   std::string extension
11742     = aarch64_get_extension_string_for_isa_flags (isa_flags,
11743                                                   this_arch->flags);
11744   /* Only update the assembler .arch string if it is distinct from the last
11745      such string we printed.  */
11746   std::string to_print = this_arch->name + extension;
11747   if (to_print != aarch64_last_printed_arch_string)
11748     {
11749       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11750       aarch64_last_printed_arch_string = to_print;
11751     }
11752
11753   /* Print the cpu name we're tuning for in the comments, might be
11754      useful to readers of the generated asm.  Do it only when it changes
11755      from function to function and verbose assembly is requested.  */
11756   const struct processor *this_tune
11757     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11758
11759   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11760     {
11761       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11762                    this_tune->name);
11763       aarch64_last_printed_tune_string = this_tune->name;
11764     }
11765
11766   /* Don't forget the type directive for ELF.  */
11767   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11768   ASM_OUTPUT_LABEL (stream, name);
11769 }
11770
11771 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
11772
11773 static void
11774 aarch64_start_file (void)
11775 {
11776   struct cl_target_option *default_options
11777     = TREE_TARGET_OPTION (target_option_default_node);
11778
11779   const struct processor *default_arch
11780     = aarch64_get_arch (default_options->x_explicit_arch);
11781   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11782   std::string extension
11783     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11784                                                   default_arch->flags);
11785
11786    aarch64_last_printed_arch_string = default_arch->name + extension;
11787    aarch64_last_printed_tune_string = "";
11788    asm_fprintf (asm_out_file, "\t.arch %s\n",
11789                 aarch64_last_printed_arch_string.c_str ());
11790
11791    default_file_start ();
11792 }
11793
11794 /* Emit load exclusive.  */
11795
11796 static void
11797 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11798                              rtx mem, rtx model_rtx)
11799 {
11800   rtx (*gen) (rtx, rtx, rtx);
11801
11802   switch (mode)
11803     {
11804     case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11805     case HImode: gen = gen_aarch64_load_exclusivehi; break;
11806     case SImode: gen = gen_aarch64_load_exclusivesi; break;
11807     case DImode: gen = gen_aarch64_load_exclusivedi; break;
11808     default:
11809       gcc_unreachable ();
11810     }
11811
11812   emit_insn (gen (rval, mem, model_rtx));
11813 }
11814
11815 /* Emit store exclusive.  */
11816
11817 static void
11818 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11819                               rtx rval, rtx mem, rtx model_rtx)
11820 {
11821   rtx (*gen) (rtx, rtx, rtx, rtx);
11822
11823   switch (mode)
11824     {
11825     case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11826     case HImode: gen = gen_aarch64_store_exclusivehi; break;
11827     case SImode: gen = gen_aarch64_store_exclusivesi; break;
11828     case DImode: gen = gen_aarch64_store_exclusivedi; break;
11829     default:
11830       gcc_unreachable ();
11831     }
11832
11833   emit_insn (gen (bval, rval, mem, model_rtx));
11834 }
11835
11836 /* Mark the previous jump instruction as unlikely.  */
11837
11838 static void
11839 aarch64_emit_unlikely_jump (rtx insn)
11840 {
11841   int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11842
11843   rtx_insn *jump = emit_jump_insn (insn);
11844   add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
11845 }
11846
11847 /* Expand a compare and swap pattern.  */
11848
11849 void
11850 aarch64_expand_compare_and_swap (rtx operands[])
11851 {
11852   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11853   machine_mode mode, cmp_mode;
11854   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11855   int idx;
11856   gen_cas_fn gen;
11857   const gen_cas_fn split_cas[] =
11858   {
11859     gen_aarch64_compare_and_swapqi,
11860     gen_aarch64_compare_and_swaphi,
11861     gen_aarch64_compare_and_swapsi,
11862     gen_aarch64_compare_and_swapdi
11863   };
11864   const gen_cas_fn atomic_cas[] =
11865   {
11866     gen_aarch64_compare_and_swapqi_lse,
11867     gen_aarch64_compare_and_swaphi_lse,
11868     gen_aarch64_compare_and_swapsi_lse,
11869     gen_aarch64_compare_and_swapdi_lse
11870   };
11871
11872   bval = operands[0];
11873   rval = operands[1];
11874   mem = operands[2];
11875   oldval = operands[3];
11876   newval = operands[4];
11877   is_weak = operands[5];
11878   mod_s = operands[6];
11879   mod_f = operands[7];
11880   mode = GET_MODE (mem);
11881   cmp_mode = mode;
11882
11883   /* Normally the succ memory model must be stronger than fail, but in the
11884      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11885      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
11886
11887   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11888       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11889     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11890
11891   switch (mode)
11892     {
11893     case QImode:
11894     case HImode:
11895       /* For short modes, we're going to perform the comparison in SImode,
11896          so do the zero-extension now.  */
11897       cmp_mode = SImode;
11898       rval = gen_reg_rtx (SImode);
11899       oldval = convert_modes (SImode, mode, oldval, true);
11900       /* Fall through.  */
11901
11902     case SImode:
11903     case DImode:
11904       /* Force the value into a register if needed.  */
11905       if (!aarch64_plus_operand (oldval, mode))
11906         oldval = force_reg (cmp_mode, oldval);
11907       break;
11908
11909     default:
11910       gcc_unreachable ();
11911     }
11912
11913   switch (mode)
11914     {
11915     case QImode: idx = 0; break;
11916     case HImode: idx = 1; break;
11917     case SImode: idx = 2; break;
11918     case DImode: idx = 3; break;
11919     default:
11920       gcc_unreachable ();
11921     }
11922   if (TARGET_LSE)
11923     gen = atomic_cas[idx];
11924   else
11925     gen = split_cas[idx];
11926
11927   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11928
11929   if (mode == QImode || mode == HImode)
11930     emit_move_insn (operands[1], gen_lowpart (mode, rval));
11931
11932   x = gen_rtx_REG (CCmode, CC_REGNUM);
11933   x = gen_rtx_EQ (SImode, x, const0_rtx);
11934   emit_insn (gen_rtx_SET (bval, x));
11935 }
11936
11937 /* Test whether the target supports using a atomic load-operate instruction.
11938    CODE is the operation and AFTER is TRUE if the data in memory after the
11939    operation should be returned and FALSE if the data before the operation
11940    should be returned.  Returns FALSE if the operation isn't supported by the
11941    architecture.  */
11942
11943 bool
11944 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11945 {
11946   if (!TARGET_LSE)
11947     return false;
11948
11949   switch (code)
11950     {
11951     case SET:
11952     case AND:
11953     case IOR:
11954     case XOR:
11955     case MINUS:
11956     case PLUS:
11957       return true;
11958     default:
11959       return false;
11960     }
11961 }
11962
11963 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
11964    sequence implementing an atomic operation.  */
11965
11966 static void
11967 aarch64_emit_post_barrier (enum memmodel model)
11968 {
11969   const enum memmodel base_model = memmodel_base (model);
11970
11971   if (is_mm_sync (model)
11972       && (base_model == MEMMODEL_ACQUIRE
11973           || base_model == MEMMODEL_ACQ_REL
11974           || base_model == MEMMODEL_SEQ_CST))
11975     {
11976       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11977     }
11978 }
11979
11980 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
11981    for the data in memory.  EXPECTED is the value expected to be in memory.
11982    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
11983    is the memory ordering to use.  */
11984
11985 void
11986 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11987                         rtx expected, rtx desired,
11988                         rtx model)
11989 {
11990   rtx (*gen) (rtx, rtx, rtx, rtx);
11991   machine_mode mode;
11992
11993   mode = GET_MODE (mem);
11994
11995   switch (mode)
11996     {
11997     case QImode: gen = gen_aarch64_atomic_casqi; break;
11998     case HImode: gen = gen_aarch64_atomic_cashi; break;
11999     case SImode: gen = gen_aarch64_atomic_cassi; break;
12000     case DImode: gen = gen_aarch64_atomic_casdi; break;
12001     default:
12002       gcc_unreachable ();
12003     }
12004
12005   /* Move the expected value into the CAS destination register.  */
12006   emit_insn (gen_rtx_SET (rval, expected));
12007
12008   /* Emit the CAS.  */
12009   emit_insn (gen (rval, mem, desired, model));
12010
12011   /* Compare the expected value with the value loaded by the CAS, to establish
12012      whether the swap was made.  */
12013   aarch64_gen_compare_reg (EQ, rval, expected);
12014 }
12015
12016 /* Split a compare and swap pattern.  */
12017
12018 void
12019 aarch64_split_compare_and_swap (rtx operands[])
12020 {
12021   rtx rval, mem, oldval, newval, scratch;
12022   machine_mode mode;
12023   bool is_weak;
12024   rtx_code_label *label1, *label2;
12025   rtx x, cond;
12026   enum memmodel model;
12027   rtx model_rtx;
12028
12029   rval = operands[0];
12030   mem = operands[1];
12031   oldval = operands[2];
12032   newval = operands[3];
12033   is_weak = (operands[4] != const0_rtx);
12034   model_rtx = operands[5];
12035   scratch = operands[7];
12036   mode = GET_MODE (mem);
12037   model = memmodel_from_int (INTVAL (model_rtx));
12038
12039   label1 = NULL;
12040   if (!is_weak)
12041     {
12042       label1 = gen_label_rtx ();
12043       emit_label (label1);
12044     }
12045   label2 = gen_label_rtx ();
12046
12047   /* The initial load can be relaxed for a __sync operation since a final
12048      barrier will be emitted to stop code hoisting.  */
12049   if (is_mm_sync (model))
12050     aarch64_emit_load_exclusive (mode, rval, mem,
12051                                  GEN_INT (MEMMODEL_RELAXED));
12052   else
12053     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12054
12055   cond = aarch64_gen_compare_reg (NE, rval, oldval);
12056   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12057   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12058                             gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12059   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12060
12061   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12062
12063   if (!is_weak)
12064     {
12065       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12066       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12067                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12068       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12069     }
12070   else
12071     {
12072       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12073       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12074       emit_insn (gen_rtx_SET (cond, x));
12075     }
12076
12077   emit_label (label2);
12078
12079   /* Emit any final barrier needed for a __sync operation.  */
12080   if (is_mm_sync (model))
12081     aarch64_emit_post_barrier (model);
12082 }
12083
12084 /* Emit a BIC instruction.  */
12085
12086 static void
12087 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12088 {
12089   rtx shift_rtx = GEN_INT (shift);
12090   rtx (*gen) (rtx, rtx, rtx, rtx);
12091
12092   switch (mode)
12093     {
12094     case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12095     case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12096     default:
12097       gcc_unreachable ();
12098     }
12099
12100   emit_insn (gen (dst, s2, shift_rtx, s1));
12101 }
12102
12103 /* Emit an atomic swap.  */
12104
12105 static void
12106 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12107                           rtx mem, rtx model)
12108 {
12109   rtx (*gen) (rtx, rtx, rtx, rtx);
12110
12111   switch (mode)
12112     {
12113     case QImode: gen = gen_aarch64_atomic_swpqi; break;
12114     case HImode: gen = gen_aarch64_atomic_swphi; break;
12115     case SImode: gen = gen_aarch64_atomic_swpsi; break;
12116     case DImode: gen = gen_aarch64_atomic_swpdi; break;
12117     default:
12118       gcc_unreachable ();
12119     }
12120
12121   emit_insn (gen (dst, mem, value, model));
12122 }
12123
12124 /* Operations supported by aarch64_emit_atomic_load_op.  */
12125
12126 enum aarch64_atomic_load_op_code
12127 {
12128   AARCH64_LDOP_PLUS,    /* A + B  */
12129   AARCH64_LDOP_XOR,     /* A ^ B  */
12130   AARCH64_LDOP_OR,      /* A | B  */
12131   AARCH64_LDOP_BIC      /* A & ~B  */
12132 };
12133
12134 /* Emit an atomic load-operate.  */
12135
12136 static void
12137 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12138                              machine_mode mode, rtx dst, rtx src,
12139                              rtx mem, rtx model)
12140 {
12141   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12142   const aarch64_atomic_load_op_fn plus[] =
12143   {
12144     gen_aarch64_atomic_loadaddqi,
12145     gen_aarch64_atomic_loadaddhi,
12146     gen_aarch64_atomic_loadaddsi,
12147     gen_aarch64_atomic_loadadddi
12148   };
12149   const aarch64_atomic_load_op_fn eor[] =
12150   {
12151     gen_aarch64_atomic_loadeorqi,
12152     gen_aarch64_atomic_loadeorhi,
12153     gen_aarch64_atomic_loadeorsi,
12154     gen_aarch64_atomic_loadeordi
12155   };
12156   const aarch64_atomic_load_op_fn ior[] =
12157   {
12158     gen_aarch64_atomic_loadsetqi,
12159     gen_aarch64_atomic_loadsethi,
12160     gen_aarch64_atomic_loadsetsi,
12161     gen_aarch64_atomic_loadsetdi
12162   };
12163   const aarch64_atomic_load_op_fn bic[] =
12164   {
12165     gen_aarch64_atomic_loadclrqi,
12166     gen_aarch64_atomic_loadclrhi,
12167     gen_aarch64_atomic_loadclrsi,
12168     gen_aarch64_atomic_loadclrdi
12169   };
12170   aarch64_atomic_load_op_fn gen;
12171   int idx = 0;
12172
12173   switch (mode)
12174     {
12175     case QImode: idx = 0; break;
12176     case HImode: idx = 1; break;
12177     case SImode: idx = 2; break;
12178     case DImode: idx = 3; break;
12179     default:
12180       gcc_unreachable ();
12181     }
12182
12183   switch (code)
12184     {
12185     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12186     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12187     case AARCH64_LDOP_OR: gen = ior[idx]; break;
12188     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12189     default:
12190       gcc_unreachable ();
12191     }
12192
12193   emit_insn (gen (dst, mem, src, model));
12194 }
12195
12196 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
12197    location to store the data read from memory.  OUT_RESULT is the location to
12198    store the result of the operation.  MEM is the memory location to read and
12199    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
12200    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
12201    be NULL.  */
12202
12203 void
12204 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12205                          rtx mem, rtx value, rtx model_rtx)
12206 {
12207   machine_mode mode = GET_MODE (mem);
12208   machine_mode wmode = (mode == DImode ? DImode : SImode);
12209   const bool short_mode = (mode < SImode);
12210   aarch64_atomic_load_op_code ldop_code;
12211   rtx src;
12212   rtx x;
12213
12214   if (out_data)
12215     out_data = gen_lowpart (mode, out_data);
12216
12217   if (out_result)
12218     out_result = gen_lowpart (mode, out_result);
12219
12220   /* Make sure the value is in a register, putting it into a destination
12221      register if it needs to be manipulated.  */
12222   if (!register_operand (value, mode)
12223       || code == AND || code == MINUS)
12224     {
12225       src = out_result ? out_result : out_data;
12226       emit_move_insn (src, gen_lowpart (mode, value));
12227     }
12228   else
12229     src = value;
12230   gcc_assert (register_operand (src, mode));
12231
12232   /* Preprocess the data for the operation as necessary.  If the operation is
12233      a SET then emit a swap instruction and finish.  */
12234   switch (code)
12235     {
12236     case SET:
12237       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12238       return;
12239
12240     case MINUS:
12241       /* Negate the value and treat it as a PLUS.  */
12242       {
12243         rtx neg_src;
12244
12245         /* Resize the value if necessary.  */
12246         if (short_mode)
12247           src = gen_lowpart (wmode, src);
12248
12249         neg_src = gen_rtx_NEG (wmode, src);
12250         emit_insn (gen_rtx_SET (src, neg_src));
12251
12252         if (short_mode)
12253           src = gen_lowpart (mode, src);
12254       }
12255       /* Fall-through.  */
12256     case PLUS:
12257       ldop_code = AARCH64_LDOP_PLUS;
12258       break;
12259
12260     case IOR:
12261       ldop_code = AARCH64_LDOP_OR;
12262       break;
12263
12264     case XOR:
12265       ldop_code = AARCH64_LDOP_XOR;
12266       break;
12267
12268     case AND:
12269       {
12270         rtx not_src;
12271
12272         /* Resize the value if necessary.  */
12273         if (short_mode)
12274           src = gen_lowpart (wmode, src);
12275
12276         not_src = gen_rtx_NOT (wmode, src);
12277         emit_insn (gen_rtx_SET (src, not_src));
12278
12279         if (short_mode)
12280           src = gen_lowpart (mode, src);
12281       }
12282       ldop_code = AARCH64_LDOP_BIC;
12283       break;
12284
12285     default:
12286       /* The operation can't be done with atomic instructions.  */
12287       gcc_unreachable ();
12288     }
12289
12290   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12291
12292   /* If necessary, calculate the data in memory after the update by redoing the
12293      operation from values in registers.  */
12294   if (!out_result)
12295     return;
12296
12297   if (short_mode)
12298     {
12299       src = gen_lowpart (wmode, src);
12300       out_data = gen_lowpart (wmode, out_data);
12301       out_result = gen_lowpart (wmode, out_result);
12302     }
12303
12304   x = NULL_RTX;
12305
12306   switch (code)
12307     {
12308     case MINUS:
12309     case PLUS:
12310       x = gen_rtx_PLUS (wmode, out_data, src);
12311       break;
12312     case IOR:
12313       x = gen_rtx_IOR (wmode, out_data, src);
12314       break;
12315     case XOR:
12316       x = gen_rtx_XOR (wmode, out_data, src);
12317       break;
12318     case AND:
12319       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12320       return;
12321     default:
12322       gcc_unreachable ();
12323     }
12324
12325   emit_set_insn (out_result, x);
12326
12327   return;
12328 }
12329
12330 /* Split an atomic operation.  */
12331
12332 void
12333 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12334                          rtx value, rtx model_rtx, rtx cond)
12335 {
12336   machine_mode mode = GET_MODE (mem);
12337   machine_mode wmode = (mode == DImode ? DImode : SImode);
12338   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12339   const bool is_sync = is_mm_sync (model);
12340   rtx_code_label *label;
12341   rtx x;
12342
12343   /* Split the atomic operation into a sequence.  */
12344   label = gen_label_rtx ();
12345   emit_label (label);
12346
12347   if (new_out)
12348     new_out = gen_lowpart (wmode, new_out);
12349   if (old_out)
12350     old_out = gen_lowpart (wmode, old_out);
12351   else
12352     old_out = new_out;
12353   value = simplify_gen_subreg (wmode, value, mode, 0);
12354
12355   /* The initial load can be relaxed for a __sync operation since a final
12356      barrier will be emitted to stop code hoisting.  */
12357  if (is_sync)
12358     aarch64_emit_load_exclusive (mode, old_out, mem,
12359                                  GEN_INT (MEMMODEL_RELAXED));
12360   else
12361     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12362
12363   switch (code)
12364     {
12365     case SET:
12366       new_out = value;
12367       break;
12368
12369     case NOT:
12370       x = gen_rtx_AND (wmode, old_out, value);
12371       emit_insn (gen_rtx_SET (new_out, x));
12372       x = gen_rtx_NOT (wmode, new_out);
12373       emit_insn (gen_rtx_SET (new_out, x));
12374       break;
12375
12376     case MINUS:
12377       if (CONST_INT_P (value))
12378         {
12379           value = GEN_INT (-INTVAL (value));
12380           code = PLUS;
12381         }
12382       /* Fall through.  */
12383
12384     default:
12385       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12386       emit_insn (gen_rtx_SET (new_out, x));
12387       break;
12388     }
12389
12390   aarch64_emit_store_exclusive (mode, cond, mem,
12391                                 gen_lowpart (mode, new_out), model_rtx);
12392
12393   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12394   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12395                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12396   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12397
12398   /* Emit any final barrier needed for a __sync operation.  */
12399   if (is_sync)
12400     aarch64_emit_post_barrier (model);
12401 }
12402
12403 static void
12404 aarch64_init_libfuncs (void)
12405 {
12406    /* Half-precision float operations.  The compiler handles all operations
12407      with NULL libfuncs by converting to SFmode.  */
12408
12409   /* Conversions.  */
12410   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12411   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12412
12413   /* Arithmetic.  */
12414   set_optab_libfunc (add_optab, HFmode, NULL);
12415   set_optab_libfunc (sdiv_optab, HFmode, NULL);
12416   set_optab_libfunc (smul_optab, HFmode, NULL);
12417   set_optab_libfunc (neg_optab, HFmode, NULL);
12418   set_optab_libfunc (sub_optab, HFmode, NULL);
12419
12420   /* Comparisons.  */
12421   set_optab_libfunc (eq_optab, HFmode, NULL);
12422   set_optab_libfunc (ne_optab, HFmode, NULL);
12423   set_optab_libfunc (lt_optab, HFmode, NULL);
12424   set_optab_libfunc (le_optab, HFmode, NULL);
12425   set_optab_libfunc (ge_optab, HFmode, NULL);
12426   set_optab_libfunc (gt_optab, HFmode, NULL);
12427   set_optab_libfunc (unord_optab, HFmode, NULL);
12428 }
12429
12430 /* Target hook for c_mode_for_suffix.  */
12431 static machine_mode
12432 aarch64_c_mode_for_suffix (char suffix)
12433 {
12434   if (suffix == 'q')
12435     return TFmode;
12436
12437   return VOIDmode;
12438 }
12439
12440 /* We can only represent floating point constants which will fit in
12441    "quarter-precision" values.  These values are characterised by
12442    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
12443    by:
12444
12445    (-1)^s * (n/16) * 2^r
12446
12447    Where:
12448      's' is the sign bit.
12449      'n' is an integer in the range 16 <= n <= 31.
12450      'r' is an integer in the range -3 <= r <= 4.  */
12451
12452 /* Return true iff X can be represented by a quarter-precision
12453    floating point immediate operand X.  Note, we cannot represent 0.0.  */
12454 bool
12455 aarch64_float_const_representable_p (rtx x)
12456 {
12457   /* This represents our current view of how many bits
12458      make up the mantissa.  */
12459   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12460   int exponent;
12461   unsigned HOST_WIDE_INT mantissa, mask;
12462   REAL_VALUE_TYPE r, m;
12463   bool fail;
12464
12465   if (!CONST_DOUBLE_P (x))
12466     return false;
12467
12468   /* We don't support HFmode constants yet.  */
12469   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12470     return false;
12471
12472   r = *CONST_DOUBLE_REAL_VALUE (x);
12473
12474   /* We cannot represent infinities, NaNs or +/-zero.  We won't
12475      know if we have +zero until we analyse the mantissa, but we
12476      can reject the other invalid values.  */
12477   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12478       || REAL_VALUE_MINUS_ZERO (r))
12479     return false;
12480
12481   /* Extract exponent.  */
12482   r = real_value_abs (&r);
12483   exponent = REAL_EXP (&r);
12484
12485   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12486      highest (sign) bit, with a fixed binary point at bit point_pos.
12487      m1 holds the low part of the mantissa, m2 the high part.
12488      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12489      bits for the mantissa, this can fail (low bits will be lost).  */
12490   real_ldexp (&m, &r, point_pos - exponent);
12491   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12492
12493   /* If the low part of the mantissa has bits set we cannot represent
12494      the value.  */
12495   if (w.elt (0) != 0)
12496     return false;
12497   /* We have rejected the lower HOST_WIDE_INT, so update our
12498      understanding of how many bits lie in the mantissa and
12499      look only at the high HOST_WIDE_INT.  */
12500   mantissa = w.elt (1);
12501   point_pos -= HOST_BITS_PER_WIDE_INT;
12502
12503   /* We can only represent values with a mantissa of the form 1.xxxx.  */
12504   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12505   if ((mantissa & mask) != 0)
12506     return false;
12507
12508   /* Having filtered unrepresentable values, we may now remove all
12509      but the highest 5 bits.  */
12510   mantissa >>= point_pos - 5;
12511
12512   /* We cannot represent the value 0.0, so reject it.  This is handled
12513      elsewhere.  */
12514   if (mantissa == 0)
12515     return false;
12516
12517   /* Then, as bit 4 is always set, we can mask it off, leaving
12518      the mantissa in the range [0, 15].  */
12519   mantissa &= ~(1 << 4);
12520   gcc_assert (mantissa <= 15);
12521
12522   /* GCC internally does not use IEEE754-like encoding (where normalized
12523      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
12524      Our mantissa values are shifted 4 places to the left relative to
12525      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12526      by 5 places to correct for GCC's representation.  */
12527   exponent = 5 - exponent;
12528
12529   return (exponent >= 0 && exponent <= 7);
12530 }
12531
12532 char*
12533 aarch64_output_simd_mov_immediate (rtx const_vector,
12534                                    machine_mode mode,
12535                                    unsigned width)
12536 {
12537   bool is_valid;
12538   static char templ[40];
12539   const char *mnemonic;
12540   const char *shift_op;
12541   unsigned int lane_count = 0;
12542   char element_char;
12543
12544   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12545
12546   /* This will return true to show const_vector is legal for use as either
12547      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
12548      also update INFO to show how the immediate should be generated.  */
12549   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12550   gcc_assert (is_valid);
12551
12552   element_char = sizetochar (info.element_width);
12553   lane_count = width / info.element_width;
12554
12555   mode = GET_MODE_INNER (mode);
12556   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12557     {
12558       gcc_assert (info.shift == 0 && ! info.mvn);
12559       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12560          move immediate path.  */
12561       if (aarch64_float_const_zero_rtx_p (info.value))
12562         info.value = GEN_INT (0);
12563       else
12564         {
12565           const unsigned int buf_size = 20;
12566           char float_buf[buf_size] = {'\0'};
12567           real_to_decimal_for_mode (float_buf,
12568                                     CONST_DOUBLE_REAL_VALUE (info.value),
12569                                     buf_size, buf_size, 1, mode);
12570
12571           if (lane_count == 1)
12572             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12573           else
12574             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12575                       lane_count, element_char, float_buf);
12576           return templ;
12577         }
12578     }
12579
12580   mnemonic = info.mvn ? "mvni" : "movi";
12581   shift_op = info.msl ? "msl" : "lsl";
12582
12583   gcc_assert (CONST_INT_P (info.value));
12584   if (lane_count == 1)
12585     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12586               mnemonic, UINTVAL (info.value));
12587   else if (info.shift)
12588     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12589               ", %s %d", mnemonic, lane_count, element_char,
12590               UINTVAL (info.value), shift_op, info.shift);
12591   else
12592     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12593               mnemonic, lane_count, element_char, UINTVAL (info.value));
12594   return templ;
12595 }
12596
12597 char*
12598 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12599                                           machine_mode mode)
12600 {
12601   machine_mode vmode;
12602
12603   gcc_assert (!VECTOR_MODE_P (mode));
12604   vmode = aarch64_simd_container_mode (mode, 64);
12605   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12606   return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12607 }
12608
12609 /* Split operands into moves from op[1] + op[2] into op[0].  */
12610
12611 void
12612 aarch64_split_combinev16qi (rtx operands[3])
12613 {
12614   unsigned int dest = REGNO (operands[0]);
12615   unsigned int src1 = REGNO (operands[1]);
12616   unsigned int src2 = REGNO (operands[2]);
12617   machine_mode halfmode = GET_MODE (operands[1]);
12618   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12619   rtx destlo, desthi;
12620
12621   gcc_assert (halfmode == V16QImode);
12622
12623   if (src1 == dest && src2 == dest + halfregs)
12624     {
12625       /* No-op move.  Can't split to nothing; emit something.  */
12626       emit_note (NOTE_INSN_DELETED);
12627       return;
12628     }
12629
12630   /* Preserve register attributes for variable tracking.  */
12631   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12632   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12633                                GET_MODE_SIZE (halfmode));
12634
12635   /* Special case of reversed high/low parts.  */
12636   if (reg_overlap_mentioned_p (operands[2], destlo)
12637       && reg_overlap_mentioned_p (operands[1], desthi))
12638     {
12639       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12640       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12641       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12642     }
12643   else if (!reg_overlap_mentioned_p (operands[2], destlo))
12644     {
12645       /* Try to avoid unnecessary moves if part of the result
12646          is in the right place already.  */
12647       if (src1 != dest)
12648         emit_move_insn (destlo, operands[1]);
12649       if (src2 != dest + halfregs)
12650         emit_move_insn (desthi, operands[2]);
12651     }
12652   else
12653     {
12654       if (src2 != dest + halfregs)
12655         emit_move_insn (desthi, operands[2]);
12656       if (src1 != dest)
12657         emit_move_insn (destlo, operands[1]);
12658     }
12659 }
12660
12661 /* vec_perm support.  */
12662
12663 #define MAX_VECT_LEN 16
12664
12665 struct expand_vec_perm_d
12666 {
12667   rtx target, op0, op1;
12668   unsigned char perm[MAX_VECT_LEN];
12669   machine_mode vmode;
12670   unsigned char nelt;
12671   bool one_vector_p;
12672   bool testing_p;
12673 };
12674
12675 /* Generate a variable permutation.  */
12676
12677 static void
12678 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12679 {
12680   machine_mode vmode = GET_MODE (target);
12681   bool one_vector_p = rtx_equal_p (op0, op1);
12682
12683   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12684   gcc_checking_assert (GET_MODE (op0) == vmode);
12685   gcc_checking_assert (GET_MODE (op1) == vmode);
12686   gcc_checking_assert (GET_MODE (sel) == vmode);
12687   gcc_checking_assert (TARGET_SIMD);
12688
12689   if (one_vector_p)
12690     {
12691       if (vmode == V8QImode)
12692         {
12693           /* Expand the argument to a V16QI mode by duplicating it.  */
12694           rtx pair = gen_reg_rtx (V16QImode);
12695           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12696           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12697         }
12698       else
12699         {
12700           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12701         }
12702     }
12703   else
12704     {
12705       rtx pair;
12706
12707       if (vmode == V8QImode)
12708         {
12709           pair = gen_reg_rtx (V16QImode);
12710           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12711           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12712         }
12713       else
12714         {
12715           pair = gen_reg_rtx (OImode);
12716           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12717           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12718         }
12719     }
12720 }
12721
12722 void
12723 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12724 {
12725   machine_mode vmode = GET_MODE (target);
12726   unsigned int nelt = GET_MODE_NUNITS (vmode);
12727   bool one_vector_p = rtx_equal_p (op0, op1);
12728   rtx mask;
12729
12730   /* The TBL instruction does not use a modulo index, so we must take care
12731      of that ourselves.  */
12732   mask = aarch64_simd_gen_const_vector_dup (vmode,
12733       one_vector_p ? nelt - 1 : 2 * nelt - 1);
12734   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12735
12736   /* For big-endian, we also need to reverse the index within the vector
12737      (but not which vector).  */
12738   if (BYTES_BIG_ENDIAN)
12739     {
12740       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
12741       if (!one_vector_p)
12742         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12743       sel = expand_simple_binop (vmode, XOR, sel, mask,
12744                                  NULL, 0, OPTAB_LIB_WIDEN);
12745     }
12746   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12747 }
12748
12749 /* Recognize patterns suitable for the TRN instructions.  */
12750 static bool
12751 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12752 {
12753   unsigned int i, odd, mask, nelt = d->nelt;
12754   rtx out, in0, in1, x;
12755   rtx (*gen) (rtx, rtx, rtx);
12756   machine_mode vmode = d->vmode;
12757
12758   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12759     return false;
12760
12761   /* Note that these are little-endian tests.
12762      We correct for big-endian later.  */
12763   if (d->perm[0] == 0)
12764     odd = 0;
12765   else if (d->perm[0] == 1)
12766     odd = 1;
12767   else
12768     return false;
12769   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12770
12771   for (i = 0; i < nelt; i += 2)
12772     {
12773       if (d->perm[i] != i + odd)
12774         return false;
12775       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12776         return false;
12777     }
12778
12779   /* Success!  */
12780   if (d->testing_p)
12781     return true;
12782
12783   in0 = d->op0;
12784   in1 = d->op1;
12785   if (BYTES_BIG_ENDIAN)
12786     {
12787       x = in0, in0 = in1, in1 = x;
12788       odd = !odd;
12789     }
12790   out = d->target;
12791
12792   if (odd)
12793     {
12794       switch (vmode)
12795         {
12796         case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12797         case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12798         case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12799         case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12800         case V4SImode: gen = gen_aarch64_trn2v4si; break;
12801         case V2SImode: gen = gen_aarch64_trn2v2si; break;
12802         case V2DImode: gen = gen_aarch64_trn2v2di; break;
12803         case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12804         case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12805         case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12806         case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12807         case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12808         default:
12809           return false;
12810         }
12811     }
12812   else
12813     {
12814       switch (vmode)
12815         {
12816         case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12817         case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12818         case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12819         case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12820         case V4SImode: gen = gen_aarch64_trn1v4si; break;
12821         case V2SImode: gen = gen_aarch64_trn1v2si; break;
12822         case V2DImode: gen = gen_aarch64_trn1v2di; break;
12823         case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12824         case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12825         case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12826         case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12827         case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12828         default:
12829           return false;
12830         }
12831     }
12832
12833   emit_insn (gen (out, in0, in1));
12834   return true;
12835 }
12836
12837 /* Recognize patterns suitable for the UZP instructions.  */
12838 static bool
12839 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12840 {
12841   unsigned int i, odd, mask, nelt = d->nelt;
12842   rtx out, in0, in1, x;
12843   rtx (*gen) (rtx, rtx, rtx);
12844   machine_mode vmode = d->vmode;
12845
12846   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12847     return false;
12848
12849   /* Note that these are little-endian tests.
12850      We correct for big-endian later.  */
12851   if (d->perm[0] == 0)
12852     odd = 0;
12853   else if (d->perm[0] == 1)
12854     odd = 1;
12855   else
12856     return false;
12857   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12858
12859   for (i = 0; i < nelt; i++)
12860     {
12861       unsigned elt = (i * 2 + odd) & mask;
12862       if (d->perm[i] != elt)
12863         return false;
12864     }
12865
12866   /* Success!  */
12867   if (d->testing_p)
12868     return true;
12869
12870   in0 = d->op0;
12871   in1 = d->op1;
12872   if (BYTES_BIG_ENDIAN)
12873     {
12874       x = in0, in0 = in1, in1 = x;
12875       odd = !odd;
12876     }
12877   out = d->target;
12878
12879   if (odd)
12880     {
12881       switch (vmode)
12882         {
12883         case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12884         case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12885         case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12886         case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12887         case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12888         case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12889         case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12890         case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12891         case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12892         case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12893         case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12894         case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12895         default:
12896           return false;
12897         }
12898     }
12899   else
12900     {
12901       switch (vmode)
12902         {
12903         case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12904         case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12905         case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12906         case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12907         case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12908         case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12909         case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12910         case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12911         case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12912         case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12913         case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12914         case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12915         default:
12916           return false;
12917         }
12918     }
12919
12920   emit_insn (gen (out, in0, in1));
12921   return true;
12922 }
12923
12924 /* Recognize patterns suitable for the ZIP instructions.  */
12925 static bool
12926 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12927 {
12928   unsigned int i, high, mask, nelt = d->nelt;
12929   rtx out, in0, in1, x;
12930   rtx (*gen) (rtx, rtx, rtx);
12931   machine_mode vmode = d->vmode;
12932
12933   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12934     return false;
12935
12936   /* Note that these are little-endian tests.
12937      We correct for big-endian later.  */
12938   high = nelt / 2;
12939   if (d->perm[0] == high)
12940     /* Do Nothing.  */
12941     ;
12942   else if (d->perm[0] == 0)
12943     high = 0;
12944   else
12945     return false;
12946   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12947
12948   for (i = 0; i < nelt / 2; i++)
12949     {
12950       unsigned elt = (i + high) & mask;
12951       if (d->perm[i * 2] != elt)
12952         return false;
12953       elt = (elt + nelt) & mask;
12954       if (d->perm[i * 2 + 1] != elt)
12955         return false;
12956     }
12957
12958   /* Success!  */
12959   if (d->testing_p)
12960     return true;
12961
12962   in0 = d->op0;
12963   in1 = d->op1;
12964   if (BYTES_BIG_ENDIAN)
12965     {
12966       x = in0, in0 = in1, in1 = x;
12967       high = !high;
12968     }
12969   out = d->target;
12970
12971   if (high)
12972     {
12973       switch (vmode)
12974         {
12975         case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12976         case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12977         case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12978         case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12979         case V4SImode: gen = gen_aarch64_zip2v4si; break;
12980         case V2SImode: gen = gen_aarch64_zip2v2si; break;
12981         case V2DImode: gen = gen_aarch64_zip2v2di; break;
12982         case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12983         case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12984         case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12985         case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12986         case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12987         default:
12988           return false;
12989         }
12990     }
12991   else
12992     {
12993       switch (vmode)
12994         {
12995         case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12996         case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12997         case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12998         case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12999         case V4SImode: gen = gen_aarch64_zip1v4si; break;
13000         case V2SImode: gen = gen_aarch64_zip1v2si; break;
13001         case V2DImode: gen = gen_aarch64_zip1v2di; break;
13002         case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13003         case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13004         case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13005         case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13006         case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13007         default:
13008           return false;
13009         }
13010     }
13011
13012   emit_insn (gen (out, in0, in1));
13013   return true;
13014 }
13015
13016 /* Recognize patterns for the EXT insn.  */
13017
13018 static bool
13019 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13020 {
13021   unsigned int i, nelt = d->nelt;
13022   rtx (*gen) (rtx, rtx, rtx, rtx);
13023   rtx offset;
13024
13025   unsigned int location = d->perm[0]; /* Always < nelt.  */
13026
13027   /* Check if the extracted indices are increasing by one.  */
13028   for (i = 1; i < nelt; i++)
13029     {
13030       unsigned int required = location + i;
13031       if (d->one_vector_p)
13032         {
13033           /* We'll pass the same vector in twice, so allow indices to wrap.  */
13034           required &= (nelt - 1);
13035         }
13036       if (d->perm[i] != required)
13037         return false;
13038     }
13039
13040   switch (d->vmode)
13041     {
13042     case V16QImode: gen = gen_aarch64_extv16qi; break;
13043     case V8QImode: gen = gen_aarch64_extv8qi; break;
13044     case V4HImode: gen = gen_aarch64_extv4hi; break;
13045     case V8HImode: gen = gen_aarch64_extv8hi; break;
13046     case V2SImode: gen = gen_aarch64_extv2si; break;
13047     case V4SImode: gen = gen_aarch64_extv4si; break;
13048     case V4HFmode: gen = gen_aarch64_extv4hf; break;
13049     case V8HFmode: gen = gen_aarch64_extv8hf; break;
13050     case V2SFmode: gen = gen_aarch64_extv2sf; break;
13051     case V4SFmode: gen = gen_aarch64_extv4sf; break;
13052     case V2DImode: gen = gen_aarch64_extv2di; break;
13053     case V2DFmode: gen = gen_aarch64_extv2df; break;
13054     default:
13055       return false;
13056     }
13057
13058   /* Success! */
13059   if (d->testing_p)
13060     return true;
13061
13062   /* The case where (location == 0) is a no-op for both big- and little-endian,
13063      and is removed by the mid-end at optimization levels -O1 and higher.  */
13064
13065   if (BYTES_BIG_ENDIAN && (location != 0))
13066     {
13067       /* After setup, we want the high elements of the first vector (stored
13068          at the LSB end of the register), and the low elements of the second
13069          vector (stored at the MSB end of the register). So swap.  */
13070       std::swap (d->op0, d->op1);
13071       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
13072       location = nelt - location;
13073     }
13074
13075   offset = GEN_INT (location);
13076   emit_insn (gen (d->target, d->op0, d->op1, offset));
13077   return true;
13078 }
13079
13080 /* Recognize patterns for the REV insns.  */
13081
13082 static bool
13083 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13084 {
13085   unsigned int i, j, diff, nelt = d->nelt;
13086   rtx (*gen) (rtx, rtx);
13087
13088   if (!d->one_vector_p)
13089     return false;
13090
13091   diff = d->perm[0];
13092   switch (diff)
13093     {
13094     case 7:
13095       switch (d->vmode)
13096         {
13097         case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13098         case V8QImode: gen = gen_aarch64_rev64v8qi;  break;
13099         default:
13100           return false;
13101         }
13102       break;
13103     case 3:
13104       switch (d->vmode)
13105         {
13106         case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13107         case V8QImode: gen = gen_aarch64_rev32v8qi;  break;
13108         case V8HImode: gen = gen_aarch64_rev64v8hi;  break;
13109         case V4HImode: gen = gen_aarch64_rev64v4hi;  break;
13110         default:
13111           return false;
13112         }
13113       break;
13114     case 1:
13115       switch (d->vmode)
13116         {
13117         case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13118         case V8QImode: gen = gen_aarch64_rev16v8qi;  break;
13119         case V8HImode: gen = gen_aarch64_rev32v8hi;  break;
13120         case V4HImode: gen = gen_aarch64_rev32v4hi;  break;
13121         case V4SImode: gen = gen_aarch64_rev64v4si;  break;
13122         case V2SImode: gen = gen_aarch64_rev64v2si;  break;
13123         case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
13124         case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
13125         case V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
13126         case V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
13127         default:
13128           return false;
13129         }
13130       break;
13131     default:
13132       return false;
13133     }
13134
13135   for (i = 0; i < nelt ; i += diff + 1)
13136     for (j = 0; j <= diff; j += 1)
13137       {
13138         /* This is guaranteed to be true as the value of diff
13139            is 7, 3, 1 and we should have enough elements in the
13140            queue to generate this.  Getting a vector mask with a
13141            value of diff other than these values implies that
13142            something is wrong by the time we get here.  */
13143         gcc_assert (i + j < nelt);
13144         if (d->perm[i + j] != i + diff - j)
13145           return false;
13146       }
13147
13148   /* Success! */
13149   if (d->testing_p)
13150     return true;
13151
13152   emit_insn (gen (d->target, d->op0));
13153   return true;
13154 }
13155
13156 static bool
13157 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13158 {
13159   rtx (*gen) (rtx, rtx, rtx);
13160   rtx out = d->target;
13161   rtx in0;
13162   machine_mode vmode = d->vmode;
13163   unsigned int i, elt, nelt = d->nelt;
13164   rtx lane;
13165
13166   elt = d->perm[0];
13167   for (i = 1; i < nelt; i++)
13168     {
13169       if (elt != d->perm[i])
13170         return false;
13171     }
13172
13173   /* The generic preparation in aarch64_expand_vec_perm_const_1
13174      swaps the operand order and the permute indices if it finds
13175      d->perm[0] to be in the second operand.  Thus, we can always
13176      use d->op0 and need not do any extra arithmetic to get the
13177      correct lane number.  */
13178   in0 = d->op0;
13179   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
13180
13181   switch (vmode)
13182     {
13183     case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13184     case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13185     case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13186     case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13187     case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13188     case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13189     case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13190     case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13191     case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13192     case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13193     case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13194     case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13195     default:
13196       return false;
13197     }
13198
13199   emit_insn (gen (out, in0, lane));
13200   return true;
13201 }
13202
13203 static bool
13204 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13205 {
13206   rtx rperm[MAX_VECT_LEN], sel;
13207   machine_mode vmode = d->vmode;
13208   unsigned int i, nelt = d->nelt;
13209
13210   if (d->testing_p)
13211     return true;
13212
13213   /* Generic code will try constant permutation twice.  Once with the
13214      original mode and again with the elements lowered to QImode.
13215      So wait and don't do the selector expansion ourselves.  */
13216   if (vmode != V8QImode && vmode != V16QImode)
13217     return false;
13218
13219   for (i = 0; i < nelt; ++i)
13220     {
13221       int nunits = GET_MODE_NUNITS (vmode);
13222
13223       /* If big-endian and two vectors we end up with a weird mixed-endian
13224          mode on NEON.  Reverse the index within each word but not the word
13225          itself.  */
13226       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13227                                            : d->perm[i]);
13228     }
13229   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13230   sel = force_reg (vmode, sel);
13231
13232   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13233   return true;
13234 }
13235
13236 static bool
13237 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13238 {
13239   /* The pattern matching functions above are written to look for a small
13240      number to begin the sequence (0, 1, N/2).  If we begin with an index
13241      from the second operand, we can swap the operands.  */
13242   if (d->perm[0] >= d->nelt)
13243     {
13244       unsigned i, nelt = d->nelt;
13245
13246       gcc_assert (nelt == (nelt & -nelt));
13247       for (i = 0; i < nelt; ++i)
13248         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
13249
13250       std::swap (d->op0, d->op1);
13251     }
13252
13253   if (TARGET_SIMD)
13254     {
13255       if (aarch64_evpc_rev (d))
13256         return true;
13257       else if (aarch64_evpc_ext (d))
13258         return true;
13259       else if (aarch64_evpc_dup (d))
13260         return true;
13261       else if (aarch64_evpc_zip (d))
13262         return true;
13263       else if (aarch64_evpc_uzp (d))
13264         return true;
13265       else if (aarch64_evpc_trn (d))
13266         return true;
13267       return aarch64_evpc_tbl (d);
13268     }
13269   return false;
13270 }
13271
13272 /* Expand a vec_perm_const pattern.  */
13273
13274 bool
13275 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13276 {
13277   struct expand_vec_perm_d d;
13278   int i, nelt, which;
13279
13280   d.target = target;
13281   d.op0 = op0;
13282   d.op1 = op1;
13283
13284   d.vmode = GET_MODE (target);
13285   gcc_assert (VECTOR_MODE_P (d.vmode));
13286   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13287   d.testing_p = false;
13288
13289   for (i = which = 0; i < nelt; ++i)
13290     {
13291       rtx e = XVECEXP (sel, 0, i);
13292       int ei = INTVAL (e) & (2 * nelt - 1);
13293       which |= (ei < nelt ? 1 : 2);
13294       d.perm[i] = ei;
13295     }
13296
13297   switch (which)
13298     {
13299     default:
13300       gcc_unreachable ();
13301
13302     case 3:
13303       d.one_vector_p = false;
13304       if (!rtx_equal_p (op0, op1))
13305         break;
13306
13307       /* The elements of PERM do not suggest that only the first operand
13308          is used, but both operands are identical.  Allow easier matching
13309          of the permutation by folding the permutation into the single
13310          input vector.  */
13311       /* Fall Through.  */
13312     case 2:
13313       for (i = 0; i < nelt; ++i)
13314         d.perm[i] &= nelt - 1;
13315       d.op0 = op1;
13316       d.one_vector_p = true;
13317       break;
13318
13319     case 1:
13320       d.op1 = op0;
13321       d.one_vector_p = true;
13322       break;
13323     }
13324
13325   return aarch64_expand_vec_perm_const_1 (&d);
13326 }
13327
13328 static bool
13329 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13330                                      const unsigned char *sel)
13331 {
13332   struct expand_vec_perm_d d;
13333   unsigned int i, nelt, which;
13334   bool ret;
13335
13336   d.vmode = vmode;
13337   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13338   d.testing_p = true;
13339   memcpy (d.perm, sel, nelt);
13340
13341   /* Calculate whether all elements are in one vector.  */
13342   for (i = which = 0; i < nelt; ++i)
13343     {
13344       unsigned char e = d.perm[i];
13345       gcc_assert (e < 2 * nelt);
13346       which |= (e < nelt ? 1 : 2);
13347     }
13348
13349   /* If all elements are from the second vector, reindex as if from the
13350      first vector.  */
13351   if (which == 2)
13352     for (i = 0; i < nelt; ++i)
13353       d.perm[i] -= nelt;
13354
13355   /* Check whether the mask can be applied to a single vector.  */
13356   d.one_vector_p = (which != 3);
13357
13358   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13359   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13360   if (!d.one_vector_p)
13361     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13362
13363   start_sequence ();
13364   ret = aarch64_expand_vec_perm_const_1 (&d);
13365   end_sequence ();
13366
13367   return ret;
13368 }
13369
13370 rtx
13371 aarch64_reverse_mask (enum machine_mode mode)
13372 {
13373   /* We have to reverse each vector because we dont have
13374      a permuted load that can reverse-load according to ABI rules.  */
13375   rtx mask;
13376   rtvec v = rtvec_alloc (16);
13377   int i, j;
13378   int nunits = GET_MODE_NUNITS (mode);
13379   int usize = GET_MODE_UNIT_SIZE (mode);
13380
13381   gcc_assert (BYTES_BIG_ENDIAN);
13382   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13383
13384   for (i = 0; i < nunits; i++)
13385     for (j = 0; j < usize; j++)
13386       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13387   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13388   return force_reg (V16QImode, mask);
13389 }
13390
13391 /* Implement MODES_TIEABLE_P.  In principle we should always return true.
13392    However due to issues with register allocation it is preferable to avoid
13393    tieing integer scalar and FP scalar modes.  Executing integer operations
13394    in general registers is better than treating them as scalar vector
13395    operations.  This reduces latency and avoids redundant int<->FP moves.
13396    So tie modes if they are either the same class, or vector modes with
13397    other vector modes, vector structs or any scalar mode.
13398 */
13399
13400 bool
13401 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13402 {
13403   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13404     return true;
13405
13406   /* We specifically want to allow elements of "structure" modes to
13407      be tieable to the structure.  This more general condition allows
13408      other rarer situations too.  */
13409   if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13410     return true;
13411
13412   /* Also allow any scalar modes with vectors.  */
13413   if (aarch64_vector_mode_supported_p (mode1)
13414       || aarch64_vector_mode_supported_p (mode2))
13415     return true;
13416
13417   return false;
13418 }
13419
13420 /* Return a new RTX holding the result of moving POINTER forward by
13421    AMOUNT bytes.  */
13422
13423 static rtx
13424 aarch64_move_pointer (rtx pointer, int amount)
13425 {
13426   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13427
13428   return adjust_automodify_address (pointer, GET_MODE (pointer),
13429                                     next, amount);
13430 }
13431
13432 /* Return a new RTX holding the result of moving POINTER forward by the
13433    size of the mode it points to.  */
13434
13435 static rtx
13436 aarch64_progress_pointer (rtx pointer)
13437 {
13438   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13439
13440   return aarch64_move_pointer (pointer, amount);
13441 }
13442
13443 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13444    MODE bytes.  */
13445
13446 static void
13447 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13448                                               machine_mode mode)
13449 {
13450   rtx reg = gen_reg_rtx (mode);
13451
13452   /* "Cast" the pointers to the correct mode.  */
13453   *src = adjust_address (*src, mode, 0);
13454   *dst = adjust_address (*dst, mode, 0);
13455   /* Emit the memcpy.  */
13456   emit_move_insn (reg, *src);
13457   emit_move_insn (*dst, reg);
13458   /* Move the pointers forward.  */
13459   *src = aarch64_progress_pointer (*src);
13460   *dst = aarch64_progress_pointer (*dst);
13461 }
13462
13463 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
13464    we succeed, otherwise return false.  */
13465
13466 bool
13467 aarch64_expand_movmem (rtx *operands)
13468 {
13469   unsigned int n;
13470   rtx dst = operands[0];
13471   rtx src = operands[1];
13472   rtx base;
13473   bool speed_p = !optimize_function_for_size_p (cfun);
13474
13475   /* When optimizing for size, give a better estimate of the length of a
13476      memcpy call, but use the default otherwise.  */
13477   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13478
13479   /* We can't do anything smart if the amount to copy is not constant.  */
13480   if (!CONST_INT_P (operands[2]))
13481     return false;
13482
13483   n = UINTVAL (operands[2]);
13484
13485   /* Try to keep the number of instructions low.  For cases below 16 bytes we
13486      need to make at most two moves.  For cases above 16 bytes it will be one
13487      move for each 16 byte chunk, then at most two additional moves.  */
13488   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13489     return false;
13490
13491   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13492   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13493
13494   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13495   src = adjust_automodify_address (src, VOIDmode, base, 0);
13496
13497   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13498      1-byte chunk.  */
13499   if (n < 4)
13500     {
13501       if (n >= 2)
13502         {
13503           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13504           n -= 2;
13505         }
13506
13507       if (n == 1)
13508         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13509
13510       return true;
13511     }
13512
13513   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
13514      4-byte chunk, partially overlapping with the previously copied chunk.  */
13515   if (n < 8)
13516     {
13517       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13518       n -= 4;
13519       if (n > 0)
13520         {
13521           int move = n - 4;
13522
13523           src = aarch64_move_pointer (src, move);
13524           dst = aarch64_move_pointer (dst, move);
13525           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13526         }
13527       return true;
13528     }
13529
13530   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
13531      them, then (if applicable) an 8-byte chunk.  */
13532   while (n >= 8)
13533     {
13534       if (n / 16)
13535         {
13536           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13537           n -= 16;
13538         }
13539       else
13540         {
13541           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13542           n -= 8;
13543         }
13544     }
13545
13546   /* Finish the final bytes of the copy.  We can always do this in one
13547      instruction.  We either copy the exact amount we need, or partially
13548      overlap with the previous chunk we copied and copy 8-bytes.  */
13549   if (n == 0)
13550     return true;
13551   else if (n == 1)
13552     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13553   else if (n == 2)
13554     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13555   else if (n == 4)
13556     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13557   else
13558     {
13559       if (n == 3)
13560         {
13561           src = aarch64_move_pointer (src, -1);
13562           dst = aarch64_move_pointer (dst, -1);
13563           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13564         }
13565       else
13566         {
13567           int move = n - 8;
13568
13569           src = aarch64_move_pointer (src, move);
13570           dst = aarch64_move_pointer (dst, move);
13571           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13572         }
13573     }
13574
13575   return true;
13576 }
13577
13578 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13579    SImode stores.  Handle the case when the constant has identical
13580    bottom and top halves.  This is beneficial when the two stores can be
13581    merged into an STP and we avoid synthesising potentially expensive
13582    immediates twice.  Return true if such a split is possible.  */
13583
13584 bool
13585 aarch64_split_dimode_const_store (rtx dst, rtx src)
13586 {
13587   rtx lo = gen_lowpart (SImode, src);
13588   rtx hi = gen_highpart_mode (SImode, DImode, src);
13589
13590   bool size_p = optimize_function_for_size_p (cfun);
13591
13592   if (!rtx_equal_p (lo, hi))
13593     return false;
13594
13595   unsigned int orig_cost
13596     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13597   unsigned int lo_cost
13598     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13599
13600   /* We want to transform:
13601      MOV        x1, 49370
13602      MOVK       x1, 0x140, lsl 16
13603      MOVK       x1, 0xc0da, lsl 32
13604      MOVK       x1, 0x140, lsl 48
13605      STR        x1, [x0]
13606    into:
13607      MOV        w1, 49370
13608      MOVK       w1, 0x140, lsl 16
13609      STP        w1, w1, [x0]
13610    So we want to perform this only when we save two instructions
13611    or more.  When optimizing for size, however, accept any code size
13612    savings we can.  */
13613   if (size_p && orig_cost <= lo_cost)
13614     return false;
13615
13616   if (!size_p
13617       && (orig_cost <= lo_cost + 1))
13618     return false;
13619
13620   rtx mem_lo = adjust_address (dst, SImode, 0);
13621   if (!aarch64_mem_pair_operand (mem_lo, SImode))
13622     return false;
13623
13624   rtx tmp_reg = gen_reg_rtx (SImode);
13625   aarch64_expand_mov_immediate (tmp_reg, lo);
13626   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13627   /* Don't emit an explicit store pair as this may not be always profitable.
13628      Let the sched-fusion logic decide whether to merge them.  */
13629   emit_move_insn (mem_lo, tmp_reg);
13630   emit_move_insn (mem_hi, tmp_reg);
13631
13632   return true;
13633 }
13634
13635 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
13636
13637 static unsigned HOST_WIDE_INT
13638 aarch64_asan_shadow_offset (void)
13639 {
13640   return (HOST_WIDE_INT_1 << 36);
13641 }
13642
13643 static bool
13644 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13645                                         unsigned int align,
13646                                         enum by_pieces_operation op,
13647                                         bool speed_p)
13648 {
13649   /* STORE_BY_PIECES can be used when copying a constant string, but
13650      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13651      For now we always fail this and let the move_by_pieces code copy
13652      the string from read-only memory.  */
13653   if (op == STORE_BY_PIECES)
13654     return false;
13655
13656   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13657 }
13658
13659 static rtx
13660 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13661                         int code, tree treeop0, tree treeop1)
13662 {
13663   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13664   rtx op0, op1;
13665   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13666   insn_code icode;
13667   struct expand_operand ops[4];
13668
13669   start_sequence ();
13670   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13671
13672   op_mode = GET_MODE (op0);
13673   if (op_mode == VOIDmode)
13674     op_mode = GET_MODE (op1);
13675
13676   switch (op_mode)
13677     {
13678     case QImode:
13679     case HImode:
13680     case SImode:
13681       cmp_mode = SImode;
13682       icode = CODE_FOR_cmpsi;
13683       break;
13684
13685     case DImode:
13686       cmp_mode = DImode;
13687       icode = CODE_FOR_cmpdi;
13688       break;
13689
13690     case SFmode:
13691       cmp_mode = SFmode;
13692       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13693       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13694       break;
13695
13696     case DFmode:
13697       cmp_mode = DFmode;
13698       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13699       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13700       break;
13701
13702     default:
13703       end_sequence ();
13704       return NULL_RTX;
13705     }
13706
13707   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13708   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13709   if (!op0 || !op1)
13710     {
13711       end_sequence ();
13712       return NULL_RTX;
13713     }
13714   *prep_seq = get_insns ();
13715   end_sequence ();
13716
13717   create_fixed_operand (&ops[0], op0);
13718   create_fixed_operand (&ops[1], op1);
13719
13720   start_sequence ();
13721   if (!maybe_expand_insn (icode, 2, ops))
13722     {
13723       end_sequence ();
13724       return NULL_RTX;
13725     }
13726   *gen_seq = get_insns ();
13727   end_sequence ();
13728
13729   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13730                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13731 }
13732
13733 static rtx
13734 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13735                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
13736 {
13737   rtx op0, op1, target;
13738   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13739   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13740   insn_code icode;
13741   struct expand_operand ops[6];
13742   int aarch64_cond;
13743
13744   push_to_sequence (*prep_seq);
13745   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13746
13747   op_mode = GET_MODE (op0);
13748   if (op_mode == VOIDmode)
13749     op_mode = GET_MODE (op1);
13750
13751   switch (op_mode)
13752     {
13753     case QImode:
13754     case HImode:
13755     case SImode:
13756       cmp_mode = SImode;
13757       icode = CODE_FOR_ccmpsi;
13758       break;
13759
13760     case DImode:
13761       cmp_mode = DImode;
13762       icode = CODE_FOR_ccmpdi;
13763       break;
13764
13765     case SFmode:
13766       cmp_mode = SFmode;
13767       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13768       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13769       break;
13770
13771     case DFmode:
13772       cmp_mode = DFmode;
13773       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13774       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13775       break;
13776
13777     default:
13778       end_sequence ();
13779       return NULL_RTX;
13780     }
13781
13782   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13783   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13784   if (!op0 || !op1)
13785     {
13786       end_sequence ();
13787       return NULL_RTX;
13788     }
13789   *prep_seq = get_insns ();
13790   end_sequence ();
13791
13792   target = gen_rtx_REG (cc_mode, CC_REGNUM);
13793   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13794
13795   if (bit_code != AND)
13796     {
13797       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13798                                                 GET_MODE (XEXP (prev, 0))),
13799                              VOIDmode, XEXP (prev, 0), const0_rtx);
13800       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13801     }
13802
13803   create_fixed_operand (&ops[0], XEXP (prev, 0));
13804   create_fixed_operand (&ops[1], target);
13805   create_fixed_operand (&ops[2], op0);
13806   create_fixed_operand (&ops[3], op1);
13807   create_fixed_operand (&ops[4], prev);
13808   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13809
13810   push_to_sequence (*gen_seq);
13811   if (!maybe_expand_insn (icode, 6, ops))
13812     {
13813       end_sequence ();
13814       return NULL_RTX;
13815     }
13816
13817   *gen_seq = get_insns ();
13818   end_sequence ();
13819
13820   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13821 }
13822
13823 #undef TARGET_GEN_CCMP_FIRST
13824 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13825
13826 #undef TARGET_GEN_CCMP_NEXT
13827 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13828
13829 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
13830    instruction fusion of some sort.  */
13831
13832 static bool
13833 aarch64_macro_fusion_p (void)
13834 {
13835   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13836 }
13837
13838
13839 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
13840    should be kept together during scheduling.  */
13841
13842 static bool
13843 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13844 {
13845   rtx set_dest;
13846   rtx prev_set = single_set (prev);
13847   rtx curr_set = single_set (curr);
13848   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
13849   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13850
13851   if (!aarch64_macro_fusion_p ())
13852     return false;
13853
13854   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13855     {
13856       /* We are trying to match:
13857          prev (mov)  == (set (reg r0) (const_int imm16))
13858          curr (movk) == (set (zero_extract (reg r0)
13859                                            (const_int 16)
13860                                            (const_int 16))
13861                              (const_int imm16_1))  */
13862
13863       set_dest = SET_DEST (curr_set);
13864
13865       if (GET_CODE (set_dest) == ZERO_EXTRACT
13866           && CONST_INT_P (SET_SRC (curr_set))
13867           && CONST_INT_P (SET_SRC (prev_set))
13868           && CONST_INT_P (XEXP (set_dest, 2))
13869           && INTVAL (XEXP (set_dest, 2)) == 16
13870           && REG_P (XEXP (set_dest, 0))
13871           && REG_P (SET_DEST (prev_set))
13872           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13873         {
13874           return true;
13875         }
13876     }
13877
13878   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13879     {
13880
13881       /*  We're trying to match:
13882           prev (adrp) == (set (reg r1)
13883                               (high (symbol_ref ("SYM"))))
13884           curr (add) == (set (reg r0)
13885                              (lo_sum (reg r1)
13886                                      (symbol_ref ("SYM"))))
13887           Note that r0 need not necessarily be the same as r1, especially
13888           during pre-regalloc scheduling.  */
13889
13890       if (satisfies_constraint_Ush (SET_SRC (prev_set))
13891           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13892         {
13893           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13894               && REG_P (XEXP (SET_SRC (curr_set), 0))
13895               && REGNO (XEXP (SET_SRC (curr_set), 0))
13896                  == REGNO (SET_DEST (prev_set))
13897               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13898                               XEXP (SET_SRC (curr_set), 1)))
13899             return true;
13900         }
13901     }
13902
13903   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13904     {
13905
13906       /* We're trying to match:
13907          prev (movk) == (set (zero_extract (reg r0)
13908                                            (const_int 16)
13909                                            (const_int 32))
13910                              (const_int imm16_1))
13911          curr (movk) == (set (zero_extract (reg r0)
13912                                            (const_int 16)
13913                                            (const_int 48))
13914                              (const_int imm16_2))  */
13915
13916       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13917           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13918           && REG_P (XEXP (SET_DEST (prev_set), 0))
13919           && REG_P (XEXP (SET_DEST (curr_set), 0))
13920           && REGNO (XEXP (SET_DEST (prev_set), 0))
13921              == REGNO (XEXP (SET_DEST (curr_set), 0))
13922           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13923           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13924           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13925           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13926           && CONST_INT_P (SET_SRC (prev_set))
13927           && CONST_INT_P (SET_SRC (curr_set)))
13928         return true;
13929
13930     }
13931   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13932     {
13933       /* We're trying to match:
13934           prev (adrp) == (set (reg r0)
13935                               (high (symbol_ref ("SYM"))))
13936           curr (ldr) == (set (reg r1)
13937                              (mem (lo_sum (reg r0)
13938                                              (symbol_ref ("SYM")))))
13939                  or
13940           curr (ldr) == (set (reg r1)
13941                              (zero_extend (mem
13942                                            (lo_sum (reg r0)
13943                                                    (symbol_ref ("SYM"))))))  */
13944       if (satisfies_constraint_Ush (SET_SRC (prev_set))
13945           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13946         {
13947           rtx curr_src = SET_SRC (curr_set);
13948
13949           if (GET_CODE (curr_src) == ZERO_EXTEND)
13950             curr_src = XEXP (curr_src, 0);
13951
13952           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13953               && REG_P (XEXP (XEXP (curr_src, 0), 0))
13954               && REGNO (XEXP (XEXP (curr_src, 0), 0))
13955                  == REGNO (SET_DEST (prev_set))
13956               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13957                               XEXP (SET_SRC (prev_set), 0)))
13958               return true;
13959         }
13960     }
13961
13962   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13963        && aarch_crypto_can_dual_issue (prev, curr))
13964     return true;
13965
13966   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13967       && any_condjump_p (curr))
13968     {
13969       enum attr_type prev_type = get_attr_type (prev);
13970
13971       /* FIXME: this misses some which is considered simple arthematic
13972          instructions for ThunderX.  Simple shifts are missed here.  */
13973       if (prev_type == TYPE_ALUS_SREG
13974           || prev_type == TYPE_ALUS_IMM
13975           || prev_type == TYPE_LOGICS_REG
13976           || prev_type == TYPE_LOGICS_IMM)
13977         return true;
13978     }
13979
13980   return false;
13981 }
13982
13983 /* Return true iff the instruction fusion described by OP is enabled.  */
13984
13985 bool
13986 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13987 {
13988   return (aarch64_tune_params.fusible_ops & op) != 0;
13989 }
13990
13991 /* If MEM is in the form of [base+offset], extract the two parts
13992    of address and set to BASE and OFFSET, otherwise return false
13993    after clearing BASE and OFFSET.  */
13994
13995 bool
13996 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13997 {
13998   rtx addr;
13999
14000   gcc_assert (MEM_P (mem));
14001
14002   addr = XEXP (mem, 0);
14003
14004   if (REG_P (addr))
14005     {
14006       *base = addr;
14007       *offset = const0_rtx;
14008       return true;
14009     }
14010
14011   if (GET_CODE (addr) == PLUS
14012       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14013     {
14014       *base = XEXP (addr, 0);
14015       *offset = XEXP (addr, 1);
14016       return true;
14017     }
14018
14019   *base = NULL_RTX;
14020   *offset = NULL_RTX;
14021
14022   return false;
14023 }
14024
14025 /* Types for scheduling fusion.  */
14026 enum sched_fusion_type
14027 {
14028   SCHED_FUSION_NONE = 0,
14029   SCHED_FUSION_LD_SIGN_EXTEND,
14030   SCHED_FUSION_LD_ZERO_EXTEND,
14031   SCHED_FUSION_LD,
14032   SCHED_FUSION_ST,
14033   SCHED_FUSION_NUM
14034 };
14035
14036 /* If INSN is a load or store of address in the form of [base+offset],
14037    extract the two parts and set to BASE and OFFSET.  Return scheduling
14038    fusion type this INSN is.  */
14039
14040 static enum sched_fusion_type
14041 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14042 {
14043   rtx x, dest, src;
14044   enum sched_fusion_type fusion = SCHED_FUSION_LD;
14045
14046   gcc_assert (INSN_P (insn));
14047   x = PATTERN (insn);
14048   if (GET_CODE (x) != SET)
14049     return SCHED_FUSION_NONE;
14050
14051   src = SET_SRC (x);
14052   dest = SET_DEST (x);
14053
14054   machine_mode dest_mode = GET_MODE (dest);
14055
14056   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14057     return SCHED_FUSION_NONE;
14058
14059   if (GET_CODE (src) == SIGN_EXTEND)
14060     {
14061       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14062       src = XEXP (src, 0);
14063       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14064         return SCHED_FUSION_NONE;
14065     }
14066   else if (GET_CODE (src) == ZERO_EXTEND)
14067     {
14068       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14069       src = XEXP (src, 0);
14070       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14071         return SCHED_FUSION_NONE;
14072     }
14073
14074   if (GET_CODE (src) == MEM && REG_P (dest))
14075     extract_base_offset_in_addr (src, base, offset);
14076   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14077     {
14078       fusion = SCHED_FUSION_ST;
14079       extract_base_offset_in_addr (dest, base, offset);
14080     }
14081   else
14082     return SCHED_FUSION_NONE;
14083
14084   if (*base == NULL_RTX || *offset == NULL_RTX)
14085     fusion = SCHED_FUSION_NONE;
14086
14087   return fusion;
14088 }
14089
14090 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14091
14092    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14093    and PRI are only calculated for these instructions.  For other instruction,
14094    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
14095    type instruction fusion can be added by returning different priorities.
14096
14097    It's important that irrelevant instructions get the largest FUSION_PRI.  */
14098
14099 static void
14100 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14101                                int *fusion_pri, int *pri)
14102 {
14103   int tmp, off_val;
14104   rtx base, offset;
14105   enum sched_fusion_type fusion;
14106
14107   gcc_assert (INSN_P (insn));
14108
14109   tmp = max_pri - 1;
14110   fusion = fusion_load_store (insn, &base, &offset);
14111   if (fusion == SCHED_FUSION_NONE)
14112     {
14113       *pri = tmp;
14114       *fusion_pri = tmp;
14115       return;
14116     }
14117
14118   /* Set FUSION_PRI according to fusion type and base register.  */
14119   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14120
14121   /* Calculate PRI.  */
14122   tmp /= 2;
14123
14124   /* INSN with smaller offset goes first.  */
14125   off_val = (int)(INTVAL (offset));
14126   if (off_val >= 0)
14127     tmp -= (off_val & 0xfffff);
14128   else
14129     tmp += ((- off_val) & 0xfffff);
14130
14131   *pri = tmp;
14132   return;
14133 }
14134
14135 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14136    Adjust priority of sha1h instructions so they are scheduled before
14137    other SHA1 instructions.  */
14138
14139 static int
14140 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14141 {
14142   rtx x = PATTERN (insn);
14143
14144   if (GET_CODE (x) == SET)
14145     {
14146       x = SET_SRC (x);
14147
14148       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14149         return priority + 10;
14150     }
14151
14152   return priority;
14153 }
14154
14155 /* Given OPERANDS of consecutive load/store, check if we can merge
14156    them into ldp/stp.  LOAD is true if they are load instructions.
14157    MODE is the mode of memory operands.  */
14158
14159 bool
14160 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14161                                 enum machine_mode mode)
14162 {
14163   HOST_WIDE_INT offval_1, offval_2, msize;
14164   enum reg_class rclass_1, rclass_2;
14165   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14166
14167   if (load)
14168     {
14169       mem_1 = operands[1];
14170       mem_2 = operands[3];
14171       reg_1 = operands[0];
14172       reg_2 = operands[2];
14173       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14174       if (REGNO (reg_1) == REGNO (reg_2))
14175         return false;
14176     }
14177   else
14178     {
14179       mem_1 = operands[0];
14180       mem_2 = operands[2];
14181       reg_1 = operands[1];
14182       reg_2 = operands[3];
14183     }
14184
14185   /* The mems cannot be volatile.  */
14186   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14187     return false;
14188
14189   /* If we have SImode and slow unaligned ldp,
14190      check the alignment to be at least 8 byte. */
14191   if (mode == SImode
14192       && (aarch64_tune_params.extra_tuning_flags
14193           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14194       && !optimize_size
14195       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14196     return false;
14197
14198   /* Check if the addresses are in the form of [base+offset].  */
14199   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14200   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14201     return false;
14202   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14203   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14204     return false;
14205
14206   /* Check if the bases are same.  */
14207   if (!rtx_equal_p (base_1, base_2))
14208     return false;
14209
14210   offval_1 = INTVAL (offset_1);
14211   offval_2 = INTVAL (offset_2);
14212   msize = GET_MODE_SIZE (mode);
14213   /* Check if the offsets are consecutive.  */
14214   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14215     return false;
14216
14217   /* Check if the addresses are clobbered by load.  */
14218   if (load)
14219     {
14220       if (reg_mentioned_p (reg_1, mem_1))
14221         return false;
14222
14223       /* In increasing order, the last load can clobber the address.  */
14224       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14225       return false;
14226     }
14227
14228   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14229     rclass_1 = FP_REGS;
14230   else
14231     rclass_1 = GENERAL_REGS;
14232
14233   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14234     rclass_2 = FP_REGS;
14235   else
14236     rclass_2 = GENERAL_REGS;
14237
14238   /* Check if the registers are of same class.  */
14239   if (rclass_1 != rclass_2)
14240     return false;
14241
14242   return true;
14243 }
14244
14245 /* Given OPERANDS of consecutive load/store, check if we can merge
14246    them into ldp/stp by adjusting the offset.  LOAD is true if they
14247    are load instructions.  MODE is the mode of memory operands.
14248
14249    Given below consecutive stores:
14250
14251      str  w1, [xb, 0x100]
14252      str  w1, [xb, 0x104]
14253      str  w1, [xb, 0x108]
14254      str  w1, [xb, 0x10c]
14255
14256    Though the offsets are out of the range supported by stp, we can
14257    still pair them after adjusting the offset, like:
14258
14259      add  scratch, xb, 0x100
14260      stp  w1, w1, [scratch]
14261      stp  w1, w1, [scratch, 0x8]
14262
14263    The peephole patterns detecting this opportunity should guarantee
14264    the scratch register is avaliable.  */
14265
14266 bool
14267 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14268                                        enum machine_mode mode)
14269 {
14270   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14271   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14272   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14273   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14274
14275   if (load)
14276     {
14277       reg_1 = operands[0];
14278       mem_1 = operands[1];
14279       reg_2 = operands[2];
14280       mem_2 = operands[3];
14281       reg_3 = operands[4];
14282       mem_3 = operands[5];
14283       reg_4 = operands[6];
14284       mem_4 = operands[7];
14285       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14286                   && REG_P (reg_3) && REG_P (reg_4));
14287       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14288         return false;
14289     }
14290   else
14291     {
14292       mem_1 = operands[0];
14293       reg_1 = operands[1];
14294       mem_2 = operands[2];
14295       reg_2 = operands[3];
14296       mem_3 = operands[4];
14297       reg_3 = operands[5];
14298       mem_4 = operands[6];
14299       reg_4 = operands[7];
14300     }
14301   /* Skip if memory operand is by itslef valid for ldp/stp.  */
14302   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14303     return false;
14304
14305   /* The mems cannot be volatile.  */
14306   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14307       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14308     return false;
14309
14310   /* Check if the addresses are in the form of [base+offset].  */
14311   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14312   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14313     return false;
14314   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14315   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14316     return false;
14317   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14318   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14319     return false;
14320   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14321   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14322     return false;
14323
14324   /* Check if the bases are same.  */
14325   if (!rtx_equal_p (base_1, base_2)
14326       || !rtx_equal_p (base_2, base_3)
14327       || !rtx_equal_p (base_3, base_4))
14328     return false;
14329
14330   offval_1 = INTVAL (offset_1);
14331   offval_2 = INTVAL (offset_2);
14332   offval_3 = INTVAL (offset_3);
14333   offval_4 = INTVAL (offset_4);
14334   msize = GET_MODE_SIZE (mode);
14335   /* Check if the offsets are consecutive.  */
14336   if ((offval_1 != (offval_2 + msize)
14337        || offval_1 != (offval_3 + msize * 2)
14338        || offval_1 != (offval_4 + msize * 3))
14339       && (offval_4 != (offval_3 + msize)
14340           || offval_4 != (offval_2 + msize * 2)
14341           || offval_4 != (offval_1 + msize * 3)))
14342     return false;
14343
14344   /* Check if the addresses are clobbered by load.  */
14345   if (load)
14346     {
14347       if (reg_mentioned_p (reg_1, mem_1)
14348           || reg_mentioned_p (reg_2, mem_2)
14349           || reg_mentioned_p (reg_3, mem_3))
14350         return false;
14351
14352       /* In increasing order, the last load can clobber the address.  */
14353       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14354         return false;
14355     }
14356
14357   /* If we have SImode and slow unaligned ldp,
14358      check the alignment to be at least 8 byte. */
14359   if (mode == SImode
14360       && (aarch64_tune_params.extra_tuning_flags
14361           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14362       && !optimize_size
14363       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14364     return false;
14365
14366   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14367     rclass_1 = FP_REGS;
14368   else
14369     rclass_1 = GENERAL_REGS;
14370
14371   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14372     rclass_2 = FP_REGS;
14373   else
14374     rclass_2 = GENERAL_REGS;
14375
14376   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14377     rclass_3 = FP_REGS;
14378   else
14379     rclass_3 = GENERAL_REGS;
14380
14381   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14382     rclass_4 = FP_REGS;
14383   else
14384     rclass_4 = GENERAL_REGS;
14385
14386   /* Check if the registers are of same class.  */
14387   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14388     return false;
14389
14390   return true;
14391 }
14392
14393 /* Given OPERANDS of consecutive load/store, this function pairs them
14394    into ldp/stp after adjusting the offset.  It depends on the fact
14395    that addresses of load/store instructions are in increasing order.
14396    MODE is the mode of memory operands.  CODE is the rtl operator
14397    which should be applied to all memory operands, it's SIGN_EXTEND,
14398    ZERO_EXTEND or UNKNOWN.  */
14399
14400 bool
14401 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14402                              enum machine_mode mode, RTX_CODE code)
14403 {
14404   rtx base, offset, t1, t2;
14405   rtx mem_1, mem_2, mem_3, mem_4;
14406   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14407
14408   if (load)
14409     {
14410       mem_1 = operands[1];
14411       mem_2 = operands[3];
14412       mem_3 = operands[5];
14413       mem_4 = operands[7];
14414     }
14415   else
14416     {
14417       mem_1 = operands[0];
14418       mem_2 = operands[2];
14419       mem_3 = operands[4];
14420       mem_4 = operands[6];
14421       gcc_assert (code == UNKNOWN);
14422     }
14423
14424   extract_base_offset_in_addr (mem_1, &base, &offset);
14425   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14426
14427   /* Adjust offset thus it can fit in ldp/stp instruction.  */
14428   msize = GET_MODE_SIZE (mode);
14429   stp_off_limit = msize * 0x40;
14430   off_val = INTVAL (offset);
14431   abs_off = (off_val < 0) ? -off_val : off_val;
14432   new_off = abs_off % stp_off_limit;
14433   adj_off = abs_off - new_off;
14434
14435   /* Further adjust to make sure all offsets are OK.  */
14436   if ((new_off + msize * 2) >= stp_off_limit)
14437     {
14438       adj_off += stp_off_limit;
14439       new_off -= stp_off_limit;
14440     }
14441
14442   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
14443   if (adj_off >= 0x1000)
14444     return false;
14445
14446   if (off_val < 0)
14447     {
14448       adj_off = -adj_off;
14449       new_off = -new_off;
14450     }
14451
14452   /* Create new memory references.  */
14453   mem_1 = change_address (mem_1, VOIDmode,
14454                           plus_constant (DImode, operands[8], new_off));
14455
14456   /* Check if the adjusted address is OK for ldp/stp.  */
14457   if (!aarch64_mem_pair_operand (mem_1, mode))
14458     return false;
14459
14460   msize = GET_MODE_SIZE (mode);
14461   mem_2 = change_address (mem_2, VOIDmode,
14462                           plus_constant (DImode,
14463                                          operands[8],
14464                                          new_off + msize));
14465   mem_3 = change_address (mem_3, VOIDmode,
14466                           plus_constant (DImode,
14467                                          operands[8],
14468                                          new_off + msize * 2));
14469   mem_4 = change_address (mem_4, VOIDmode,
14470                           plus_constant (DImode,
14471                                          operands[8],
14472                                          new_off + msize * 3));
14473
14474   if (code == ZERO_EXTEND)
14475     {
14476       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14477       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14478       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14479       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14480     }
14481   else if (code == SIGN_EXTEND)
14482     {
14483       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14484       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14485       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14486       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14487     }
14488
14489   if (load)
14490     {
14491       operands[1] = mem_1;
14492       operands[3] = mem_2;
14493       operands[5] = mem_3;
14494       operands[7] = mem_4;
14495     }
14496   else
14497     {
14498       operands[0] = mem_1;
14499       operands[2] = mem_2;
14500       operands[4] = mem_3;
14501       operands[6] = mem_4;
14502     }
14503
14504   /* Emit adjusting instruction.  */
14505   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14506   /* Emit ldp/stp instructions.  */
14507   t1 = gen_rtx_SET (operands[0], operands[1]);
14508   t2 = gen_rtx_SET (operands[2], operands[3]);
14509   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14510   t1 = gen_rtx_SET (operands[4], operands[5]);
14511   t2 = gen_rtx_SET (operands[6], operands[7]);
14512   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14513   return true;
14514 }
14515
14516 /* Return 1 if pseudo register should be created and used to hold
14517    GOT address for PIC code.  */
14518
14519 bool
14520 aarch64_use_pseudo_pic_reg (void)
14521 {
14522   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14523 }
14524
14525 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
14526
14527 static int
14528 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14529 {
14530   switch (XINT (x, 1))
14531     {
14532     case UNSPEC_GOTSMALLPIC:
14533     case UNSPEC_GOTSMALLPIC28K:
14534     case UNSPEC_GOTTINYPIC:
14535       return 0;
14536     default:
14537       break;
14538     }
14539
14540   return default_unspec_may_trap_p (x, flags);
14541 }
14542
14543
14544 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14545    return the log2 of that value.  Otherwise return -1.  */
14546
14547 int
14548 aarch64_fpconst_pow_of_2 (rtx x)
14549 {
14550   const REAL_VALUE_TYPE *r;
14551
14552   if (!CONST_DOUBLE_P (x))
14553     return -1;
14554
14555   r = CONST_DOUBLE_REAL_VALUE (x);
14556
14557   if (REAL_VALUE_NEGATIVE (*r)
14558       || REAL_VALUE_ISNAN (*r)
14559       || REAL_VALUE_ISINF (*r)
14560       || !real_isinteger (r, DFmode))
14561     return -1;
14562
14563   return exact_log2 (real_to_integer (r));
14564 }
14565
14566 /* If X is a vector of equal CONST_DOUBLE values and that value is
14567    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
14568
14569 int
14570 aarch64_vec_fpconst_pow_of_2 (rtx x)
14571 {
14572   if (GET_CODE (x) != CONST_VECTOR)
14573     return -1;
14574
14575   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14576     return -1;
14577
14578   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14579   if (firstval <= 0)
14580     return -1;
14581
14582   for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14583     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14584       return -1;
14585
14586   return firstval;
14587 }
14588
14589 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14590    to float.
14591
14592    __fp16 always promotes through this hook.
14593    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14594    through the generic excess precision logic rather than here.  */
14595
14596 static tree
14597 aarch64_promoted_type (const_tree t)
14598 {
14599   if (SCALAR_FLOAT_TYPE_P (t)
14600       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14601     return float_type_node;
14602
14603   return NULL_TREE;
14604 }
14605
14606 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
14607
14608 static bool
14609 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14610                            optimization_type opt_type)
14611 {
14612   switch (op)
14613     {
14614     case rsqrt_optab:
14615       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14616
14617     default:
14618       return true;
14619     }
14620 }
14621
14622 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
14623    if MODE is HFmode, and punt to the generic implementation otherwise.  */
14624
14625 static bool
14626 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14627 {
14628   return (mode == HFmode
14629           ? true
14630           : default_libgcc_floating_mode_supported_p (mode));
14631 }
14632
14633 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14634    if MODE is HFmode, and punt to the generic implementation otherwise.  */
14635
14636 static bool
14637 aarch64_scalar_mode_supported_p (machine_mode mode)
14638 {
14639   return (mode == HFmode
14640           ? true
14641           : default_scalar_mode_supported_p (mode));
14642 }
14643
14644 /* Set the value of FLT_EVAL_METHOD.
14645    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14646
14647     0: evaluate all operations and constants, whose semantic type has at
14648        most the range and precision of type float, to the range and
14649        precision of float; evaluate all other operations and constants to
14650        the range and precision of the semantic type;
14651
14652     N, where _FloatN is a supported interchange floating type
14653        evaluate all operations and constants, whose semantic type has at
14654        most the range and precision of _FloatN type, to the range and
14655        precision of the _FloatN type; evaluate all other operations and
14656        constants to the range and precision of the semantic type;
14657
14658    If we have the ARMv8.2-A extensions then we support _Float16 in native
14659    precision, so we should set this to 16.  Otherwise, we support the type,
14660    but want to evaluate expressions in float precision, so set this to
14661    0.  */
14662
14663 static enum flt_eval_method
14664 aarch64_excess_precision (enum excess_precision_type type)
14665 {
14666   switch (type)
14667     {
14668       case EXCESS_PRECISION_TYPE_FAST:
14669       case EXCESS_PRECISION_TYPE_STANDARD:
14670         /* We can calculate either in 16-bit range and precision or
14671            32-bit range and precision.  Make that decision based on whether
14672            we have native support for the ARMv8.2-A 16-bit floating-point
14673            instructions or not.  */
14674         return (TARGET_FP_F16INST
14675                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14676                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14677       case EXCESS_PRECISION_TYPE_IMPLICIT:
14678         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14679       default:
14680         gcc_unreachable ();
14681     }
14682   return FLT_EVAL_METHOD_UNPREDICTABLE;
14683 }
14684
14685 /* Target-specific selftests.  */
14686
14687 #if CHECKING_P
14688
14689 namespace selftest {
14690
14691 /* Selftest for the RTL loader.
14692    Verify that the RTL loader copes with a dump from
14693    print_rtx_function.  This is essentially just a test that class
14694    function_reader can handle a real dump, but it also verifies
14695    that lookup_reg_by_dump_name correctly handles hard regs.
14696    The presence of hard reg names in the dump means that the test is
14697    target-specific, hence it is in this file.  */
14698
14699 static void
14700 aarch64_test_loading_full_dump ()
14701 {
14702   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14703
14704   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14705
14706   rtx_insn *insn_1 = get_insn_by_uid (1);
14707   ASSERT_EQ (NOTE, GET_CODE (insn_1));
14708
14709   rtx_insn *insn_15 = get_insn_by_uid (15);
14710   ASSERT_EQ (INSN, GET_CODE (insn_15));
14711   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14712
14713   /* Verify crtl->return_rtx.  */
14714   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14715   ASSERT_EQ (0, REGNO (crtl->return_rtx));
14716   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14717 }
14718
14719 /* Run all target-specific selftests.  */
14720
14721 static void
14722 aarch64_run_selftests (void)
14723 {
14724   aarch64_test_loading_full_dump ();
14725 }
14726
14727 } // namespace selftest
14728
14729 #endif /* #if CHECKING_P */
14730
14731 #undef TARGET_ADDRESS_COST
14732 #define TARGET_ADDRESS_COST aarch64_address_cost
14733
14734 /* This hook will determines whether unnamed bitfields affect the alignment
14735    of the containing structure.  The hook returns true if the structure
14736    should inherit the alignment requirements of an unnamed bitfield's
14737    type.  */
14738 #undef TARGET_ALIGN_ANON_BITFIELD
14739 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14740
14741 #undef TARGET_ASM_ALIGNED_DI_OP
14742 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14743
14744 #undef TARGET_ASM_ALIGNED_HI_OP
14745 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14746
14747 #undef TARGET_ASM_ALIGNED_SI_OP
14748 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14749
14750 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14751 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14752   hook_bool_const_tree_hwi_hwi_const_tree_true
14753
14754 #undef TARGET_ASM_FILE_START
14755 #define TARGET_ASM_FILE_START aarch64_start_file
14756
14757 #undef TARGET_ASM_OUTPUT_MI_THUNK
14758 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14759
14760 #undef TARGET_ASM_SELECT_RTX_SECTION
14761 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14762
14763 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14764 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14765
14766 #undef TARGET_BUILD_BUILTIN_VA_LIST
14767 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14768
14769 #undef TARGET_CALLEE_COPIES
14770 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14771
14772 #undef TARGET_CAN_ELIMINATE
14773 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14774
14775 #undef TARGET_CAN_INLINE_P
14776 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14777
14778 #undef TARGET_CANNOT_FORCE_CONST_MEM
14779 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14780
14781 #undef TARGET_CASE_VALUES_THRESHOLD
14782 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14783
14784 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14785 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14786
14787 /* Only the least significant bit is used for initialization guard
14788    variables.  */
14789 #undef TARGET_CXX_GUARD_MASK_BIT
14790 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14791
14792 #undef TARGET_C_MODE_FOR_SUFFIX
14793 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14794
14795 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14796 #undef  TARGET_DEFAULT_TARGET_FLAGS
14797 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14798 #endif
14799
14800 #undef TARGET_CLASS_MAX_NREGS
14801 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14802
14803 #undef TARGET_BUILTIN_DECL
14804 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14805
14806 #undef TARGET_BUILTIN_RECIPROCAL
14807 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14808
14809 #undef TARGET_C_EXCESS_PRECISION
14810 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14811
14812 #undef  TARGET_EXPAND_BUILTIN
14813 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14814
14815 #undef TARGET_EXPAND_BUILTIN_VA_START
14816 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14817
14818 #undef TARGET_FOLD_BUILTIN
14819 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14820
14821 #undef TARGET_FUNCTION_ARG
14822 #define TARGET_FUNCTION_ARG aarch64_function_arg
14823
14824 #undef TARGET_FUNCTION_ARG_ADVANCE
14825 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14826
14827 #undef TARGET_FUNCTION_ARG_BOUNDARY
14828 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14829
14830 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14831 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14832
14833 #undef TARGET_FUNCTION_VALUE
14834 #define TARGET_FUNCTION_VALUE aarch64_function_value
14835
14836 #undef TARGET_FUNCTION_VALUE_REGNO_P
14837 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14838
14839 #undef TARGET_FRAME_POINTER_REQUIRED
14840 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14841
14842 #undef TARGET_GIMPLE_FOLD_BUILTIN
14843 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14844
14845 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14846 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14847
14848 #undef  TARGET_INIT_BUILTINS
14849 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
14850
14851 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14852 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14853   aarch64_ira_change_pseudo_allocno_class
14854
14855 #undef TARGET_LEGITIMATE_ADDRESS_P
14856 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14857
14858 #undef TARGET_LEGITIMATE_CONSTANT_P
14859 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14860
14861 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14862 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14863   aarch64_legitimize_address_displacement
14864
14865 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14866 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14867
14868 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14869 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14870 aarch64_libgcc_floating_mode_supported_p
14871
14872 #undef TARGET_MANGLE_TYPE
14873 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14874
14875 #undef TARGET_MEMORY_MOVE_COST
14876 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14877
14878 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14879 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14880
14881 #undef TARGET_MUST_PASS_IN_STACK
14882 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14883
14884 /* This target hook should return true if accesses to volatile bitfields
14885    should use the narrowest mode possible.  It should return false if these
14886    accesses should use the bitfield container type.  */
14887 #undef TARGET_NARROW_VOLATILE_BITFIELD
14888 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14889
14890 #undef  TARGET_OPTION_OVERRIDE
14891 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14892
14893 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14894 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14895   aarch64_override_options_after_change
14896
14897 #undef TARGET_OPTION_SAVE
14898 #define TARGET_OPTION_SAVE aarch64_option_save
14899
14900 #undef TARGET_OPTION_RESTORE
14901 #define TARGET_OPTION_RESTORE aarch64_option_restore
14902
14903 #undef TARGET_OPTION_PRINT
14904 #define TARGET_OPTION_PRINT aarch64_option_print
14905
14906 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14907 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14908
14909 #undef TARGET_SET_CURRENT_FUNCTION
14910 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14911
14912 #undef TARGET_PASS_BY_REFERENCE
14913 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14914
14915 #undef TARGET_PREFERRED_RELOAD_CLASS
14916 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14917
14918 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14919 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14920
14921 #undef TARGET_PROMOTED_TYPE
14922 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14923
14924 #undef TARGET_SECONDARY_RELOAD
14925 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14926
14927 #undef TARGET_SHIFT_TRUNCATION_MASK
14928 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14929
14930 #undef TARGET_SETUP_INCOMING_VARARGS
14931 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14932
14933 #undef TARGET_STRUCT_VALUE_RTX
14934 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
14935
14936 #undef TARGET_REGISTER_MOVE_COST
14937 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14938
14939 #undef TARGET_RETURN_IN_MEMORY
14940 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14941
14942 #undef TARGET_RETURN_IN_MSB
14943 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14944
14945 #undef TARGET_RTX_COSTS
14946 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14947
14948 #undef TARGET_SCALAR_MODE_SUPPORTED_P
14949 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
14950
14951 #undef TARGET_SCHED_ISSUE_RATE
14952 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14953
14954 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14955 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14956   aarch64_sched_first_cycle_multipass_dfa_lookahead
14957
14958 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14959 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14960   aarch64_first_cycle_multipass_dfa_lookahead_guard
14961
14962 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
14963 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
14964   aarch64_get_separate_components
14965
14966 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
14967 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
14968   aarch64_components_for_bb
14969
14970 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
14971 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
14972   aarch64_disqualify_components
14973
14974 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
14975 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
14976   aarch64_emit_prologue_components
14977
14978 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
14979 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
14980   aarch64_emit_epilogue_components
14981
14982 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
14983 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
14984   aarch64_set_handled_components
14985
14986 #undef TARGET_TRAMPOLINE_INIT
14987 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14988
14989 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14990 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14991
14992 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14993 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14994
14995 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
14996 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
14997   aarch64_builtin_support_vector_misalignment
14998
14999 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15000 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15001
15002 #undef TARGET_VECTORIZE_ADD_STMT_COST
15003 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15004
15005 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15006 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15007   aarch64_builtin_vectorization_cost
15008
15009 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15010 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15011
15012 #undef TARGET_VECTORIZE_BUILTINS
15013 #define TARGET_VECTORIZE_BUILTINS
15014
15015 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15016 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15017   aarch64_builtin_vectorized_function
15018
15019 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15020 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15021   aarch64_autovectorize_vector_sizes
15022
15023 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15024 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15025   aarch64_atomic_assign_expand_fenv
15026
15027 /* Section anchor support.  */
15028
15029 #undef TARGET_MIN_ANCHOR_OFFSET
15030 #define TARGET_MIN_ANCHOR_OFFSET -256
15031
15032 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15033    byte offset; we can do much more for larger data types, but have no way
15034    to determine the size of the access.  We assume accesses are aligned.  */
15035 #undef TARGET_MAX_ANCHOR_OFFSET
15036 #define TARGET_MAX_ANCHOR_OFFSET 4095
15037
15038 #undef TARGET_VECTOR_ALIGNMENT
15039 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15040
15041 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15042 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15043   aarch64_simd_vector_alignment_reachable
15044
15045 /* vec_perm support.  */
15046
15047 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15048 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15049   aarch64_vectorize_vec_perm_const_ok
15050
15051 #undef TARGET_INIT_LIBFUNCS
15052 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15053
15054 #undef TARGET_FIXED_CONDITION_CODE_REGS
15055 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15056
15057 #undef TARGET_FLAGS_REGNUM
15058 #define TARGET_FLAGS_REGNUM CC_REGNUM
15059
15060 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15061 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15062
15063 #undef TARGET_ASAN_SHADOW_OFFSET
15064 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15065
15066 #undef TARGET_LEGITIMIZE_ADDRESS
15067 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15068
15069 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15070 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15071   aarch64_use_by_pieces_infrastructure_p
15072
15073 #undef TARGET_CAN_USE_DOLOOP_P
15074 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15075
15076 #undef TARGET_SCHED_ADJUST_PRIORITY
15077 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15078
15079 #undef TARGET_SCHED_MACRO_FUSION_P
15080 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15081
15082 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15083 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15084
15085 #undef TARGET_SCHED_FUSION_PRIORITY
15086 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15087
15088 #undef TARGET_UNSPEC_MAY_TRAP_P
15089 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15090
15091 #undef TARGET_USE_PSEUDO_PIC_REG
15092 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15093
15094 #undef TARGET_PRINT_OPERAND
15095 #define TARGET_PRINT_OPERAND aarch64_print_operand
15096
15097 #undef TARGET_PRINT_OPERAND_ADDRESS
15098 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15099
15100 #undef TARGET_OPTAB_SUPPORTED_P
15101 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15102
15103 #undef TARGET_OMIT_STRUCT_RETURN_REG
15104 #define TARGET_OMIT_STRUCT_RETURN_REG true
15105
15106 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
15107 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15108 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15109
15110 #if CHECKING_P
15111 #undef TARGET_RUN_TARGET_SELFTESTS
15112 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15113 #endif /* #if CHECKING_P */
15114
15115 struct gcc_target targetm = TARGET_INITIALIZER;
15116
15117 #include "gt-aarch64.h"