gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2017 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #define INCLUDE_STRING
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "memmodel.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "cfgloop.h"
  33 #include "df.h"
  34 #include "tm_p.h"
  35 #include "stringpool.h"
  36 #include "optabs.h"
  37 #include "regs.h"
  38 #include "emit-rtl.h"
  39 #include "recog.h"
  40 #include "diagnostic.h"
  41 #include "insn-attr.h"
  42 #include "alias.h"
  43 #include "fold-const.h"
  44 #include "stor-layout.h"
  45 #include "calls.h"
  46 #include "varasm.h"
  47 #include "output.h"
  48 #include "flags.h"
  49 #include "explow.h"
  50 #include "expr.h"
  51 #include "reload.h"
  52 #include "langhooks.h"
  53 #include "opts.h"
  54 #include "params.h"
  55 #include "gimplify.h"
  56 #include "dwarf2.h"
  57 #include "gimple-iterator.h"
  58 #include "tree-vectorizer.h"
  59 #include "aarch64-cost-tables.h"
  60 #include "dumpfile.h"
  61 #include "builtins.h"
  62 #include "rtl-iter.h"
  63 #include "tm-constrs.h"
  64 #include "sched-int.h"
  65 #include "target-globals.h"
  66 #include "common/common-target.h"
  67 #include "selftest.h"
  68 #include "selftest-rtl.h"
  69
  70 /* This file should be included last.  */
  71 #include "target-def.h"
  72
  73 /* Size of a pointer in bytes, computed from POINTER_SIZE (in bits).  */
  74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  75
  76 /* Classifies an address.  Each enumerator names one addressing form:
  77
  78    ADDRESS_REG_IMM
  79        A simple base register plus immediate offset.
  80
  81    ADDRESS_REG_WB
  82        A base register indexed by immediate offset with writeback.
  83
  84    ADDRESS_REG_REG
  85        A base register indexed by (optionally scaled) register.
  86
  87    ADDRESS_REG_UXTW
  88        A base register indexed by (optionally scaled) zero-extended register.
  89
  90    ADDRESS_REG_SXTW
  91        A base register indexed by (optionally scaled) sign-extended register.
  92
  93    ADDRESS_LO_SUM
  94        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  95
  96    ADDRESS_SYMBOLIC
  97        A constant symbolic address, in pc-relative literal pool.  */
  98
  99 enum aarch64_address_type {
 100   ADDRESS_REG_IMM,
 101   ADDRESS_REG_WB,
 102   ADDRESS_REG_REG,
 103   ADDRESS_REG_UXTW,
 104   ADDRESS_REG_SXTW,
 105   ADDRESS_LO_SUM,
 106   ADDRESS_SYMBOLIC
 107 };
 108 /* Decomposed form of an address, tagged by its aarch64_address_type.  NOTE(review): field roles below are inferred from names -- confirm against users.  */
 109 struct aarch64_address_info {
 110   enum aarch64_address_type type;  /* Which addressing form this is.  */
 111   rtx base;  /* Presumably the base register rtx.  */
 112   rtx offset;  /* Presumably the immediate or index-register part.  */
 113   int shift;  /* Presumably the scaling shift for an index register.  */
 114   enum aarch64_symbol_type symbol_type;  /* Symbol classification; presumably for symbolic forms.  */
 115 };
 116 /* Description of a SIMD immediate.  NOTE(review): field roles are inferred from names -- confirm against users.  */
 117 struct simd_immediate_info
 118 {
 119   rtx value;  /* The immediate value, as an rtx.  */
 120   int shift;  /* Presumably the shift applied to VALUE.  */
 121   int element_width;  /* Presumably the element width in bits -- TODO confirm units.  */
 122   bool mvn;  /* Presumably true when the inverted (MVNI-style) form is used.  */
 123   bool msl;  /* Presumably true when an MSL rather than LSL shift is used.  */
 124 };
 125
 126 /* The code model currently in effect; defined here without an explicit initialiser.  */
 127 enum aarch64_code_model aarch64_cmodel;
 128 /* When HAVE_AS_TLS is defined, replace any earlier TARGET_HAVE_TLS definition with 1.  */
 129 #ifdef HAVE_AS_TLS
 130 #undef TARGET_HAVE_TLS
 131 #define TARGET_HAVE_TLS 1
 132 #endif
 133 /* Forward declarations of file-local (static) functions.  */
 134 static bool aarch64_composite_type_p (const_tree, machine_mode);
 135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 136                                                      const_tree,
 137                                                      machine_mode *, int *,
 138                                                      bool *);
 139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 141 static void aarch64_override_options_after_change (void);
 142 static bool aarch64_vector_mode_supported_p (machine_mode);
 143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 144                                                  const unsigned char *sel);
 145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 147                                                          const_tree type,
 148                                                          int misalignment,
 149                                                          bool is_packed);
 150
 151 /* Major revision number of the ARM Architecture implemented by the target.  */
 152 unsigned aarch64_architecture_version;
 153
 154 /* The processor for which instructions should be scheduled; initially cortexa53.  */
 155 enum aarch64_processor aarch64_tune = cortexa53;
 156
 157 /* Mask to specify which instruction scheduling options should be used; initially 0.  */
 158 unsigned long aarch64_tune_flags = 0;
 159
 160 /* Global flag for PC relative loads; presumably true when literals are loaded pc-relatively -- confirm against the code that sets it.  */
 161 bool aarch64_pcrelative_literal_loads;
 162
 163 /* Support for command line parsing of boolean flags in the tuning
 164    structures.  Each entry pairs a flag's textual name with its bit value.  */
 165 struct aarch64_flag_desc
 166 {
 167   const char* name;  /* Textual name of the flag; NULL in the terminating table entries below.  */
 168   unsigned int flag;  /* Mask value associated with NAME.  */
 169 };
 170 /* Names of fusion pairs and their AARCH64_FUSE_* masks: "none", the entries of aarch64-fusion-pairs.def, "all", then a NULL-named terminator.  */
 171 #define AARCH64_FUSION_PAIR(name, internal_name) \
 172   { name, AARCH64_FUSE_##internal_name },
 173 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 174 {
 175   { "none", AARCH64_FUSE_NOTHING },
 176 #include "aarch64-fusion-pairs.def"
 177   { "all", AARCH64_FUSE_ALL },
 178   { NULL, AARCH64_FUSE_NOTHING }
 179 };
 180 /* Names of extra tuning options and their AARCH64_EXTRA_TUNE_* masks: "none", the entries of aarch64-tuning-flags.def, "all", then a NULL-named terminator.  */
 181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 182   { name, AARCH64_EXTRA_TUNE_##internal_name },
 183 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 184 {
 185   { "none", AARCH64_EXTRA_TUNE_NONE },
 186 #include "aarch64-tuning-flags.def"
 187   { "all", AARCH64_EXTRA_TUNE_ALL },
 188   { NULL, AARCH64_EXTRA_TUNE_NONE }
 189 };
 190
 191 /* Tuning parameters.  */
 192 /* Generic address costs: every entry is zero.  */
 193 static const struct cpu_addrcost_table generic_addrcost_table =
 194 {
 195     {
 196       0, /* hi  */
 197       0, /* si  */
 198       0, /* di  */
 199       0, /* ti  */
 200     },
 201   0, /* pre_modify  */
 202   0, /* post_modify  */
 203   0, /* register_offset  */
 204   0, /* register_sextend  */
 205   0, /* register_zextend  */
 206   0 /* imm_offset  */
 207 };
 208 /* Address costs for Cortex-A57: only the hi and ti entries are non-zero.  */
 209 static const struct cpu_addrcost_table cortexa57_addrcost_table =
 210 {
 211     {
 212       1, /* hi  */
 213       0, /* si  */
 214       0, /* di  */
 215       1, /* ti  */
 216     },
 217   0, /* pre_modify  */
 218   0, /* post_modify  */
 219   0, /* register_offset  */
 220   0, /* register_sextend  */
 221   0, /* register_zextend  */
 222   0, /* imm_offset  */
 223 };
 224 /* Address costs for Exynos M1: register-indexed forms and the ti entry carry a cost.  */
 225 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 226 {
 227     {
 228       0, /* hi  */
 229       0, /* si  */
 230       0, /* di  */
 231       2, /* ti  */
 232     },
 233   0, /* pre_modify  */
 234   0, /* post_modify  */
 235   1, /* register_offset  */
 236   1, /* register_sextend  */
 237   2, /* register_zextend  */
 238   0, /* imm_offset  */
 239 };
 240 /* Address costs for X-Gene 1: pre_modify and extended-register forms carry a cost.  */
 241 static const struct cpu_addrcost_table xgene1_addrcost_table =
 242 {
 243     {
 244       1, /* hi  */
 245       0, /* si  */
 246       0, /* di  */
 247       1, /* ti  */
 248     },
 249   1, /* pre_modify  */
 250   0, /* post_modify  */
 251   0, /* register_offset  */
 252   1, /* register_sextend  */
 253   1, /* register_zextend  */
 254   0, /* imm_offset  */
 255 };
 256 /* Address costs for QDF24xx: same values as the Cortex-A57 table.  */
 257 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
 258 {
 259     {
 260       1, /* hi  */
 261       0, /* si  */
 262       0, /* di  */
 263       1, /* ti  */
 264     },
 265   0, /* pre_modify  */
 266   0, /* post_modify  */
 267   0, /* register_offset  */
 268   0, /* register_sextend  */
 269   0, /* register_zextend  */
 270   0 /* imm_offset  */
 271 };
 272 /* Address costs for ThunderX2 T99: all four leading entries and the register-indexed forms carry a cost.  */
 273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 274 {
 275     {
 276       1, /* hi  */
 277       1, /* si  */
 278       1, /* di  */
 279       2, /* ti  */
 280     },
 281   0, /* pre_modify  */
 282   0, /* post_modify  */
 283   2, /* register_offset  */
 284   3, /* register_sextend  */
 285   3, /* register_zextend  */
 286   0, /* imm_offset  */
 287 };
 288 /* Generic register-move costs.  */
 289 static const struct cpu_regmove_cost generic_regmove_cost =
 290 {
 291   1, /* GP2GP  */
 292   /* Avoid the use of slow int<->fp moves for spilling by setting
 293      their cost higher than memmov_cost.  */
 294   5, /* GP2FP  */
 295   5, /* FP2GP  */
 296   2 /* FP2FP  */
 297 };
 298 /* Register-move costs for Cortex-A57 (same values as the generic table).  */
 299 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 300 {
 301   1, /* GP2GP  */
 302   /* Avoid the use of slow int<->fp moves for spilling by setting
 303      their cost higher than memmov_cost.  */
 304   5, /* GP2FP  */
 305   5, /* FP2GP  */
 306   2 /* FP2FP  */
 307 };
 308 /* Register-move costs for Cortex-A53 (same values as the generic table).  */
 309 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 310 {
 311   1, /* GP2GP  */
 312   /* Avoid the use of slow int<->fp moves for spilling by setting
 313      their cost higher than memmov_cost.  */
 314   5, /* GP2FP  */
 315   5, /* FP2GP  */
 316   2 /* FP2FP  */
 317 };
 318 /* Register-move costs for Exynos M1.  */
 319 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 320 {
 321   1, /* GP2GP  */
 322   /* Avoid the use of slow int<->fp moves for spilling by setting
 323      their cost higher than memmov_cost (actual, 4 and 9).  */
 324   9, /* GP2FP  */
 325   9, /* FP2GP  */
 326   1 /* FP2FP  */
 327 };
 328 /* Register-move costs for ThunderX; unlike the other tables here, GP2FP and FP2GP differ.  */
 329 static const struct cpu_regmove_cost thunderx_regmove_cost =
 330 {
 331   2, /* GP2GP  */
 332   2, /* GP2FP  */
 333   6, /* FP2GP  */
 334   4 /* FP2FP  */
 335 };
 336 /* Register-move costs for X-Gene 1.  */
 337 static const struct cpu_regmove_cost xgene1_regmove_cost =
 338 {
 339   1, /* GP2GP  */
 340   /* Avoid the use of slow int<->fp moves for spilling by setting
 341      their cost higher than memmov_cost.  */
 342   8, /* GP2FP  */
 343   8, /* FP2GP  */
 344   2 /* FP2FP  */
 345 };
 346 /* Register-move costs for QDF24xx.  */
 347 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 348 {
 349   2, /* GP2GP  */
 350   /* Avoid the use of int<->fp moves for spilling.  */
 351   6, /* GP2FP  */
 352   6, /* FP2GP  */
 353   4 /* FP2FP  */
 354 };
 355 /* Register-move costs for ThunderX2 T99.  */
 356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 357 {
 358   1, /* GP2GP  */
 359   /* Avoid the use of int<->fp moves for spilling.  */
 360   8, /* GP2FP  */
 361   8, /* FP2GP  */
 362   4  /* FP2FP  */
 363 };
 364
 365 /* Generic costs for vector insn classes; also referenced by the generic, Cortex-A35, Cortex-A53 and QDF24xx tunings.  */
 366 static const struct cpu_vector_cost generic_vector_cost =
 367 {
 368   1, /* scalar_int_stmt_cost  */
 369   1, /* scalar_fp_stmt_cost  */
 370   1, /* scalar_load_cost  */
 371   1, /* scalar_store_cost  */
 372   1, /* vec_int_stmt_cost  */
 373   1, /* vec_fp_stmt_cost  */
 374   2, /* vec_permute_cost  */
 375   1, /* vec_to_scalar_cost  */
 376   1, /* scalar_to_vec_cost  */
 377   1, /* vec_align_load_cost  */
 378   1, /* vec_unalign_load_cost  */
 379   1, /* vec_unalign_store_cost  */
 380   1, /* vec_store_cost  */
 381   3, /* cond_taken_branch_cost  */
 382   1 /* cond_not_taken_branch_cost  */
 383 };
 384
 385 /* ThunderX costs for vector insn classes; unaligned vector loads and stores are the most expensive entries.  */
 386 static const struct cpu_vector_cost thunderx_vector_cost =
 387 {
 388   1, /* scalar_int_stmt_cost  */
 389   1, /* scalar_fp_stmt_cost  */
 390   3, /* scalar_load_cost  */
 391   1, /* scalar_store_cost  */
 392   4, /* vec_int_stmt_cost  */
 393   4, /* vec_fp_stmt_cost  */
 394   4, /* vec_permute_cost  */
 395   2, /* vec_to_scalar_cost  */
 396   2, /* scalar_to_vec_cost  */
 397   3, /* vec_align_load_cost  */
 398   10, /* vec_unalign_load_cost  */
 399   10, /* vec_unalign_store_cost  */
 400   1, /* vec_store_cost  */
 401   3, /* cond_taken_branch_cost  */
 402   3 /* cond_not_taken_branch_cost  */
 403 };
 404
 405 /* Cortex-A57 costs for vector insn classes.  */
 406 static const struct cpu_vector_cost cortexa57_vector_cost =
 407 {
 408   1, /* scalar_int_stmt_cost  */
 409   1, /* scalar_fp_stmt_cost  */
 410   4, /* scalar_load_cost  */
 411   1, /* scalar_store_cost  */
 412   2, /* vec_int_stmt_cost  */
 413   2, /* vec_fp_stmt_cost  */
 414   3, /* vec_permute_cost  */
 415   8, /* vec_to_scalar_cost  */
 416   8, /* scalar_to_vec_cost  */
 417   4, /* vec_align_load_cost  */
 418   4, /* vec_unalign_load_cost  */
 419   1, /* vec_unalign_store_cost  */
 420   1, /* vec_store_cost  */
 421   1, /* cond_taken_branch_cost  */
 422   1 /* cond_not_taken_branch_cost  */
 423 };
 424 /* Exynos M1 costs for vector insn classes.  */
 425 static const struct cpu_vector_cost exynosm1_vector_cost =
 426 {
 427   1, /* scalar_int_stmt_cost  */
 428   1, /* scalar_fp_stmt_cost  */
 429   5, /* scalar_load_cost  */
 430   1, /* scalar_store_cost  */
 431   3, /* vec_int_stmt_cost  */
 432   3, /* vec_fp_stmt_cost  */
 433   3, /* vec_permute_cost  */
 434   3, /* vec_to_scalar_cost  */
 435   3, /* scalar_to_vec_cost  */
 436   5, /* vec_align_load_cost  */
 437   5, /* vec_unalign_load_cost  */
 438   1, /* vec_unalign_store_cost  */
 439   1, /* vec_store_cost  */
 440   1, /* cond_taken_branch_cost  */
 441   1 /* cond_not_taken_branch_cost  */
 442 };
 443
 444 /* X-Gene 1 costs for vector insn classes.  */
 445 static const struct cpu_vector_cost xgene1_vector_cost =
 446 {
 447   1, /* scalar_int_stmt_cost  */
 448   1, /* scalar_fp_stmt_cost  */
 449   5, /* scalar_load_cost  */
 450   1, /* scalar_store_cost  */
 451   2, /* vec_int_stmt_cost  */
 452   2, /* vec_fp_stmt_cost  */
 453   2, /* vec_permute_cost  */
 454   4, /* vec_to_scalar_cost  */
 455   4, /* scalar_to_vec_cost  */
 456   10, /* vec_align_load_cost  */
 457   10, /* vec_unalign_load_cost  */
 458   2, /* vec_unalign_store_cost  */
 459   2, /* vec_store_cost  */
 460   2, /* cond_taken_branch_cost  */
 461   1 /* cond_not_taken_branch_cost  */
 462 };
 463
 464 /* Costs for vector insn classes for ThunderX2 T99 (Vulcan).  */
 465 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 466 {
 467   1, /* scalar_int_stmt_cost  */
 468   6, /* scalar_fp_stmt_cost  */
 469   4, /* scalar_load_cost  */
 470   1, /* scalar_store_cost  */
 471   5, /* vec_int_stmt_cost  */
 472   6, /* vec_fp_stmt_cost  */
 473   3, /* vec_permute_cost  */
 474   6, /* vec_to_scalar_cost  */
 475   5, /* scalar_to_vec_cost  */
 476   8, /* vec_align_load_cost  */
 477   8, /* vec_unalign_load_cost  */
 478   4, /* vec_unalign_store_cost  */
 479   4, /* vec_store_cost  */
 480   2, /* cond_taken_branch_cost  */
 481   1  /* cond_not_taken_branch_cost  */
 482 };
 483
 484 /* Generic costs for branch instructions: predictable and unpredictable branches cost the same.  */
 485 static const struct cpu_branch_cost generic_branch_cost =
 486 {
 487   2,  /* Predictable.  */
 488   2   /* Unpredictable.  */
 489 };
 490
 491 /* Branch costs for Cortex-A57; also referenced by the Cortex-A35, A53, A72 and A73 tunings.  */
 492 static const struct cpu_branch_cost cortexa57_branch_cost =
 493 {
 494   1,  /* Predictable.  */
 495   3   /* Unpredictable.  */
 496 };
 497
 498 /* Branch costs for ThunderX2 T99 (Vulcan); same values as the Cortex-A57 table.  */
 499 static const struct cpu_branch_cost thunderx2t99_branch_cost =
 500 {
 501   1,  /* Predictable.  */
 502   3   /* Unpredictable.  */
 503 };
 504
 505 /* Generic approximation modes: AARCH64_APPROX_NONE for every operation.  */
 506 static const cpu_approx_modes generic_approx_modes =
 507 {
 508   AARCH64_APPROX_NONE,  /* division  */
 509   AARCH64_APPROX_NONE,  /* sqrt  */
 510   AARCH64_APPROX_NONE   /* recip_sqrt  */
 511 };
 512
 513 /* Approximation modes for Exynos M1: AARCH64_APPROX_ALL for sqrt and recip_sqrt, none for division.  */
 514 static const cpu_approx_modes exynosm1_approx_modes =
 515 {
 516   AARCH64_APPROX_NONE,  /* division  */
 517   AARCH64_APPROX_ALL,   /* sqrt  */
 518   AARCH64_APPROX_ALL    /* recip_sqrt  */
 519 };
 520
 521 /* Approximation modes for X-Gene 1: AARCH64_APPROX_ALL for recip_sqrt only.  */
 522 static const cpu_approx_modes xgene1_approx_modes =
 523 {
 524   AARCH64_APPROX_NONE,  /* division  */
 525   AARCH64_APPROX_NONE,  /* sqrt  */
 526   AARCH64_APPROX_ALL    /* recip_sqrt  */
 527 };
 528 /* Generic tuning parameters; used for the "generic" core entry and as the initial value of aarch64_tune_params.  */
 529 static const struct tune_params generic_tunings =
 530 {
 531   &cortexa57_extra_costs,
 532   &generic_addrcost_table,
 533   &generic_regmove_cost,
 534   &generic_vector_cost,
 535   &generic_branch_cost,
 536   &generic_approx_modes,
 537   4, /* memmov_cost  */
 538   2, /* issue_rate  */
 539   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 540   8,    /* function_align.  */
 541   8,    /* jump_align.  */
 542   4,    /* loop_align.  */
 543   2,    /* int_reassoc_width.  */
 544   4,    /* fp_reassoc_width.  */
 545   1,    /* vec_reassoc_width.  */
 546   2,    /* min_div_recip_mul_sf.  */
 547   2,    /* min_div_recip_mul_df.  */
 548   0,    /* max_case_values.  */
 549   0,    /* cache_line_size.  */
 550   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 551   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 552 };
 553 /* Tuning parameters for Cortex-A35; identical to the Cortex-A53 tunings except for an issue rate of 1.  */
 554 static const struct tune_params cortexa35_tunings =
 555 {
 556   &cortexa53_extra_costs,
 557   &generic_addrcost_table,
 558   &cortexa53_regmove_cost,
 559   &generic_vector_cost,
 560   &cortexa57_branch_cost,
 561   &generic_approx_modes,
 562   4, /* memmov_cost  */
 563   1, /* issue_rate  */
 564   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 565    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 566   16,   /* function_align.  */
 567   8,    /* jump_align.  */
 568   8,    /* loop_align.  */
 569   2,    /* int_reassoc_width.  */
 570   4,    /* fp_reassoc_width.  */
 571   1,    /* vec_reassoc_width.  */
 572   2,    /* min_div_recip_mul_sf.  */
 573   2,    /* min_div_recip_mul_df.  */
 574   0,    /* max_case_values.  */
 575   0,    /* cache_line_size.  */
 576   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 577   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 578 };
 579 /* Tuning parameters for Cortex-A53; uses the generic address and vector costs with the Cortex-A57 branch costs.  */
 580 static const struct tune_params cortexa53_tunings =
 581 {
 582   &cortexa53_extra_costs,
 583   &generic_addrcost_table,
 584   &cortexa53_regmove_cost,
 585   &generic_vector_cost,
 586   &cortexa57_branch_cost,
 587   &generic_approx_modes,
 588   4, /* memmov_cost  */
 589   2, /* issue_rate  */
 590   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 591    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 592   16,   /* function_align.  */
 593   8,    /* jump_align.  */
 594   8,    /* loop_align.  */
 595   2,    /* int_reassoc_width.  */
 596   4,    /* fp_reassoc_width.  */
 597   1,    /* vec_reassoc_width.  */
 598   2,    /* min_div_recip_mul_sf.  */
 599   2,    /* min_div_recip_mul_df.  */
 600   0,    /* max_case_values.  */
 601   0,    /* cache_line_size.  */
 602   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 603   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 604 };
 605 /* Tuning parameters for Cortex-A57; sets the AARCH64_EXTRA_TUNE_RENAME_FMA_REGS tune flag.  */
 606 static const struct tune_params cortexa57_tunings =
 607 {
 608   &cortexa57_extra_costs,
 609   &cortexa57_addrcost_table,
 610   &cortexa57_regmove_cost,
 611   &cortexa57_vector_cost,
 612   &cortexa57_branch_cost,
 613   &generic_approx_modes,
 614   4, /* memmov_cost  */
 615   3, /* issue_rate  */
 616   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 617    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 618   16,   /* function_align.  */
 619   8,    /* jump_align.  */
 620   8,    /* loop_align.  */
 621   2,    /* int_reassoc_width.  */
 622   4,    /* fp_reassoc_width.  */
 623   1,    /* vec_reassoc_width.  */
 624   2,    /* min_div_recip_mul_sf.  */
 625   2,    /* min_div_recip_mul_df.  */
 626   0,    /* max_case_values.  */
 627   0,    /* cache_line_size.  */
 628   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 629   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)  /* tune_flags.  */
 630 };
 631 /* Tuning parameters for Cortex-A72; same as the Cortex-A57 tunings except that no extra tune flags are set.  */
 632 static const struct tune_params cortexa72_tunings =
 633 {
 634   &cortexa57_extra_costs,
 635   &cortexa57_addrcost_table,
 636   &cortexa57_regmove_cost,
 637   &cortexa57_vector_cost,
 638   &cortexa57_branch_cost,
 639   &generic_approx_modes,
 640   4, /* memmov_cost  */
 641   3, /* issue_rate  */
 642   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 643    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 644   16,   /* function_align.  */
 645   8,    /* jump_align.  */
 646   8,    /* loop_align.  */
 647   2,    /* int_reassoc_width.  */
 648   4,    /* fp_reassoc_width.  */
 649   1,    /* vec_reassoc_width.  */
 650   2,    /* min_div_recip_mul_sf.  */
 651   2,    /* min_div_recip_mul_df.  */
 652   0,    /* max_case_values.  */
 653   0,    /* cache_line_size.  */
 654   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 655   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 656 };
 657 /* Tuning parameters for Cortex-A73; built on the Cortex-A57 cost tables, with issue rate 2 and ADRP+LDR fusion added.  */
 658 static const struct tune_params cortexa73_tunings =
 659 {
 660   &cortexa57_extra_costs,
 661   &cortexa57_addrcost_table,
 662   &cortexa57_regmove_cost,
 663   &cortexa57_vector_cost,
 664   &cortexa57_branch_cost,
 665   &generic_approx_modes,
 666   4, /* memmov_cost.  */
 667   2, /* issue_rate.  */
 668   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 669    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 670   16,   /* function_align.  */
 671   8,    /* jump_align.  */
 672   8,    /* loop_align.  */
 673   2,    /* int_reassoc_width.  */
 674   4,    /* fp_reassoc_width.  */
 675   1,    /* vec_reassoc_width.  */
 676   2,    /* min_div_recip_mul_sf.  */
 677   2,    /* min_div_recip_mul_df.  */
 678   0,    /* max_case_values.  */
 679   0,    /* cache_line_size.  */
 680   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 681   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 682 };
 683 /* Tuning parameters for Exynos M1; the only tuning here with a non-zero max_case_values.  */
 684 static const struct tune_params exynosm1_tunings =
 685 {
 686   &exynosm1_extra_costs,
 687   &exynosm1_addrcost_table,
 688   &exynosm1_regmove_cost,
 689   &exynosm1_vector_cost,
 690   &generic_branch_cost,
 691   &exynosm1_approx_modes,
 692   4,    /* memmov_cost  */
 693   3,    /* issue_rate  */
 694   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 695   4,    /* function_align.  */
 696   4,    /* jump_align.  */
 697   4,    /* loop_align.  */
 698   2,    /* int_reassoc_width.  */
 699   4,    /* fp_reassoc_width.  */
 700   1,    /* vec_reassoc_width.  */
 701   2,    /* min_div_recip_mul_sf.  */
 702   2,    /* min_div_recip_mul_df.  */
 703   48,   /* max_case_values.  */
 704   64,   /* cache_line_size.  */
 705   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 706   (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
 707 };
 708 /* Tuning parameters for ThunderX; sets the AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW tune flag.  */
 709 static const struct tune_params thunderx_tunings =
 710 {
 711   &thunderx_extra_costs,
 712   &generic_addrcost_table,
 713   &thunderx_regmove_cost,
 714   &thunderx_vector_cost,
 715   &generic_branch_cost,
 716   &generic_approx_modes,
 717   6, /* memmov_cost  */
 718   2, /* issue_rate  */
 719   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 720   8,    /* function_align.  */
 721   8,    /* jump_align.  */
 722   8,    /* loop_align.  */
 723   2,    /* int_reassoc_width.  */
 724   4,    /* fp_reassoc_width.  */
 725   1,    /* vec_reassoc_width.  */
 726   2,    /* min_div_recip_mul_sf.  */
 727   2,    /* min_div_recip_mul_df.  */
 728   0,    /* max_case_values.  */
 729   0,    /* cache_line_size.  */
 730   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 731   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)      /* tune_flags.  */
 732 };
 733 /* Tuning parameters for X-Gene 1; no instruction fusion is enabled.  */
 734 static const struct tune_params xgene1_tunings =
 735 {
 736   &xgene1_extra_costs,
 737   &xgene1_addrcost_table,
 738   &xgene1_regmove_cost,
 739   &xgene1_vector_cost,
 740   &generic_branch_cost,
 741   &xgene1_approx_modes,
 742   6, /* memmov_cost  */
 743   4, /* issue_rate  */
 744   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 745   16,   /* function_align.  */
 746   8,    /* jump_align.  */
 747   16,   /* loop_align.  */
 748   2,    /* int_reassoc_width.  */
 749   4,    /* fp_reassoc_width.  */
 750   1,    /* vec_reassoc_width.  */
 751   2,    /* min_div_recip_mul_sf.  */
 752   2,    /* min_div_recip_mul_df.  */
 753   0,    /* max_case_values.  */
 754   0,    /* cache_line_size.  */
 755   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 756   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 757 };
 758 /* Tuning parameters for QDF24xx; the only tuning here using AUTOPREFETCHER_STRONG.  */
 759 static const struct tune_params qdf24xx_tunings =
 760 {
 761   &qdf24xx_extra_costs,
 762   &qdf24xx_addrcost_table,
 763   &qdf24xx_regmove_cost,
 764   &generic_vector_cost,
 765   &generic_branch_cost,
 766   &generic_approx_modes,
 767   4, /* memmov_cost  */
 768   4, /* issue_rate  */
 769   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 770    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 771   16,   /* function_align.  */
 772   8,    /* jump_align.  */
 773   16,   /* loop_align.  */
 774   2,    /* int_reassoc_width.  */
 775   4,    /* fp_reassoc_width.  */
 776   1,    /* vec_reassoc_width.  */
 777   2,    /* min_div_recip_mul_sf.  */
 778   2,    /* min_div_recip_mul_df.  */
 779   0,    /* max_case_values.  */
 780   64,   /* cache_line_size.  */
 781   tune_params::AUTOPREFETCHER_STRONG,   /* autoprefetcher_model.  */
 782   (AARCH64_EXTRA_TUNE_NONE)             /* tune_flags.  */
 783 };
 784 /* Tuning parameters for ThunderX2 T99; the only tuning here whose reassociation widths differ from 2/4/1.  */
 785 static const struct tune_params thunderx2t99_tunings =
 786 {
 787   &thunderx2t99_extra_costs,
 788   &thunderx2t99_addrcost_table,
 789   &thunderx2t99_regmove_cost,
 790   &thunderx2t99_vector_cost,
 791   &thunderx2t99_branch_cost,
 792   &generic_approx_modes,
 793   4, /* memmov_cost.  */
 794   4, /* issue_rate.  */
 795   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 796   16,   /* function_align.  */
 797   8,    /* jump_align.  */
 798   16,   /* loop_align.  */
 799   3,    /* int_reassoc_width.  */
 800   2,    /* fp_reassoc_width.  */
 801   2,    /* vec_reassoc_width.  */
 802   2,    /* min_div_recip_mul_sf.  */
 803   2,    /* min_div_recip_mul_df.  */
 804   0,    /* max_case_values.  */
 805   64,   /* cache_line_size.  */
 806   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 807   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 808 };
 809
 810 /* Support for fine-grained override of the tuning structures: pairs an override name with the function that parses its value.  */
 811 struct aarch64_tuning_override_function
 812 {
 813   const char* name;  /* Name of the override; NULL in the terminating table entry.  */
 814   void (*parse_override)(const char*, struct tune_params*);  /* Parser taking a string and the tune_params to update.  */
 815 };
 816 /* Forward declarations of the parsers registered in aarch64_tuning_override_functions.  */
 817 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 818 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 819 /* Table of supported tuning overrides ("fuse" and "tune"), terminated by a { NULL, NULL } entry.  */
 820 static const struct aarch64_tuning_override_function
 821 aarch64_tuning_override_functions[] =
 822 {
 823   { "fuse", aarch64_parse_fuse_string },
 824   { "tune", aarch64_parse_tune_string },
 825   { NULL, NULL }
 826 };
 827
 828 /* A processor implementing AArch64.  Used for entries of both all_architectures and all_cores.  */
 829 struct processor
 830 {
 831   const char *const name;  /* Entry name; NULL in the terminating table entries.  */
 832   enum aarch64_processor ident;  /* Identifier of this processor.  */
 833   enum aarch64_processor sched_core;  /* Presumably the core whose scheduling description is used -- confirm.  */
 834   enum aarch64_arch arch;  /* Architecture this entry belongs to.  */
 835   unsigned architecture_version;  /* Architecture revision number for this entry.  */
 836   const unsigned long flags;  /* Feature flags for this entry.  */
 837   const struct tune_params *const tune;  /* Tuning parameters; NULL for all_architectures entries.  */
 838 };
 839
 840 /* Architectures implementing AArch64: one entry per AARCH64_ARCH in aarch64-arches.def (tune is NULL), then a NULL-named terminator.  */
 841 static const struct processor all_architectures[] =
 842 {
 843 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 844   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 845 #include "aarch64-arches.def"
 846   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 847 };
 848
 849 /* Processor cores implementing AArch64: one entry per AARCH64_CORE in aarch64-cores.def, then "generic", then a NULL-named terminator.  */
 850 static const struct processor all_cores[] =
 851 {
 852 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
 853   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 854   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 855   FLAGS, &COSTS##_tunings},
 856 #include "aarch64-cores.def"
 857   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 858     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 859   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 860 };
 861
 862
 863 /* Target specification.  These file-local pointers are populated by the
 864    -march, -mtune, -mcpu handling code or by target attributes.  */
 865 static const struct processor *selected_arch;
 866 static const struct processor *selected_cpu;
 867 static const struct processor *selected_tune;
 868
 869 /* The current tuning set; a modifiable copy initialised from generic_tunings.  */
 870 struct tune_params aarch64_tune_params = generic_tunings;
 871 /* Feature flags of selected_cpu, or 0 when selected_cpu is null.  */
 872 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 873
 874 /* An ISA extension in the co-processor and main instruction set space.  */
 875 struct aarch64_option_extension
 876 {
 877   const char *const name;  /* Name of the extension.  */
 878   const unsigned long flags_on;  /* Presumably the flags set when the extension is enabled -- confirm.  */
 879   const unsigned long flags_off;  /* Presumably the flags cleared when the extension is disabled -- confirm.  */
 880 };
 881 /* AArch64 condition codes, numbered from AARCH64_EQ = 0 in the same order as aarch64_condition_codes; adjacent pairs (EQ/NE, CS/CC, ...) differ only in bit 0.  */
 882 typedef enum aarch64_cond_code
 883 {
 884   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 885   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 886   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 887 }
 888 aarch64_cc;
 889 /* Map condition code X to the aarch64_cc whose value differs from X's only in the low bit.  */
 890 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 891
 892 /* Textual names of the processor's condition codes, listed in aarch64_cc order.  */
 893 static const char * const aarch64_condition_codes[] =
 894 {
 895   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 896   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 897 };
 898 /* Generate code to enable conditional branches in functions over 1 MiB.
 899    Output BRANCH_FORMAT aimed at a fresh label, then "b" to OPERANDS[POS_LABEL], then that label's definition.  DEST is passed to ASM_GENERATE_INTERNAL_LABEL to name the label.  Always returns "".  */
 900 const char *
 901 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 902                         const char * branch_format)
 903 {
 904     rtx_code_label * tmp_label = gen_label_rtx ();
 905     char label_buf[256];
 906     char buffer[128];  /* Holds each instruction template; snprintf bounds writes to its size.  */
 907     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 908                                  CODE_LABEL_NUMBER (tmp_label));
 909     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 910     rtx dest_label = operands[pos_label];  /* Save the original destination operand.  */
 911     operands[pos_label] = tmp_label;
 912     /* First template: BRANCH_FORMAT with the temporary label's name appended.  */
 913     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 914     output_asm_insn (buffer, operands);
 915     /* Second template: "b" to the restored destination operand, then the temporary label's definition.  */
 916     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 917     operands[pos_label] = dest_label;
 918     output_asm_insn (buffer, operands);
 919     return "";
 920 }
 921 /* Report an error that MSG, described as "floating-point" or "vector" according to MODE, is incompatible with -mgeneral-regs-only or with the +nofp feature modifier.  */
 922 void
 923 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 924 {
 925   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 926   if (TARGET_GENERAL_REGS_ONLY)
 927     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 928   else
 929     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 930 }
 931
 932 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
 933    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
 934    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
 935    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
 936    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
 937    irrespectively of its cost results in bad allocations with many redundant
 938    int<->FP moves which are expensive on various cores.
 939    To avoid this we don't allow ALL_REGS as the allocno class, but force a
 940    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
 941    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
 942    Otherwise set the allocno class depending on the mode.
 943    The result of this is that it is no longer inefficient to have a higher
 944    memory move cost than the register move cost.
 945    REGNO is the pseudo register whose mode decides the final fallback.  */
 946
 947 static reg_class_t
 948 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 949                                          reg_class_t best_class)
 950 {
 951   enum machine_mode mode;
 952
 953   if (allocno_class != ALL_REGS)
 954     return allocno_class;
 955   /* The allocno class is ALL_REGS: prefer the best class if it is more specific.  */
 956   if (best_class != ALL_REGS)
 957     return best_class;
 958   /* Both classes are ALL_REGS: float and vector modes get FP_REGS, all others GENERAL_REGS.  */
 959   mode = PSEUDO_REGNO_MODE (regno);
 960   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
 961 }
 962
 963 static unsigned int
 964 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
 965 {
 966   if (GET_MODE_UNIT_SIZE (mode) == 4)
 967     return aarch64_tune_params.min_div_recip_mul_sf;
 968   return aarch64_tune_params.min_div_recip_mul_df;
 969 }
 970
 971 static int
 972 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
 973                              enum machine_mode mode)
 974 {
 975   if (VECTOR_MODE_P (mode))
 976     return aarch64_tune_params.vec_reassoc_width;
 977   if (INTEGRAL_MODE_P (mode))
 978     return aarch64_tune_params.int_reassoc_width;
 979   if (FLOAT_MODE_P (mode))
 980     return aarch64_tune_params.fp_reassoc_width;
 981   return 1;
 982 }
 983
 984 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
 985 unsigned
 986 aarch64_dbx_register_number (unsigned regno)
 987 {
 988    if (GP_REGNUM_P (regno))
 989      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
 990    else if (regno == SP_REGNUM)
 991      return AARCH64_DWARF_SP;
 992    else if (FP_REGNUM_P (regno))
 993      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
 994
 995    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
 996       equivalent DWARF register.  */
 997    return DWARF_FRAME_REGISTERS;
 998 }
 999
1000 /* Return TRUE if MODE is any of the large INT modes.  */
1001 static bool
1002 aarch64_vect_struct_mode_p (machine_mode mode)
1003 {
1004   return mode == OImode || mode == CImode || mode == XImode;
1005 }
1006
1007 /* Return TRUE if MODE is any of the vector modes.  */
1008 static bool
1009 aarch64_vector_mode_p (machine_mode mode)
1010 {
1011   return aarch64_vector_mode_supported_p (mode)
1012          || aarch64_vect_struct_mode_p (mode);
1013 }
1014
1015 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1016 static bool
1017 aarch64_array_mode_supported_p (machine_mode mode,
1018                                 unsigned HOST_WIDE_INT nelems)
1019 {
1020   if (TARGET_SIMD
1021       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1022           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1023       && (nelems >= 2 && nelems <= 4))
1024     return true;
1025
1026   return false;
1027 }
1028
1029 /* Implement HARD_REGNO_NREGS.  */
1030
1031 int
1032 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1033 {
1034   switch (aarch64_regno_regclass (regno))
1035     {
1036     case FP_REGS:
1037     case FP_LO_REGS:
1038       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1039     default:
1040       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1041     }
1042   gcc_unreachable ();
1043 }
1044
1045 /* Implement HARD_REGNO_MODE_OK.  */
1046
1047 int
1048 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1049 {
1050   if (GET_MODE_CLASS (mode) == MODE_CC)
1051     return regno == CC_REGNUM;
1052
1053   if (regno == SP_REGNUM)
1054     /* The purpose of comparing with ptr_mode is to support the
1055        global register variable associated with the stack pointer
1056        register via the syntax of asm ("wsp") in ILP32.  */
1057     return mode == Pmode || mode == ptr_mode;
1058
1059   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1060     return mode == Pmode;
1061
1062   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1063     return 1;
1064
1065   if (FP_REGNUM_P (regno))
1066     {
1067       if (aarch64_vect_struct_mode_p (mode))
1068         return
1069           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1070       else
1071         return 1;
1072     }
1073
1074   return 0;
1075 }
1076
1077 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1078 machine_mode
1079 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1080                                      machine_mode mode)
1081 {
1082   /* Handle modes that fit within single registers.  */
1083   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1084     {
1085       if (GET_MODE_SIZE (mode) >= 4)
1086         return mode;
1087       else
1088         return SImode;
1089     }
1090   /* Fall back to generic for multi-reg and very large modes.  */
1091   else
1092     return choose_hard_reg_mode (regno, nregs, false);
1093 }
1094
1095 /* Return true if calls to DECL should be treated as
1096    long-calls (ie called via a register).  */
1097 static bool
1098 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1099 {
1100   return false;
1101 }
1102
1103 /* Return true if calls to symbol-ref SYM should be treated as
1104    long-calls (ie called via a register).  */
1105 bool
1106 aarch64_is_long_call_p (rtx sym)
1107 {
1108   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1109 }
1110
1111 /* Return true if calls to symbol-ref SYM should not go through
1112    plt stubs.  */
1113
1114 bool
1115 aarch64_is_noplt_call_p (rtx sym)
1116 {
1117   const_tree decl = SYMBOL_REF_DECL (sym);
1118
1119   if (flag_pic
1120       && decl
1121       && (!flag_plt
1122           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1123       && !targetm.binds_local_p (decl))
1124     return true;
1125
1126   return false;
1127 }
1128
1129 /* Return true if the offsets to a zero/sign-extract operation
1130    represent an expression that matches an extend operation.  The
1131    operands represent the paramters from
1132
1133    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1134 bool
1135 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1136                                 rtx extract_imm)
1137 {
1138   HOST_WIDE_INT mult_val, extract_val;
1139
1140   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1141     return false;
1142
1143   mult_val = INTVAL (mult_imm);
1144   extract_val = INTVAL (extract_imm);
1145
1146   if (extract_val > 8
1147       && extract_val < GET_MODE_BITSIZE (mode)
1148       && exact_log2 (extract_val & ~7) > 0
1149       && (extract_val & 7) <= 4
1150       && mult_val == (1 << (extract_val & 7)))
1151     return true;
1152
1153   return false;
1154 }
1155
1156 /* Emit an insn that's a simple single-set.  Both the operands must be
1157    known to be valid.  */
1158 inline static rtx_insn *
1159 emit_set_insn (rtx x, rtx y)
1160 {
1161   return emit_insn (gen_rtx_SET (x, y));
1162 }
1163
1164 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1165    return the rtx for register 0 in the proper mode.  */
1166 rtx
1167 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1168 {
1169   machine_mode mode = SELECT_CC_MODE (code, x, y);
1170   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1171
1172   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1173   return cc_reg;
1174 }
1175
1176 /* Build the SYMBOL_REF for __tls_get_addr.  */
1177
1178 static GTY(()) rtx tls_get_addr_libfunc;
1179
1180 rtx
1181 aarch64_tls_get_addr (void)
1182 {
1183   if (!tls_get_addr_libfunc)
1184     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1185   return tls_get_addr_libfunc;
1186 }
1187
1188 /* Return the TLS model to use for ADDR.  */
1189
1190 static enum tls_model
1191 tls_symbolic_operand_type (rtx addr)
1192 {
1193   enum tls_model tls_kind = TLS_MODEL_NONE;
1194   rtx sym, addend;
1195
1196   if (GET_CODE (addr) == CONST)
1197     {
1198       split_const (addr, &sym, &addend);
1199       if (GET_CODE (sym) == SYMBOL_REF)
1200         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1201     }
1202   else if (GET_CODE (addr) == SYMBOL_REF)
1203     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1204
1205   return tls_kind;
1206 }
1207
1208 /* We'll allow lo_sum's in addresses in our legitimate addresses
1209    so that combine would take care of combining addresses where
1210    necessary, but for generation purposes, we'll generate the address
1211    as :
1212    RTL                               Absolute
1213    tmp = hi (symbol_ref);            adrp  x1, foo
1214    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1215                                      nop
1216
1217    PIC                               TLS
1218    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1219    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1220                                      bl   __tls_get_addr
1221                                      nop
1222
1223    Load TLS symbol, depending on TLS mechanism and TLS access model.
1224
1225    Global Dynamic - Traditional TLS:
1226    adrp tmp, :tlsgd:imm
1227    add  dest, tmp, #:tlsgd_lo12:imm
1228    bl   __tls_get_addr
1229
1230    Global Dynamic - TLS Descriptors:
1231    adrp dest, :tlsdesc:imm
1232    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1233    add  dest, dest, #:tlsdesc_lo12:imm
1234    blr  tmp
1235    mrs  tp, tpidr_el0
1236    add  dest, dest, tp
1237
1238    Initial Exec:
1239    mrs  tp, tpidr_el0
1240    adrp tmp, :gottprel:imm
1241    ldr  dest, [tmp, #:gottprel_lo12:imm]
1242    add  dest, dest, tp
1243
1244    Local Exec:
1245    mrs  tp, tpidr_el0
1246    add  t0, tp, #:tprel_hi12:imm, lsl #12
1247    add  t0, t0, #:tprel_lo12_nc:imm
1248 */
1249
1250 static void
1251 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1252                                    enum aarch64_symbol_type type)
1253 {
1254   switch (type)
1255     {
1256     case SYMBOL_SMALL_ABSOLUTE:
1257       {
1258         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1259         rtx tmp_reg = dest;
1260         machine_mode mode = GET_MODE (dest);
1261
1262         gcc_assert (mode == Pmode || mode == ptr_mode);
1263
1264         if (can_create_pseudo_p ())
1265           tmp_reg = gen_reg_rtx (mode);
1266
1267         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1268         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1269         return;
1270       }
1271
1272     case SYMBOL_TINY_ABSOLUTE:
1273       emit_insn (gen_rtx_SET (dest, imm));
1274       return;
1275
1276     case SYMBOL_SMALL_GOT_28K:
1277       {
1278         machine_mode mode = GET_MODE (dest);
1279         rtx gp_rtx = pic_offset_table_rtx;
1280         rtx insn;
1281         rtx mem;
1282
1283         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1284            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1285            decide rtx costs, in which case pic_offset_table_rtx is not
1286            initialized.  For that case no need to generate the first adrp
1287            instruction as the final cost for global variable access is
1288            one instruction.  */
1289         if (gp_rtx != NULL)
1290           {
1291             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1292                using the page base as GOT base, the first page may be wasted,
1293                in the worst scenario, there is only 28K space for GOT).
1294
1295                The generate instruction sequence for accessing global variable
1296                is:
1297
1298                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1299
1300                Only one instruction needed. But we must initialize
1301                pic_offset_table_rtx properly.  We generate initialize insn for
1302                every global access, and allow CSE to remove all redundant.
1303
1304                The final instruction sequences will look like the following
1305                for multiply global variables access.
1306
1307                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1308
1309                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1310                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1311                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1312                  ...  */
1313
1314             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1315             crtl->uses_pic_offset_table = 1;
1316             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1317
1318             if (mode != GET_MODE (gp_rtx))
1319              gp_rtx = gen_lowpart (mode, gp_rtx);
1320
1321           }
1322
1323         if (mode == ptr_mode)
1324           {
1325             if (mode == DImode)
1326               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1327             else
1328               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1329
1330             mem = XVECEXP (SET_SRC (insn), 0, 0);
1331           }
1332         else
1333           {
1334             gcc_assert (mode == Pmode);
1335
1336             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1337             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1338           }
1339
1340         /* The operand is expected to be MEM.  Whenever the related insn
1341            pattern changed, above code which calculate mem should be
1342            updated.  */
1343         gcc_assert (GET_CODE (mem) == MEM);
1344         MEM_READONLY_P (mem) = 1;
1345         MEM_NOTRAP_P (mem) = 1;
1346         emit_insn (insn);
1347         return;
1348       }
1349
1350     case SYMBOL_SMALL_GOT_4G:
1351       {
1352         /* In ILP32, the mode of dest can be either SImode or DImode,
1353            while the got entry is always of SImode size.  The mode of
1354            dest depends on how dest is used: if dest is assigned to a
1355            pointer (e.g. in the memory), it has SImode; it may have
1356            DImode if dest is dereferenced to access the memeory.
1357            This is why we have to handle three different ldr_got_small
1358            patterns here (two patterns for ILP32).  */
1359
1360         rtx insn;
1361         rtx mem;
1362         rtx tmp_reg = dest;
1363         machine_mode mode = GET_MODE (dest);
1364
1365         if (can_create_pseudo_p ())
1366           tmp_reg = gen_reg_rtx (mode);
1367
1368         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1369         if (mode == ptr_mode)
1370           {
1371             if (mode == DImode)
1372               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1373             else
1374               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1375
1376             mem = XVECEXP (SET_SRC (insn), 0, 0);
1377           }
1378         else
1379           {
1380             gcc_assert (mode == Pmode);
1381
1382             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1383             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1384           }
1385
1386         gcc_assert (GET_CODE (mem) == MEM);
1387         MEM_READONLY_P (mem) = 1;
1388         MEM_NOTRAP_P (mem) = 1;
1389         emit_insn (insn);
1390         return;
1391       }
1392
1393     case SYMBOL_SMALL_TLSGD:
1394       {
1395         rtx_insn *insns;
1396         machine_mode mode = GET_MODE (dest);
1397         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1398
1399         start_sequence ();
1400         if (TARGET_ILP32)
1401           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1402         else
1403           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1404         insns = get_insns ();
1405         end_sequence ();
1406
1407         RTL_CONST_CALL_P (insns) = 1;
1408         emit_libcall_block (insns, dest, result, imm);
1409         return;
1410       }
1411
1412     case SYMBOL_SMALL_TLSDESC:
1413       {
1414         machine_mode mode = GET_MODE (dest);
1415         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1416         rtx tp;
1417
1418         gcc_assert (mode == Pmode || mode == ptr_mode);
1419
1420         /* In ILP32, the got entry is always of SImode size.  Unlike
1421            small GOT, the dest is fixed at reg 0.  */
1422         if (TARGET_ILP32)
1423           emit_insn (gen_tlsdesc_small_si (imm));
1424         else
1425           emit_insn (gen_tlsdesc_small_di (imm));
1426         tp = aarch64_load_tp (NULL);
1427
1428         if (mode != Pmode)
1429           tp = gen_lowpart (mode, tp);
1430
1431         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1432         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1433         return;
1434       }
1435
1436     case SYMBOL_SMALL_TLSIE:
1437       {
1438         /* In ILP32, the mode of dest can be either SImode or DImode,
1439            while the got entry is always of SImode size.  The mode of
1440            dest depends on how dest is used: if dest is assigned to a
1441            pointer (e.g. in the memory), it has SImode; it may have
1442            DImode if dest is dereferenced to access the memeory.
1443            This is why we have to handle three different tlsie_small
1444            patterns here (two patterns for ILP32).  */
1445         machine_mode mode = GET_MODE (dest);
1446         rtx tmp_reg = gen_reg_rtx (mode);
1447         rtx tp = aarch64_load_tp (NULL);
1448
1449         if (mode == ptr_mode)
1450           {
1451             if (mode == DImode)
1452               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1453             else
1454               {
1455                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1456                 tp = gen_lowpart (mode, tp);
1457               }
1458           }
1459         else
1460           {
1461             gcc_assert (mode == Pmode);
1462             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1463           }
1464
1465         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1466         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1467         return;
1468       }
1469
1470     case SYMBOL_TLSLE12:
1471     case SYMBOL_TLSLE24:
1472     case SYMBOL_TLSLE32:
1473     case SYMBOL_TLSLE48:
1474       {
1475         machine_mode mode = GET_MODE (dest);
1476         rtx tp = aarch64_load_tp (NULL);
1477
1478         if (mode != Pmode)
1479           tp = gen_lowpart (mode, tp);
1480
1481         switch (type)
1482           {
1483           case SYMBOL_TLSLE12:
1484             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1485                         (dest, tp, imm));
1486             break;
1487           case SYMBOL_TLSLE24:
1488             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1489                         (dest, tp, imm));
1490           break;
1491           case SYMBOL_TLSLE32:
1492             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1493                         (dest, imm));
1494             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1495                         (dest, dest, tp));
1496           break;
1497           case SYMBOL_TLSLE48:
1498             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1499                         (dest, imm));
1500             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1501                         (dest, dest, tp));
1502             break;
1503           default:
1504             gcc_unreachable ();
1505           }
1506
1507         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1508         return;
1509       }
1510
1511     case SYMBOL_TINY_GOT:
1512       emit_insn (gen_ldr_got_tiny (dest, imm));
1513       return;
1514
1515     case SYMBOL_TINY_TLSIE:
1516       {
1517         machine_mode mode = GET_MODE (dest);
1518         rtx tp = aarch64_load_tp (NULL);
1519
1520         if (mode == ptr_mode)
1521           {
1522             if (mode == DImode)
1523               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1524             else
1525               {
1526                 tp = gen_lowpart (mode, tp);
1527                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1528               }
1529           }
1530         else
1531           {
1532             gcc_assert (mode == Pmode);
1533             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1534           }
1535
1536         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1537         return;
1538       }
1539
1540     default:
1541       gcc_unreachable ();
1542     }
1543 }
1544
1545 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1546    handle all moves if !can_create_pseudo_p ().  The distinction is
1547    important because, unlike emit_move_insn, the move expanders know
1548    how to force Pmode objects into the constant pool even when the
1549    constant pool address is not itself legitimate.  */
1550 static rtx
1551 aarch64_emit_move (rtx dest, rtx src)
1552 {
1553   return (can_create_pseudo_p ()
1554           ? emit_move_insn (dest, src)
1555           : emit_move_insn_1 (dest, src));
1556 }
1557
1558 /* Split a 128-bit move operation into two 64-bit move operations,
1559    taking care to handle partial overlap of register to register
1560    copies.  Special cases are needed when moving between GP regs and
1561    FP regs.  SRC can be a register, constant or memory; DST a register
1562    or memory.  If either operand is memory it must not have any side
1563    effects.  */
1564 void
1565 aarch64_split_128bit_move (rtx dst, rtx src)
1566 {
1567   rtx dst_lo, dst_hi;
1568   rtx src_lo, src_hi;
1569
1570   machine_mode mode = GET_MODE (dst);
1571
1572   gcc_assert (mode == TImode || mode == TFmode);
1573   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1574   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1575
1576   if (REG_P (dst) && REG_P (src))
1577     {
1578       int src_regno = REGNO (src);
1579       int dst_regno = REGNO (dst);
1580
1581       /* Handle FP <-> GP regs.  */
1582       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1583         {
1584           src_lo = gen_lowpart (word_mode, src);
1585           src_hi = gen_highpart (word_mode, src);
1586
1587           if (mode == TImode)
1588             {
1589               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1590               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1591             }
1592           else
1593             {
1594               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1595               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1596             }
1597           return;
1598         }
1599       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1600         {
1601           dst_lo = gen_lowpart (word_mode, dst);
1602           dst_hi = gen_highpart (word_mode, dst);
1603
1604           if (mode == TImode)
1605             {
1606               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1607               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1608             }
1609           else
1610             {
1611               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1612               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1613             }
1614           return;
1615         }
1616     }
1617
1618   dst_lo = gen_lowpart (word_mode, dst);
1619   dst_hi = gen_highpart (word_mode, dst);
1620   src_lo = gen_lowpart (word_mode, src);
1621   src_hi = gen_highpart_mode (word_mode, mode, src);
1622
1623   /* At most one pairing may overlap.  */
1624   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1625     {
1626       aarch64_emit_move (dst_hi, src_hi);
1627       aarch64_emit_move (dst_lo, src_lo);
1628     }
1629   else
1630     {
1631       aarch64_emit_move (dst_lo, src_lo);
1632       aarch64_emit_move (dst_hi, src_hi);
1633     }
1634 }
1635
1636 bool
1637 aarch64_split_128bit_move_p (rtx dst, rtx src)
1638 {
1639   return (! REG_P (src)
1640           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1641 }
1642
1643 /* Split a complex SIMD combine.  */
1644
1645 void
1646 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1647 {
1648   machine_mode src_mode = GET_MODE (src1);
1649   machine_mode dst_mode = GET_MODE (dst);
1650
1651   gcc_assert (VECTOR_MODE_P (dst_mode));
1652
1653   if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1654     {
1655       rtx (*gen) (rtx, rtx, rtx);
1656
1657       switch (src_mode)
1658         {
1659         case V8QImode:
1660           gen = gen_aarch64_simd_combinev8qi;
1661           break;
1662         case V4HImode:
1663           gen = gen_aarch64_simd_combinev4hi;
1664           break;
1665         case V2SImode:
1666           gen = gen_aarch64_simd_combinev2si;
1667           break;
1668         case V4HFmode:
1669           gen = gen_aarch64_simd_combinev4hf;
1670           break;
1671         case V2SFmode:
1672           gen = gen_aarch64_simd_combinev2sf;
1673           break;
1674         case DImode:
1675           gen = gen_aarch64_simd_combinedi;
1676           break;
1677         case DFmode:
1678           gen = gen_aarch64_simd_combinedf;
1679           break;
1680         default:
1681           gcc_unreachable ();
1682         }
1683
1684       emit_insn (gen (dst, src1, src2));
1685       return;
1686     }
1687 }
1688
1689 /* Split a complex SIMD move.  */
1690
1691 void
1692 aarch64_split_simd_move (rtx dst, rtx src)
1693 {
1694   machine_mode src_mode = GET_MODE (src);
1695   machine_mode dst_mode = GET_MODE (dst);
1696
1697   gcc_assert (VECTOR_MODE_P (dst_mode));
1698
1699   if (REG_P (dst) && REG_P (src))
1700     {
1701       rtx (*gen) (rtx, rtx);
1702
1703       gcc_assert (VECTOR_MODE_P (src_mode));
1704
1705       switch (src_mode)
1706         {
1707         case V16QImode:
1708           gen = gen_aarch64_split_simd_movv16qi;
1709           break;
1710         case V8HImode:
1711           gen = gen_aarch64_split_simd_movv8hi;
1712           break;
1713         case V4SImode:
1714           gen = gen_aarch64_split_simd_movv4si;
1715           break;
1716         case V2DImode:
1717           gen = gen_aarch64_split_simd_movv2di;
1718           break;
1719         case V8HFmode:
1720           gen = gen_aarch64_split_simd_movv8hf;
1721           break;
1722         case V4SFmode:
1723           gen = gen_aarch64_split_simd_movv4sf;
1724           break;
1725         case V2DFmode:
1726           gen = gen_aarch64_split_simd_movv2df;
1727           break;
1728         default:
1729           gcc_unreachable ();
1730         }
1731
1732       emit_insn (gen (dst, src));
1733       return;
1734     }
1735 }
1736
1737 bool
1738 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1739                               machine_mode ymode, rtx y)
1740 {
1741   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1742   gcc_assert (r != NULL);
1743   return rtx_equal_p (x, r);
1744 }
1745
1746
1747 static rtx
1748 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1749 {
1750   if (can_create_pseudo_p ())
1751     return force_reg (mode, value);
1752   else
1753     {
1754       x = aarch64_emit_move (x, value);
1755       return x;
1756     }
1757 }
1758
1759
1760 static rtx
1761 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1762 {
1763   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1764     {
1765       rtx high;
1766       /* Load the full offset into a register.  This
1767          might be improvable in the future.  */
1768       high = GEN_INT (offset);
1769       offset = 0;
1770       high = aarch64_force_temporary (mode, temp, high);
1771       reg = aarch64_force_temporary (mode, temp,
1772                                      gen_rtx_PLUS (mode, high, reg));
1773     }
1774   return plus_constant (mode, reg, offset);
1775 }
1776
1777 static int
1778 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1779                                 machine_mode mode)
1780 {
1781   int i;
1782   unsigned HOST_WIDE_INT val, val2, mask;
1783   int one_match, zero_match;
1784   int num_insns;
1785
1786   val = INTVAL (imm);
1787
1788   if (aarch64_move_imm (val, mode))
1789     {
1790       if (generate)
1791         emit_insn (gen_rtx_SET (dest, imm));
1792       return 1;
1793     }
1794
1795   if ((val >> 32) == 0 || mode == SImode)
1796     {
1797       if (generate)
1798         {
1799           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1800           if (mode == SImode)
1801             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1802                                        GEN_INT ((val >> 16) & 0xffff)));
1803           else
1804             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1805                                        GEN_INT ((val >> 16) & 0xffff)));
1806         }
1807       return 2;
1808     }
1809
1810   /* Remaining cases are all for DImode.  */
1811
1812   mask = 0xffff;
1813   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1814     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1815   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1816     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1817
1818   if (zero_match != 2 && one_match != 2)
1819     {
1820       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1821          For a 64-bit bitmask try whether changing 16 bits to all ones or
1822          zeroes creates a valid bitmask.  To check any repeated bitmask,
1823          try using 16 bits from the other 32-bit half of val.  */
1824
1825       for (i = 0; i < 64; i += 16, mask <<= 16)
1826         {
1827           val2 = val & ~mask;
1828           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1829             break;
1830           val2 = val | mask;
1831           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1832             break;
1833           val2 = val2 & ~mask;
1834           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1835           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1836             break;
1837         }
1838       if (i != 64)
1839         {
1840           if (generate)
1841             {
1842               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1843               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1844                                          GEN_INT ((val >> i) & 0xffff)));
1845             }
1846           return 2;
1847         }
1848     }
1849
1850   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1851      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1852      otherwise skip zero bits.  */
1853
1854   num_insns = 1;
1855   mask = 0xffff;
1856   val2 = one_match > zero_match ? ~val : val;
1857   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1858
1859   if (generate)
1860     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1861                                            ? (val | ~(mask << i))
1862                                            : (val & (mask << i)))));
1863   for (i += 16; i < 64; i += 16)
1864     {
1865       if ((val2 & (mask << i)) == 0)
1866         continue;
1867       if (generate)
1868         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1869                                    GEN_INT ((val >> i) & 0xffff)));
1870       num_insns ++;
1871     }
1872
1873   return num_insns;
1874 }
1875
1876
1877 void
1878 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1879 {
1880   machine_mode mode = GET_MODE (dest);
1881
1882   gcc_assert (mode == SImode || mode == DImode);
1883
1884   /* Check on what type of symbol it is.  */
1885   if (GET_CODE (imm) == SYMBOL_REF
1886       || GET_CODE (imm) == LABEL_REF
1887       || GET_CODE (imm) == CONST)
1888     {
1889       rtx mem, base, offset;
1890       enum aarch64_symbol_type sty;
1891
1892       /* If we have (const (plus symbol offset)), separate out the offset
1893          before we start classifying the symbol.  */
1894       split_const (imm, &base, &offset);
1895
1896       sty = aarch64_classify_symbol (base, offset);
1897       switch (sty)
1898         {
1899         case SYMBOL_FORCE_TO_MEM:
1900           if (offset != const0_rtx
1901               && targetm.cannot_force_const_mem (mode, imm))
1902             {
1903               gcc_assert (can_create_pseudo_p ());
1904               base = aarch64_force_temporary (mode, dest, base);
1905               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1906               aarch64_emit_move (dest, base);
1907               return;
1908             }
1909
1910           mem = force_const_mem (ptr_mode, imm);
1911           gcc_assert (mem);
1912
1913           /* If we aren't generating PC relative literals, then
1914              we need to expand the literal pool access carefully.
1915              This is something that needs to be done in a number
1916              of places, so could well live as a separate function.  */
1917           if (!aarch64_pcrelative_literal_loads)
1918             {
1919               gcc_assert (can_create_pseudo_p ());
1920               base = gen_reg_rtx (ptr_mode);
1921               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1922               mem = gen_rtx_MEM (ptr_mode, base);
1923             }
1924
1925           if (mode != ptr_mode)
1926             mem = gen_rtx_ZERO_EXTEND (mode, mem);
1927
1928           emit_insn (gen_rtx_SET (dest, mem));
1929
1930           return;
1931
1932         case SYMBOL_SMALL_TLSGD:
1933         case SYMBOL_SMALL_TLSDESC:
1934         case SYMBOL_SMALL_TLSIE:
1935         case SYMBOL_SMALL_GOT_28K:
1936         case SYMBOL_SMALL_GOT_4G:
1937         case SYMBOL_TINY_GOT:
1938         case SYMBOL_TINY_TLSIE:
1939           if (offset != const0_rtx)
1940             {
1941               gcc_assert(can_create_pseudo_p ());
1942               base = aarch64_force_temporary (mode, dest, base);
1943               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1944               aarch64_emit_move (dest, base);
1945               return;
1946             }
1947           /* FALLTHRU */
1948
1949         case SYMBOL_SMALL_ABSOLUTE:
1950         case SYMBOL_TINY_ABSOLUTE:
1951         case SYMBOL_TLSLE12:
1952         case SYMBOL_TLSLE24:
1953         case SYMBOL_TLSLE32:
1954         case SYMBOL_TLSLE48:
1955           aarch64_load_symref_appropriately (dest, imm, sty);
1956           return;
1957
1958         default:
1959           gcc_unreachable ();
1960         }
1961     }
1962
1963   if (!CONST_INT_P (imm))
1964     {
1965       if (GET_CODE (imm) == HIGH)
1966         emit_insn (gen_rtx_SET (dest, imm));
1967       else
1968         {
1969           rtx mem = force_const_mem (mode, imm);
1970           gcc_assert (mem);
1971           emit_insn (gen_rtx_SET (dest, mem));
1972         }
1973
1974       return;
1975     }
1976
1977   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1978 }
1979
1980 /* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
1981    temporary value if necessary.  FRAME_RELATED_P should be true if
1982    the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1983    to the generated instructions.  If SCRATCHREG is known to hold
1984    abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1985    immediate again.
1986
1987    Since this function may be used to adjust the stack pointer, we must
1988    ensure that it cannot cause transient stack deallocation (for example
1989    by first incrementing SP and then decrementing when adjusting by a
1990    large immediate).  */
1991
1992 static void
1993 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1994                                HOST_WIDE_INT delta, bool frame_related_p,
1995                                bool emit_move_imm)
1996 {
1997   HOST_WIDE_INT mdelta = abs_hwi (delta);
1998   rtx this_rtx = gen_rtx_REG (mode, regnum);
1999   rtx_insn *insn;
2000
2001   if (!mdelta)
2002     return;
2003
2004   /* Single instruction adjustment.  */
2005   if (aarch64_uimm12_shift (mdelta))
2006     {
2007       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2008       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2009       return;
2010     }
2011
2012   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2013      Only do this if mdelta is not a 16-bit move as adjusting using a move
2014      is better.  */
2015   if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2016     {
2017       HOST_WIDE_INT low_off = mdelta & 0xfff;
2018
2019       low_off = delta < 0 ? -low_off : low_off;
2020       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2021       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2022       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2023       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2024       return;
2025     }
2026
2027   /* Emit a move immediate if required and an addition/subtraction.  */
2028   rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2029   if (emit_move_imm)
2030     aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2031   insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2032                               : gen_add2_insn (this_rtx, scratch_rtx));
2033   if (frame_related_p)
2034     {
2035       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2036       rtx adj = plus_constant (mode, this_rtx, delta);
2037       add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2038     }
2039 }
2040
2041 static inline void
2042 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2043                       HOST_WIDE_INT delta)
2044 {
2045   aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2046 }
2047
2048 static inline void
2049 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2050 {
2051   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2052                                  true, emit_move_imm);
2053 }
2054
2055 static inline void
2056 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2057 {
2058   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2059                                  frame_related_p, true);
2060 }
2061
2062 static bool
2063 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2064                                  tree exp ATTRIBUTE_UNUSED)
2065 {
2066   /* Currently, always true.  */
2067   return true;
2068 }
2069
2070 /* Implement TARGET_PASS_BY_REFERENCE.  */
2071
2072 static bool
2073 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2074                            machine_mode mode,
2075                            const_tree type,
2076                            bool named ATTRIBUTE_UNUSED)
2077 {
2078   HOST_WIDE_INT size;
2079   machine_mode dummymode;
2080   int nregs;
2081
2082   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
2083   size = (mode == BLKmode && type)
2084     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2085
2086   /* Aggregates are passed by reference based on their size.  */
2087   if (type && AGGREGATE_TYPE_P (type))
2088     {
2089       size = int_size_in_bytes (type);
2090     }
2091
2092   /* Variable sized arguments are always returned by reference.  */
2093   if (size < 0)
2094     return true;
2095
2096   /* Can this be a candidate to be passed in fp/simd register(s)?  */
2097   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2098                                                &dummymode, &nregs,
2099                                                NULL))
2100     return false;
2101
2102   /* Arguments which are variable sized or larger than 2 registers are
2103      passed by reference unless they are a homogenous floating point
2104      aggregate.  */
2105   return size > 2 * UNITS_PER_WORD;
2106 }
2107
2108 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
2109 static bool
2110 aarch64_return_in_msb (const_tree valtype)
2111 {
2112   machine_mode dummy_mode;
2113   int dummy_int;
2114
2115   /* Never happens in little-endian mode.  */
2116   if (!BYTES_BIG_ENDIAN)
2117     return false;
2118
2119   /* Only composite types smaller than or equal to 16 bytes can
2120      be potentially returned in registers.  */
2121   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2122       || int_size_in_bytes (valtype) <= 0
2123       || int_size_in_bytes (valtype) > 16)
2124     return false;
2125
2126   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2127      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2128      is always passed/returned in the least significant bits of fp/simd
2129      register(s).  */
2130   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2131                                                &dummy_mode, &dummy_int, NULL))
2132     return false;
2133
2134   return true;
2135 }
2136
2137 /* Implement TARGET_FUNCTION_VALUE.
2138    Define how to find the value returned by a function.  */
2139
2140 static rtx
2141 aarch64_function_value (const_tree type, const_tree func,
2142                         bool outgoing ATTRIBUTE_UNUSED)
2143 {
2144   machine_mode mode;
2145   int unsignedp;
2146   int count;
2147   machine_mode ag_mode;
2148
2149   mode = TYPE_MODE (type);
2150   if (INTEGRAL_TYPE_P (type))
2151     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2152
2153   if (aarch64_return_in_msb (type))
2154     {
2155       HOST_WIDE_INT size = int_size_in_bytes (type);
2156
2157       if (size % UNITS_PER_WORD != 0)
2158         {
2159           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2160           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2161         }
2162     }
2163
2164   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2165                                                &ag_mode, &count, NULL))
2166     {
2167       if (!aarch64_composite_type_p (type, mode))
2168         {
2169           gcc_assert (count == 1 && mode == ag_mode);
2170           return gen_rtx_REG (mode, V0_REGNUM);
2171         }
2172       else
2173         {
2174           int i;
2175           rtx par;
2176
2177           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2178           for (i = 0; i < count; i++)
2179             {
2180               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2181               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2182                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2183               XVECEXP (par, 0, i) = tmp;
2184             }
2185           return par;
2186         }
2187     }
2188   else
2189     return gen_rtx_REG (mode, R0_REGNUM);
2190 }
2191
2192 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2193    Return true if REGNO is the number of a hard register in which the values
2194    of called function may come back.  */
2195
2196 static bool
2197 aarch64_function_value_regno_p (const unsigned int regno)
2198 {
2199   /* Maximum of 16 bytes can be returned in the general registers.  Examples
2200      of 16-byte return values are: 128-bit integers and 16-byte small
2201      structures (excluding homogeneous floating-point aggregates).  */
2202   if (regno == R0_REGNUM || regno == R1_REGNUM)
2203     return true;
2204
2205   /* Up to four fp/simd registers can return a function value, e.g. a
2206      homogeneous floating-point aggregate having four members.  */
2207   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2208     return TARGET_FLOAT;
2209
2210   return false;
2211 }
2212
2213 /* Implement TARGET_RETURN_IN_MEMORY.
2214
2215    If the type T of the result of a function is such that
2216      void func (T arg)
2217    would require that arg be passed as a value in a register (or set of
2218    registers) according to the parameter passing rules, then the result
2219    is returned in the same registers as would be used for such an
2220    argument.  */
2221
2222 static bool
2223 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2224 {
2225   HOST_WIDE_INT size;
2226   machine_mode ag_mode;
2227   int count;
2228
2229   if (!AGGREGATE_TYPE_P (type)
2230       && TREE_CODE (type) != COMPLEX_TYPE
2231       && TREE_CODE (type) != VECTOR_TYPE)
2232     /* Simple scalar types always returned in registers.  */
2233     return false;
2234
2235   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2236                                                type,
2237                                                &ag_mode,
2238                                                &count,
2239                                                NULL))
2240     return false;
2241
2242   /* Types larger than 2 registers returned in memory.  */
2243   size = int_size_in_bytes (type);
2244   return (size < 0 || size > 2 * UNITS_PER_WORD);
2245 }
2246
2247 static bool
2248 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2249                                const_tree type, int *nregs)
2250 {
2251   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2252   return aarch64_vfp_is_call_or_return_candidate (mode,
2253                                                   type,
2254                                                   &pcum->aapcs_vfp_rmode,
2255                                                   nregs,
2256                                                   NULL);
2257 }
2258
2259 /* Given MODE and TYPE of a function argument, return the alignment in
2260    bits.  The idea is to suppress any stronger alignment requested by
2261    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2262    This is a helper function for local use only.  */
2263
2264 static unsigned int
2265 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2266 {
2267   if (!type)
2268     return GET_MODE_ALIGNMENT (mode);
2269   if (integer_zerop (TYPE_SIZE (type)))
2270     return 0;
2271
2272   gcc_assert (TYPE_MODE (type) == mode);
2273
2274   if (!AGGREGATE_TYPE_P (type))
2275     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2276
2277   if (TREE_CODE (type) == ARRAY_TYPE)
2278     return TYPE_ALIGN (TREE_TYPE (type));
2279
2280   unsigned int alignment = 0;
2281
2282   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2283     alignment = std::max (alignment, DECL_ALIGN (field));
2284
2285   return alignment;
2286 }
2287
2288 /* Layout a function argument according to the AAPCS64 rules.  The rule
2289    numbers refer to the rule numbers in the AAPCS64.  */
2290
2291 static void
2292 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2293                     const_tree type,
2294                     bool named ATTRIBUTE_UNUSED)
2295 {
2296   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2297   int ncrn, nvrn, nregs;
2298   bool allocate_ncrn, allocate_nvrn;
2299   HOST_WIDE_INT size;
2300
2301   /* We need to do this once per argument.  */
2302   if (pcum->aapcs_arg_processed)
2303     return;
2304
2305   pcum->aapcs_arg_processed = true;
2306
2307   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
2308   size
2309     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2310                 UNITS_PER_WORD);
2311
2312   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2313   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2314                                                  mode,
2315                                                  type,
2316                                                  &nregs);
2317
2318   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2319      The following code thus handles passing by SIMD/FP registers first.  */
2320
2321   nvrn = pcum->aapcs_nvrn;
2322
2323   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2324      and homogenous short-vector aggregates (HVA).  */
2325   if (allocate_nvrn)
2326     {
2327       if (!TARGET_FLOAT)
2328         aarch64_err_no_fpadvsimd (mode, "argument");
2329
2330       if (nvrn + nregs <= NUM_FP_ARG_REGS)
2331         {
2332           pcum->aapcs_nextnvrn = nvrn + nregs;
2333           if (!aarch64_composite_type_p (type, mode))
2334             {
2335               gcc_assert (nregs == 1);
2336               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2337             }
2338           else
2339             {
2340               rtx par;
2341               int i;
2342               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2343               for (i = 0; i < nregs; i++)
2344                 {
2345                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2346                                          V0_REGNUM + nvrn + i);
2347                   tmp = gen_rtx_EXPR_LIST
2348                     (VOIDmode, tmp,
2349                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2350                   XVECEXP (par, 0, i) = tmp;
2351                 }
2352               pcum->aapcs_reg = par;
2353             }
2354           return;
2355         }
2356       else
2357         {
2358           /* C.3 NSRN is set to 8.  */
2359           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2360           goto on_stack;
2361         }
2362     }
2363
2364   ncrn = pcum->aapcs_ncrn;
2365   nregs = size / UNITS_PER_WORD;
2366
2367   /* C6 - C9.  though the sign and zero extension semantics are
2368      handled elsewhere.  This is the case where the argument fits
2369      entirely general registers.  */
2370   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2371     {
2372       unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2373
2374       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2375
2376       /* C.8 if the argument has an alignment of 16 then the NGRN is
2377          rounded up to the next even number.  */
2378       if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2379         {
2380           ++ncrn;
2381           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2382         }
2383       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2384          A reg is still generated for it, but the caller should be smart
2385          enough not to use it.  */
2386       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2387         {
2388           pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2389         }
2390       else
2391         {
2392           rtx par;
2393           int i;
2394
2395           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2396           for (i = 0; i < nregs; i++)
2397             {
2398               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2399               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2400                                        GEN_INT (i * UNITS_PER_WORD));
2401               XVECEXP (par, 0, i) = tmp;
2402             }
2403           pcum->aapcs_reg = par;
2404         }
2405
2406       pcum->aapcs_nextncrn = ncrn + nregs;
2407       return;
2408     }
2409
2410   /* C.11  */
2411   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2412
2413   /* The argument is passed on stack; record the needed number of words for
2414      this argument and align the total size if necessary.  */
2415 on_stack:
2416   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2417   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2418     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2419                                        16 / UNITS_PER_WORD);
2420   return;
2421 }
2422
2423 /* Implement TARGET_FUNCTION_ARG.  */
2424
2425 static rtx
2426 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2427                       const_tree type, bool named)
2428 {
2429   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2430   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2431
2432   if (mode == VOIDmode)
2433     return NULL_RTX;
2434
2435   aarch64_layout_arg (pcum_v, mode, type, named);
2436   return pcum->aapcs_reg;
2437 }
2438
2439 void
2440 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2441                            const_tree fntype ATTRIBUTE_UNUSED,
2442                            rtx libname ATTRIBUTE_UNUSED,
2443                            const_tree fndecl ATTRIBUTE_UNUSED,
2444                            unsigned n_named ATTRIBUTE_UNUSED)
2445 {
2446   pcum->aapcs_ncrn = 0;
2447   pcum->aapcs_nvrn = 0;
2448   pcum->aapcs_nextncrn = 0;
2449   pcum->aapcs_nextnvrn = 0;
2450   pcum->pcs_variant = ARM_PCS_AAPCS64;
2451   pcum->aapcs_reg = NULL_RTX;
2452   pcum->aapcs_arg_processed = false;
2453   pcum->aapcs_stack_words = 0;
2454   pcum->aapcs_stack_size = 0;
2455
2456   if (!TARGET_FLOAT
2457       && fndecl && TREE_PUBLIC (fndecl)
2458       && fntype && fntype != error_mark_node)
2459     {
2460       const_tree type = TREE_TYPE (fntype);
2461       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2462       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2463       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2464                                                    &mode, &nregs, NULL))
2465         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2466     }
2467   return;
2468 }
2469
2470 static void
2471 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2472                               machine_mode mode,
2473                               const_tree type,
2474                               bool named)
2475 {
2476   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2477   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2478     {
2479       aarch64_layout_arg (pcum_v, mode, type, named);
2480       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2481                   != (pcum->aapcs_stack_words != 0));
2482       pcum->aapcs_arg_processed = false;
2483       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2484       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2485       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2486       pcum->aapcs_stack_words = 0;
2487       pcum->aapcs_reg = NULL_RTX;
2488     }
2489 }
2490
2491 bool
2492 aarch64_function_arg_regno_p (unsigned regno)
2493 {
2494   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2495           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2496 }
2497
2498 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2499    PARM_BOUNDARY bits of alignment, but will be given anything up
2500    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2501    that both before and after the layout of each argument, the Next
2502    Stacked Argument Address (NSAA) will have a minimum alignment of
2503    8 bytes.  */
2504
2505 static unsigned int
2506 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2507 {
2508   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2509
2510   if (alignment < PARM_BOUNDARY)
2511     alignment = PARM_BOUNDARY;
2512   if (alignment > STACK_BOUNDARY)
2513     alignment = STACK_BOUNDARY;
2514   return alignment;
2515 }
2516
2517 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2518
2519    Return true if an argument passed on the stack should be padded upwards,
2520    i.e. if the least-significant byte of the stack slot has useful data.
2521
2522    Small aggregate types are placed in the lowest memory address.
2523
2524    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2525
2526 bool
2527 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2528 {
2529   /* On little-endian targets, the least significant byte of every stack
2530      argument is passed at the lowest byte address of the stack slot.  */
2531   if (!BYTES_BIG_ENDIAN)
2532     return true;
2533
2534   /* Otherwise, integral, floating-point and pointer types are padded downward:
2535      the least significant byte of a stack argument is passed at the highest
2536      byte address of the stack slot.  */
2537   if (type
2538       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2539          || POINTER_TYPE_P (type))
2540       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2541     return false;
2542
2543   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2544   return true;
2545 }
2546
2547 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2548
2549    It specifies padding for the last (may also be the only)
2550    element of a block move between registers and memory.  If
2551    assuming the block is in the memory, padding upward means that
2552    the last element is padded after its highest significant byte,
2553    while in downward padding, the last element is padded at the
2554    its least significant byte side.
2555
2556    Small aggregates and small complex types are always padded
2557    upwards.
2558
2559    We don't need to worry about homogeneous floating-point or
2560    short-vector aggregates; their move is not affected by the
2561    padding direction determined here.  Regardless of endianness,
2562    each element of such an aggregate is put in the least
2563    significant bits of a fp/simd register.
2564
2565    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2566    register has useful data, and return the opposite if the most
2567    significant byte does.  */
2568
2569 bool
2570 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2571                      bool first ATTRIBUTE_UNUSED)
2572 {
2573
2574   /* Small composite types are always padded upward.  */
2575   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2576     {
2577       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2578                             : GET_MODE_SIZE (mode));
2579       if (size < 2 * UNITS_PER_WORD)
2580         return true;
2581     }
2582
2583   /* Otherwise, use the default padding.  */
2584   return !BYTES_BIG_ENDIAN;
2585 }
2586
2587 static machine_mode
2588 aarch64_libgcc_cmp_return_mode (void)
2589 {
2590   return SImode;
2591 }
2592
2593 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2594
2595 /* We use the 12-bit shifted immediate arithmetic instructions so values
2596    must be multiple of (1 << 12), i.e. 4096.  */
2597 #define ARITH_FACTOR 4096
2598
2599 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2600 #error Cannot use simple address calculation for stack probing
2601 #endif
2602
2603 /* The pair of scratch registers used for stack probing.  */
2604 #define PROBE_STACK_FIRST_REG  9
2605 #define PROBE_STACK_SECOND_REG 10
2606
2607 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2608    inclusive.  These are offsets from the current stack pointer.  */
2609
2610 static void
2611 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2612 {
2613   rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2614
2615   /* See the same assertion on PROBE_INTERVAL above.  */
2616   gcc_assert ((first % ARITH_FACTOR) == 0);
2617
2618   /* See if we have a constant small number of probes to generate.  If so,
2619      that's the easy case.  */
2620   if (size <= PROBE_INTERVAL)
2621     {
2622       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2623
2624       emit_set_insn (reg1,
2625                      plus_constant (ptr_mode,
2626                                     stack_pointer_rtx, -(first + base)));
2627       emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2628     }
2629
2630   /* The run-time loop is made up of 8 insns in the generic case while the
2631      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2632   else if (size <= 4 * PROBE_INTERVAL)
2633     {
2634       HOST_WIDE_INT i, rem;
2635
2636       emit_set_insn (reg1,
2637                      plus_constant (ptr_mode,
2638                                     stack_pointer_rtx,
2639                                     -(first + PROBE_INTERVAL)));
2640       emit_stack_probe (reg1);
2641
2642       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2643          it exceeds SIZE.  If only two probes are needed, this will not
2644          generate any code.  Then probe at FIRST + SIZE.  */
2645       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2646         {
2647           emit_set_insn (reg1,
2648                          plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2649           emit_stack_probe (reg1);
2650         }
2651
2652       rem = size - (i - PROBE_INTERVAL);
2653       if (rem > 256)
2654         {
2655           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2656
2657           emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2658           emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2659         }
2660       else
2661         emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2662     }
2663
2664   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2665      extra careful with variables wrapping around because we might be at
2666      the very top (or the very bottom) of the address space and we have
2667      to be able to handle this case properly; in particular, we use an
2668      equality test for the loop condition.  */
2669   else
2670     {
2671       rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2672
2673       /* Step 1: round SIZE to the previous multiple of the interval.  */
2674
2675       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2676
2677
2678       /* Step 2: compute initial and final value of the loop counter.  */
2679
2680       /* TEST_ADDR = SP + FIRST.  */
2681       emit_set_insn (reg1,
2682                      plus_constant (ptr_mode, stack_pointer_rtx, -first));
2683
2684       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2685       emit_set_insn (reg2,
2686                      plus_constant (ptr_mode, stack_pointer_rtx,
2687                                     -(first + rounded_size)));
2688
2689
2690       /* Step 3: the loop
2691
2692          do
2693            {
2694              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2695              probe at TEST_ADDR
2696            }
2697          while (TEST_ADDR != LAST_ADDR)
2698
2699          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2700          until it is equal to ROUNDED_SIZE.  */
2701
2702       if (ptr_mode == DImode)
2703         emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2704       else
2705         emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2706
2707
2708       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2709          that SIZE is equal to ROUNDED_SIZE.  */
2710
2711       if (size != rounded_size)
2712         {
2713           HOST_WIDE_INT rem = size - rounded_size;
2714
2715           if (rem > 256)
2716             {
2717               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2718
2719               emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2720               emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2721             }
2722           else
2723             emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2724         }
2725     }
2726
2727   /* Make sure nothing is scheduled before we are done.  */
2728   emit_insn (gen_blockage ());
2729 }
2730
2731 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2732    absolute addresses.  */
2733
2734 const char *
2735 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2736 {
2737   static int labelno = 0;
2738   char loop_lab[32];
2739   rtx xops[2];
2740
2741   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2742
2743   /* Loop.  */
2744   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2745
2746   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2747   xops[0] = reg1;
2748   xops[1] = GEN_INT (PROBE_INTERVAL);
2749   output_asm_insn ("sub\t%0, %0, %1", xops);
2750
2751   /* Probe at TEST_ADDR.  */
2752   output_asm_insn ("str\txzr, [%0]", xops);
2753
2754   /* Test if TEST_ADDR == LAST_ADDR.  */
2755   xops[1] = reg2;
2756   output_asm_insn ("cmp\t%0, %1", xops);
2757
2758   /* Branch.  */
2759   fputs ("\tb.ne\t", asm_out_file);
2760   assemble_name_raw (asm_out_file, loop_lab);
2761   fputc ('\n', asm_out_file);
2762
2763   return "";
2764 }
2765
2766 static bool
2767 aarch64_frame_pointer_required (void)
2768 {
2769   /* In aarch64_override_options_after_change
2770      flag_omit_leaf_frame_pointer turns off the frame pointer by
2771      default.  Turn it back on now if we've not got a leaf
2772      function.  */
2773   if (flag_omit_leaf_frame_pointer
2774       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2775     return true;
2776
2777   /* Force a frame pointer for EH returns so the return address is at FP+8.  */
2778   if (crtl->calls_eh_return)
2779     return true;
2780
2781   return false;
2782 }
2783
2784 /* Mark the registers that need to be saved by the callee and calculate
2785    the size of the callee-saved registers area and frame record (both FP
2786    and LR may be omitted).  */
2787 static void
2788 aarch64_layout_frame (void)
2789 {
2790   HOST_WIDE_INT offset = 0;
2791   int regno, last_fp_reg = INVALID_REGNUM;
2792
2793   if (reload_completed && cfun->machine->frame.laid_out)
2794     return;
2795
2796 #define SLOT_NOT_REQUIRED (-2)
2797 #define SLOT_REQUIRED     (-1)
2798
2799   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2800   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2801
2802   /* First mark all the registers that really need to be saved...  */
2803   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2804     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2805
2806   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2807     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2808
2809   /* ... that includes the eh data registers (if needed)...  */
2810   if (crtl->calls_eh_return)
2811     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2812       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2813         = SLOT_REQUIRED;
2814
2815   /* ... and any callee saved register that dataflow says is live.  */
2816   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2817     if (df_regs_ever_live_p (regno)
2818         && (regno == R30_REGNUM
2819             || !call_used_regs[regno]))
2820       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2821
2822   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2823     if (df_regs_ever_live_p (regno)
2824         && !call_used_regs[regno])
2825       {
2826         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2827         last_fp_reg = regno;
2828       }
2829
2830   if (frame_pointer_needed)
2831     {
2832       /* FP and LR are placed in the linkage record.  */
2833       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2834       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2835       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2836       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2837       offset += 2 * UNITS_PER_WORD;
2838     }
2839
2840   /* Now assign stack slots for them.  */
2841   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2842     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2843       {
2844         cfun->machine->frame.reg_offset[regno] = offset;
2845         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2846           cfun->machine->frame.wb_candidate1 = regno;
2847         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2848           cfun->machine->frame.wb_candidate2 = regno;
2849         offset += UNITS_PER_WORD;
2850       }
2851
2852   HOST_WIDE_INT max_int_offset = offset;
2853   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2854   bool has_align_gap = offset != max_int_offset;
2855
2856   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2857     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2858       {
2859         /* If there is an alignment gap between integer and fp callee-saves,
2860            allocate the last fp register to it if possible.  */
2861         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2862           {
2863             cfun->machine->frame.reg_offset[regno] = max_int_offset;
2864             break;
2865           }
2866
2867         cfun->machine->frame.reg_offset[regno] = offset;
2868         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2869           cfun->machine->frame.wb_candidate1 = regno;
2870         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2871                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2872           cfun->machine->frame.wb_candidate2 = regno;
2873         offset += UNITS_PER_WORD;
2874       }
2875
2876   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2877
2878   cfun->machine->frame.saved_regs_size = offset;
2879
2880   HOST_WIDE_INT varargs_and_saved_regs_size
2881     = offset + cfun->machine->frame.saved_varargs_size;
2882
2883   cfun->machine->frame.hard_fp_offset
2884     = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2885                 STACK_BOUNDARY / BITS_PER_UNIT);
2886
2887   cfun->machine->frame.frame_size
2888     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2889                 + crtl->outgoing_args_size,
2890                 STACK_BOUNDARY / BITS_PER_UNIT);
2891
2892   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2893
2894   cfun->machine->frame.initial_adjust = 0;
2895   cfun->machine->frame.final_adjust = 0;
2896   cfun->machine->frame.callee_adjust = 0;
2897   cfun->machine->frame.callee_offset = 0;
2898
2899   HOST_WIDE_INT max_push_offset = 0;
2900   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2901     max_push_offset = 512;
2902   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2903     max_push_offset = 256;
2904
2905   if (cfun->machine->frame.frame_size < max_push_offset
2906       && crtl->outgoing_args_size == 0)
2907     {
2908       /* Simple, small frame with no outgoing arguments:
2909          stp reg1, reg2, [sp, -frame_size]!
2910          stp reg3, reg4, [sp, 16]  */
2911       cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2912     }
2913   else if ((crtl->outgoing_args_size
2914             + cfun->machine->frame.saved_regs_size < 512)
2915            && !(cfun->calls_alloca
2916                 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2917     {
2918       /* Frame with small outgoing arguments:
2919          sub sp, sp, frame_size
2920          stp reg1, reg2, [sp, outgoing_args_size]
2921          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
2922       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2923       cfun->machine->frame.callee_offset
2924         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2925     }
2926   else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2927     {
2928       /* Frame with large outgoing arguments but a small local area:
2929          stp reg1, reg2, [sp, -hard_fp_offset]!
2930          stp reg3, reg4, [sp, 16]
2931          sub sp, sp, outgoing_args_size  */
2932       cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2933       cfun->machine->frame.final_adjust
2934         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2935     }
2936   else if (!frame_pointer_needed
2937            && varargs_and_saved_regs_size < max_push_offset)
2938     {
2939       /* Frame with large local area and outgoing arguments (this pushes the
2940          callee-saves first, followed by the locals and outgoing area):
2941          stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2942          stp reg3, reg4, [sp, 16]
2943          sub sp, sp, frame_size - varargs_and_saved_regs_size  */
2944       cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2945       cfun->machine->frame.final_adjust
2946         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2947       cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2948       cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2949     }
2950   else
2951     {
2952       /* Frame with large local area and outgoing arguments using frame pointer:
2953          sub sp, sp, hard_fp_offset
2954          stp x29, x30, [sp, 0]
2955          add x29, sp, 0
2956          stp reg3, reg4, [sp, 16]
2957          sub sp, sp, outgoing_args_size  */
2958       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2959       cfun->machine->frame.final_adjust
2960         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2961     }
2962
2963   cfun->machine->frame.laid_out = true;
2964 }
2965
2966 /* Return true if the register REGNO is saved on entry to
2967    the current function.  */
2968
2969 static bool
2970 aarch64_register_saved_on_entry (int regno)
2971 {
2972   return cfun->machine->frame.reg_offset[regno] >= 0;
2973 }
2974
2975 /* Return the next register up from REGNO up to LIMIT for the callee
2976    to save.  */
2977
2978 static unsigned
2979 aarch64_next_callee_save (unsigned regno, unsigned limit)
2980 {
2981   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2982     regno ++;
2983   return regno;
2984 }
2985
2986 /* Push the register number REGNO of mode MODE to the stack with write-back
2987    adjusting the stack by ADJUSTMENT.  */
2988
2989 static void
2990 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2991                            HOST_WIDE_INT adjustment)
2992  {
2993   rtx base_rtx = stack_pointer_rtx;
2994   rtx insn, reg, mem;
2995
2996   reg = gen_rtx_REG (mode, regno);
2997   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2998                             plus_constant (Pmode, base_rtx, -adjustment));
2999   mem = gen_rtx_MEM (mode, mem);
3000
3001   insn = emit_move_insn (mem, reg);
3002   RTX_FRAME_RELATED_P (insn) = 1;
3003 }
3004
3005 /* Generate and return an instruction to store the pair of registers
3006    REG and REG2 of mode MODE to location BASE with write-back adjusting
3007    the stack location BASE by ADJUSTMENT.  */
3008
3009 static rtx
3010 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3011                           HOST_WIDE_INT adjustment)
3012 {
3013   switch (mode)
3014     {
3015     case DImode:
3016       return gen_storewb_pairdi_di (base, base, reg, reg2,
3017                                     GEN_INT (-adjustment),
3018                                     GEN_INT (UNITS_PER_WORD - adjustment));
3019     case DFmode:
3020       return gen_storewb_pairdf_di (base, base, reg, reg2,
3021                                     GEN_INT (-adjustment),
3022                                     GEN_INT (UNITS_PER_WORD - adjustment));
3023     default:
3024       gcc_unreachable ();
3025     }
3026 }
3027
3028 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3029    stack pointer by ADJUSTMENT.  */
3030
3031 static void
3032 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3033 {
3034   rtx_insn *insn;
3035   machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3036
3037   if (regno2 == INVALID_REGNUM)
3038     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3039
3040   rtx reg1 = gen_rtx_REG (mode, regno1);
3041   rtx reg2 = gen_rtx_REG (mode, regno2);
3042
3043   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3044                                               reg2, adjustment));
3045   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3046   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3047   RTX_FRAME_RELATED_P (insn) = 1;
3048 }
3049
3050 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3051    adjusting it by ADJUSTMENT afterwards.  */
3052
3053 static rtx
3054 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3055                          HOST_WIDE_INT adjustment)
3056 {
3057   switch (mode)
3058     {
3059     case DImode:
3060       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3061                                    GEN_INT (UNITS_PER_WORD));
3062     case DFmode:
3063       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3064                                    GEN_INT (UNITS_PER_WORD));
3065     default:
3066       gcc_unreachable ();
3067     }
3068 }
3069
3070 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3071    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3072    into CFI_OPS.  */
3073
3074 static void
3075 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3076                   rtx *cfi_ops)
3077 {
3078   machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3079   rtx reg1 = gen_rtx_REG (mode, regno1);
3080
3081   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3082
3083   if (regno2 == INVALID_REGNUM)
3084     {
3085       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3086       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3087       emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3088     }
3089   else
3090     {
3091       rtx reg2 = gen_rtx_REG (mode, regno2);
3092       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3093       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3094                                           reg2, adjustment));
3095     }
3096 }
3097
3098 /* Generate and return a store pair instruction of mode MODE to store
3099    register REG1 to MEM1 and register REG2 to MEM2.  */
3100
3101 static rtx
3102 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3103                         rtx reg2)
3104 {
3105   switch (mode)
3106     {
3107     case DImode:
3108       return gen_store_pairdi (mem1, reg1, mem2, reg2);
3109
3110     case DFmode:
3111       return gen_store_pairdf (mem1, reg1, mem2, reg2);
3112
3113     default:
3114       gcc_unreachable ();
3115     }
3116 }
3117
3118 /* Generate and regurn a load pair isntruction of mode MODE to load register
3119    REG1 from MEM1 and register REG2 from MEM2.  */
3120
3121 static rtx
3122 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3123                        rtx mem2)
3124 {
3125   switch (mode)
3126     {
3127     case DImode:
3128       return gen_load_pairdi (reg1, mem1, reg2, mem2);
3129
3130     case DFmode:
3131       return gen_load_pairdf (reg1, mem1, reg2, mem2);
3132
3133     default:
3134       gcc_unreachable ();
3135     }
3136 }
3137
3138 /* Return TRUE if return address signing should be enabled for the current
3139    function, otherwise return FALSE.  */
3140
3141 bool
3142 aarch64_return_address_signing_enabled (void)
3143 {
3144   /* This function should only be called after frame laid out.   */
3145   gcc_assert (cfun->machine->frame.laid_out);
3146
3147   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3148      if it's LR is pushed onto stack.  */
3149   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3150           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3151               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3152 }
3153
3154 /* Emit code to save the callee-saved registers from register number START
3155    to LIMIT to the stack at the location starting at offset START_OFFSET,
3156    skipping any write-back candidates if SKIP_WB is true.  */
3157
3158 static void
3159 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3160                            unsigned start, unsigned limit, bool skip_wb)
3161 {
3162   rtx_insn *insn;
3163   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3164                                                  ? gen_frame_mem : gen_rtx_MEM);
3165   unsigned regno;
3166   unsigned regno2;
3167
3168   for (regno = aarch64_next_callee_save (start, limit);
3169        regno <= limit;
3170        regno = aarch64_next_callee_save (regno + 1, limit))
3171     {
3172       rtx reg, mem;
3173       HOST_WIDE_INT offset;
3174
3175       if (skip_wb
3176           && (regno == cfun->machine->frame.wb_candidate1
3177               || regno == cfun->machine->frame.wb_candidate2))
3178         continue;
3179
3180       if (cfun->machine->reg_is_wrapped_separately[regno])
3181        continue;
3182
3183       reg = gen_rtx_REG (mode, regno);
3184       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3185       mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3186                                               offset));
3187
3188       regno2 = aarch64_next_callee_save (regno + 1, limit);
3189
3190       if (regno2 <= limit
3191           && !cfun->machine->reg_is_wrapped_separately[regno2]
3192           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3193               == cfun->machine->frame.reg_offset[regno2]))
3194
3195         {
3196           rtx reg2 = gen_rtx_REG (mode, regno2);
3197           rtx mem2;
3198
3199           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3200           mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3201                                                    offset));
3202           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3203                                                     reg2));
3204
3205           /* The first part of a frame-related parallel insn is
3206              always assumed to be relevant to the frame
3207              calculations; subsequent parts, are only
3208              frame-related if explicitly marked.  */
3209           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3210           regno = regno2;
3211         }
3212       else
3213         insn = emit_move_insn (mem, reg);
3214
3215       RTX_FRAME_RELATED_P (insn) = 1;
3216     }
3217 }
3218
3219 /* Emit code to restore the callee registers of mode MODE from register
3220    number START up to and including LIMIT.  Restore from the stack offset
3221    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3222    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
3223
3224 static void
3225 aarch64_restore_callee_saves (machine_mode mode,
3226                               HOST_WIDE_INT start_offset, unsigned start,
3227                               unsigned limit, bool skip_wb, rtx *cfi_ops)
3228 {
3229   rtx base_rtx = stack_pointer_rtx;
3230   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3231                                                  ? gen_frame_mem : gen_rtx_MEM);
3232   unsigned regno;
3233   unsigned regno2;
3234   HOST_WIDE_INT offset;
3235
3236   for (regno = aarch64_next_callee_save (start, limit);
3237        regno <= limit;
3238        regno = aarch64_next_callee_save (regno + 1, limit))
3239     {
3240       if (cfun->machine->reg_is_wrapped_separately[regno])
3241        continue;
3242
3243       rtx reg, mem;
3244
3245       if (skip_wb
3246           && (regno == cfun->machine->frame.wb_candidate1
3247               || regno == cfun->machine->frame.wb_candidate2))
3248         continue;
3249
3250       reg = gen_rtx_REG (mode, regno);
3251       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3252       mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3253
3254       regno2 = aarch64_next_callee_save (regno + 1, limit);
3255
3256       if (regno2 <= limit
3257           && !cfun->machine->reg_is_wrapped_separately[regno2]
3258           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3259               == cfun->machine->frame.reg_offset[regno2]))
3260         {
3261           rtx reg2 = gen_rtx_REG (mode, regno2);
3262           rtx mem2;
3263
3264           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3265           mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3266           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3267
3268           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3269           regno = regno2;
3270         }
3271       else
3272         emit_move_insn (reg, mem);
3273       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3274     }
3275 }
3276
3277 static inline bool
3278 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3279                                HOST_WIDE_INT offset)
3280 {
3281   return offset >= -256 && offset < 256;
3282 }
3283
3284 static inline bool
3285 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3286 {
3287   return (offset >= 0
3288           && offset < 4096 * GET_MODE_SIZE (mode)
3289           && offset % GET_MODE_SIZE (mode) == 0);
3290 }
3291
3292 bool
3293 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3294 {
3295   return (offset >= -64 * GET_MODE_SIZE (mode)
3296           && offset < 64 * GET_MODE_SIZE (mode)
3297           && offset % GET_MODE_SIZE (mode) == 0);
3298 }
3299
3300 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
3301
3302 static sbitmap
3303 aarch64_get_separate_components (void)
3304 {
3305   aarch64_layout_frame ();
3306
3307   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3308   bitmap_clear (components);
3309
3310   /* The registers we need saved to the frame.  */
3311   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3312     if (aarch64_register_saved_on_entry (regno))
3313       {
3314         HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3315         if (!frame_pointer_needed)
3316           offset += cfun->machine->frame.frame_size
3317                     - cfun->machine->frame.hard_fp_offset;
3318         /* Check that we can access the stack slot of the register with one
3319            direct load with no adjustments needed.  */
3320         if (offset_12bit_unsigned_scaled_p (DImode, offset))
3321           bitmap_set_bit (components, regno);
3322       }
3323
3324   /* Don't mess with the hard frame pointer.  */
3325   if (frame_pointer_needed)
3326     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3327
3328   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3329   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3330   /* If aarch64_layout_frame has chosen registers to store/restore with
3331      writeback don't interfere with them to avoid having to output explicit
3332      stack adjustment instructions.  */
3333   if (reg2 != INVALID_REGNUM)
3334     bitmap_clear_bit (components, reg2);
3335   if (reg1 != INVALID_REGNUM)
3336     bitmap_clear_bit (components, reg1);
3337
3338   bitmap_clear_bit (components, LR_REGNUM);
3339   bitmap_clear_bit (components, SP_REGNUM);
3340
3341   return components;
3342 }
3343
3344 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
3345
3346 static sbitmap
3347 aarch64_components_for_bb (basic_block bb)
3348 {
3349   bitmap in = DF_LIVE_IN (bb);
3350   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3351   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3352
3353   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3354   bitmap_clear (components);
3355
3356   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
3357   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3358     if ((!call_used_regs[regno])
3359        && (bitmap_bit_p (in, regno)
3360            || bitmap_bit_p (gen, regno)
3361            || bitmap_bit_p (kill, regno)))
3362           bitmap_set_bit (components, regno);
3363
3364   return components;
3365 }
3366
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64: the body is intentionally empty and all four
   arguments are ignored, which is why they are left unnamed.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}
3374
3375 /* Return the next set bit in BMP from START onwards.  Return the total number
3376    of bits in BMP if no set bit is found at or after START.  */
3377
3378 static unsigned int
3379 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3380 {
3381   unsigned int nbits = SBITMAP_SIZE (bmp);
3382   if (start == nbits)
3383     return start;
3384
3385   gcc_assert (start < nbits);
3386   for (unsigned int i = start; i < nbits; i++)
3387     if (bitmap_bit_p (bmp, i))
3388       return i;
3389
3390   return nbits;
3391 }
3392
3393 /* Do the work for aarch64_emit_prologue_components and
3394    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
3395    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3396    for these components or the epilogue sequence.  That is, it determines
3397    whether we should emit stores or loads and what kind of CFA notes to attach
3398    to the insns.  Otherwise the logic for the two sequences is very
3399    similar.  */
3400
3401 static void
3402 aarch64_process_components (sbitmap components, bool prologue_p)
3403 {
3404   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3405                              ? HARD_FRAME_POINTER_REGNUM
3406                              : STACK_POINTER_REGNUM);
3407
3408   unsigned last_regno = SBITMAP_SIZE (components);
3409   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3410   rtx_insn *insn = NULL;
3411
3412   while (regno != last_regno)
3413     {
3414       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3415          so DFmode for the vector registers is enough.  */
3416       machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3417       rtx reg = gen_rtx_REG (mode, regno);
3418       HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3419       if (!frame_pointer_needed)
3420         offset += cfun->machine->frame.frame_size
3421                   - cfun->machine->frame.hard_fp_offset;
3422       rtx addr = plus_constant (Pmode, ptr_reg, offset);
3423       rtx mem = gen_frame_mem (mode, addr);
3424
3425       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3426       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3427       /* No more registers to handle after REGNO.
3428          Emit a single save/restore and exit.  */
3429       if (regno2 == last_regno)
3430         {
3431           insn = emit_insn (set);
3432           RTX_FRAME_RELATED_P (insn) = 1;
3433           if (prologue_p)
3434             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3435           else
3436             add_reg_note (insn, REG_CFA_RESTORE, reg);
3437           break;
3438         }
3439
3440       HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3441       /* The next register is not of the same class or its offset is not
3442          mergeable with the current one into a pair.  */
3443       if (!satisfies_constraint_Ump (mem)
3444           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3445           || (offset2 - cfun->machine->frame.reg_offset[regno])
3446                 != GET_MODE_SIZE (mode))
3447         {
3448           insn = emit_insn (set);
3449           RTX_FRAME_RELATED_P (insn) = 1;
3450           if (prologue_p)
3451             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3452           else
3453             add_reg_note (insn, REG_CFA_RESTORE, reg);
3454
3455           regno = regno2;
3456           continue;
3457         }
3458
3459       /* REGNO2 can be saved/restored in a pair with REGNO.  */
3460       rtx reg2 = gen_rtx_REG (mode, regno2);
3461       if (!frame_pointer_needed)
3462         offset2 += cfun->machine->frame.frame_size
3463                   - cfun->machine->frame.hard_fp_offset;
3464       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3465       rtx mem2 = gen_frame_mem (mode, addr2);
3466       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3467                              : gen_rtx_SET (reg2, mem2);
3468
3469       if (prologue_p)
3470         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3471       else
3472         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3473
3474       RTX_FRAME_RELATED_P (insn) = 1;
3475       if (prologue_p)
3476         {
3477           add_reg_note (insn, REG_CFA_OFFSET, set);
3478           add_reg_note (insn, REG_CFA_OFFSET, set2);
3479         }
3480       else
3481         {
3482           add_reg_note (insn, REG_CFA_RESTORE, reg);
3483           add_reg_note (insn, REG_CFA_RESTORE, reg2);
3484         }
3485
3486       regno = aarch64_get_next_set_bit (components, regno2 + 1);
3487     }
3488 }
3489
3490 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
3491
3492 static void
3493 aarch64_emit_prologue_components (sbitmap components)
3494 {
3495   aarch64_process_components (components, true);
3496 }
3497
3498 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
3499
3500 static void
3501 aarch64_emit_epilogue_components (sbitmap components)
3502 {
3503   aarch64_process_components (components, false);
3504 }
3505
3506 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
3507
3508 static void
3509 aarch64_set_handled_components (sbitmap components)
3510 {
3511   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3512     if (bitmap_bit_p (components, regno))
3513       cfun->machine->reg_is_wrapped_separately[regno] = true;
3514 }
3515
3516 /* AArch64 stack frames generated by this compiler look like:
3517
3518         +-------------------------------+
3519         |                               |
3520         |  incoming stack arguments     |
3521         |                               |
3522         +-------------------------------+
3523         |                               | <-- incoming stack pointer (aligned)
3524         |  callee-allocated save area   |
3525         |  for register varargs         |
3526         |                               |
3527         +-------------------------------+
3528         |  local variables              | <-- frame_pointer_rtx
3529         |                               |
3530         +-------------------------------+
3531         |  padding0                     | \
3532         +-------------------------------+  |
3533         |  callee-saved registers       |  | frame.saved_regs_size
3534         +-------------------------------+  |
3535         |  LR'                          |  |
3536         +-------------------------------+  |
3537         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
3538         +-------------------------------+
3539         |  dynamic allocation           |
3540         +-------------------------------+
3541         |  padding                      |
3542         +-------------------------------+
3543         |  outgoing stack arguments     | <-- arg_pointer
3544         |                               |
3545         +-------------------------------+
3546         |                               | <-- stack_pointer_rtx (aligned)
3547
3548    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3549    but leave frame_pointer_rtx and hard_frame_pointer_rtx
3550    unchanged.  */
3551
3552 /* Generate the prologue instructions for entry into a function.
3553    Establish the stack frame by decreasing the stack pointer with a
3554    properly calculated size and, if necessary, create a frame record
3555    filled with the values of LR and previous frame pointer.  The
3556    current FP is also set up if it is in use.  */
3557
3558 void
3559 aarch64_expand_prologue (void)
3560 {
3561   aarch64_layout_frame ();
3562
3563   HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3564   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3565   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3566   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3567   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3568   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3569   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3570   rtx_insn *insn;
3571
3572   /* Sign return address for functions.  */
3573   if (aarch64_return_address_signing_enabled ())
3574     {
3575       insn = emit_insn (gen_pacisp ());
3576       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3577       RTX_FRAME_RELATED_P (insn) = 1;
3578     }
3579
3580   if (flag_stack_usage_info)
3581     current_function_static_stack_size = frame_size;
3582
3583   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3584     {
3585       if (crtl->is_leaf && !cfun->calls_alloca)
3586         {
3587           if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3588             aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3589                                             frame_size - STACK_CHECK_PROTECT);
3590         }
3591       else if (frame_size > 0)
3592         aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3593     }
3594
3595   aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3596
3597   if (callee_adjust != 0)
3598     aarch64_push_regs (reg1, reg2, callee_adjust);
3599
3600   if (frame_pointer_needed)
3601     {
3602       if (callee_adjust == 0)
3603         aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3604                                    R30_REGNUM, false);
3605       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3606                                        stack_pointer_rtx,
3607                                        GEN_INT (callee_offset)));
3608       RTX_FRAME_RELATED_P (insn) = 1;
3609       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3610     }
3611
3612   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3613                              callee_adjust != 0 || frame_pointer_needed);
3614   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3615                              callee_adjust != 0 || frame_pointer_needed);
3616   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3617 }
3618
3619 /* Return TRUE if we can use a simple_return insn.
3620
3621    This function checks whether the callee saved stack is empty, which
3622    means no restore actions are need. The pro_and_epilogue will use
3623    this to check whether shrink-wrapping opt is feasible.  */
3624
3625 bool
3626 aarch64_use_return_insn_p (void)
3627 {
3628   if (!reload_completed)
3629     return false;
3630
3631   if (crtl->profile)
3632     return false;
3633
3634   aarch64_layout_frame ();
3635
3636   return cfun->machine->frame.frame_size == 0;
3637 }
3638
3639 /* Generate the epilogue instructions for returning from a function.
3640    This is almost exactly the reverse of the prolog sequence, except
3641    that we need to insert barriers to avoid scheduling loads that read
3642    from a deallocated stack, and we optimize the unwind records by
3643    emitting them all together if possible.  */
3644 void
3645 aarch64_expand_epilogue (bool for_sibcall)
3646 {
3647   aarch64_layout_frame ();
3648
3649   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3650   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3651   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3652   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3653   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3654   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3655   rtx cfi_ops = NULL;
3656   rtx_insn *insn;
3657
3658   /* We need to add memory barrier to prevent read from deallocated stack.  */
3659   bool need_barrier_p = (get_frame_size ()
3660                          + cfun->machine->frame.saved_varargs_size) != 0;
3661
3662   /* Emit a barrier to prevent loads from a deallocated stack.  */
3663   if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3664       || crtl->calls_eh_return)
3665     {
3666       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3667       need_barrier_p = false;
3668     }
3669
3670   /* Restore the stack pointer from the frame pointer if it may not
3671      be the same as the stack pointer.  */
3672   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3673     {
3674       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3675                                        hard_frame_pointer_rtx,
3676                                        GEN_INT (-callee_offset)));
3677       /* If writeback is used when restoring callee-saves, the CFA
3678          is restored on the instruction doing the writeback.  */
3679       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3680     }
3681   else
3682     aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3683
3684   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3685                                 callee_adjust != 0, &cfi_ops);
3686   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3687                                 callee_adjust != 0, &cfi_ops);
3688
3689   if (need_barrier_p)
3690     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3691
3692   if (callee_adjust != 0)
3693     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3694
3695   if (callee_adjust != 0 || initial_adjust > 65536)
3696     {
3697       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
3698       insn = get_last_insn ();
3699       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3700       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3701       RTX_FRAME_RELATED_P (insn) = 1;
3702       cfi_ops = NULL;
3703     }
3704
3705   aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3706
3707   if (cfi_ops)
3708     {
3709       /* Emit delayed restores and reset the CFA to be SP.  */
3710       insn = get_last_insn ();
3711       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3712       REG_NOTES (insn) = cfi_ops;
3713       RTX_FRAME_RELATED_P (insn) = 1;
3714     }
3715
3716   /* We prefer to emit the combined return/authenticate instruction RETAA,
3717      however there are three cases in which we must instead emit an explicit
3718      authentication instruction.
3719
3720         1) Sibcalls don't return in a normal way, so if we're about to call one
3721            we must authenticate.
3722
3723         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3724            generating code for !TARGET_ARMV8_3 we can't use it and must
3725            explicitly authenticate.
3726
3727         3) On an eh_return path we make extra stack adjustments to update the
3728            canonical frame address to be the exception handler's CFA.  We want
3729            to authenticate using the CFA of the function which calls eh_return.
3730     */
3731   if (aarch64_return_address_signing_enabled ()
3732       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3733     {
3734       insn = emit_insn (gen_autisp ());
3735       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3736       RTX_FRAME_RELATED_P (insn) = 1;
3737     }
3738
3739   /* Stack adjustment for exception handler.  */
3740   if (crtl->calls_eh_return)
3741     {
3742       /* We need to unwind the stack by the offset computed by
3743          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3744          to be SP; letting the CFA move during this adjustment
3745          is just as correct as retaining the CFA from the body
3746          of the function.  Therefore, do nothing special.  */
3747       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3748     }
3749
3750   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3751   if (!for_sibcall)
3752     emit_jump_insn (ret_rtx);
3753 }
3754
3755 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
3756    normally or return to a previous frame after unwinding.
3757
3758    An EH return uses a single shared return sequence.  The epilogue is
3759    exactly like a normal epilogue except that it has an extra input
3760    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3761    that must be applied after the frame has been destroyed.  An extra label
3762    is inserted before the epilogue which initializes this register to zero,
3763    and this is the entry point for a normal return.
3764
3765    An actual EH return updates the return address, initializes the stack
3766    adjustment and jumps directly into the epilogue (bypassing the zeroing
3767    of the adjustment).  Since the return address is typically saved on the
3768    stack when a function makes a call, the saved LR must be updated outside
3769    the epilogue.
3770
3771    This poses problems as the store is generated well before the epilogue,
3772    so the offset of LR is not known yet.  Also optimizations will remove the
3773    store as it appears dead, even after the epilogue is generated (as the
3774    base or offset for loading LR is different in many cases).
3775
3776    To avoid these problems this implementation forces the frame pointer
3777    in eh_return functions so that the location of LR is fixed and known early.
3778    It also marks the store volatile, so no optimization is permitted to
3779    remove the store.  */
3780 rtx
3781 aarch64_eh_return_handler_rtx (void)
3782 {
3783   rtx tmp = gen_frame_mem (Pmode,
3784     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3785
3786   /* Mark the store volatile, so no optimization is permitted to remove it.  */
3787   MEM_VOLATILE_P (tmp) = true;
3788   return tmp;
3789 }
3790
3791 /* Output code to add DELTA to the first argument, and then jump
3792    to FUNCTION.  Used for C++ multiple inheritance.  */
3793 static void
3794 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3795                          HOST_WIDE_INT delta,
3796                          HOST_WIDE_INT vcall_offset,
3797                          tree function)
3798 {
3799   /* The this pointer is always in x0.  Note that this differs from
3800      Arm where the this pointer maybe bumped to r1 if r0 is required
3801      to return a pointer to an aggregate.  On AArch64 a result value
3802      pointer will be in x8.  */
3803   int this_regno = R0_REGNUM;
3804   rtx this_rtx, temp0, temp1, addr, funexp;
3805   rtx_insn *insn;
3806
3807   reload_completed = 1;
3808   emit_note (NOTE_INSN_PROLOGUE_END);
3809
3810   if (vcall_offset == 0)
3811     aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3812   else
3813     {
3814       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3815
3816       this_rtx = gen_rtx_REG (Pmode, this_regno);
3817       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3818       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3819
3820       addr = this_rtx;
3821       if (delta != 0)
3822         {
3823           if (delta >= -256 && delta < 256)
3824             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3825                                        plus_constant (Pmode, this_rtx, delta));
3826           else
3827             aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3828         }
3829
3830       if (Pmode == ptr_mode)
3831         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3832       else
3833         aarch64_emit_move (temp0,
3834                            gen_rtx_ZERO_EXTEND (Pmode,
3835                                                 gen_rtx_MEM (ptr_mode, addr)));
3836
3837       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3838           addr = plus_constant (Pmode, temp0, vcall_offset);
3839       else
3840         {
3841           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3842                                           Pmode);
3843           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3844         }
3845
3846       if (Pmode == ptr_mode)
3847         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3848       else
3849         aarch64_emit_move (temp1,
3850                            gen_rtx_SIGN_EXTEND (Pmode,
3851                                                 gen_rtx_MEM (ptr_mode, addr)));
3852
3853       emit_insn (gen_add2_insn (this_rtx, temp1));
3854     }
3855
3856   /* Generate a tail call to the target function.  */
3857   if (!TREE_USED (function))
3858     {
3859       assemble_external (function);
3860       TREE_USED (function) = 1;
3861     }
3862   funexp = XEXP (DECL_RTL (function), 0);
3863   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3864   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3865   SIBLING_CALL_P (insn) = 1;
3866
3867   insn = get_insns ();
3868   shorten_branches (insn);
3869   final_start_function (insn, file, 1);
3870   final (insn, file, 1);
3871   final_end_function ();
3872
3873   /* Stop pretending to be a post-reload pass.  */
3874   reload_completed = 0;
3875 }
3876
3877 static bool
3878 aarch64_tls_referenced_p (rtx x)
3879 {
3880   if (!TARGET_HAVE_TLS)
3881     return false;
3882   subrtx_iterator::array_type array;
3883   FOR_EACH_SUBRTX (iter, array, x, ALL)
3884     {
3885       const_rtx x = *iter;
3886       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3887         return true;
3888       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3889          TLS offsets, not real symbol references.  */
3890       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3891         iter.skip_subrtxes ();
3892     }
3893   return false;
3894 }
3895
3896
3897 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3898    a left shift of 0 or 12 bits.  */
3899 bool
3900 aarch64_uimm12_shift (HOST_WIDE_INT val)
3901 {
3902   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3903           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3904           );
3905 }
3906
3907
3908 /* Return true if val is an immediate that can be loaded into a
3909    register by a MOVZ instruction.  */
3910 static bool
3911 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3912 {
3913   if (GET_MODE_SIZE (mode) > 4)
3914     {
3915       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3916           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3917         return 1;
3918     }
3919   else
3920     {
3921       /* Ignore sign extension.  */
3922       val &= (HOST_WIDE_INT) 0xffffffff;
3923     }
3924   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3925           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3926 }
3927
3928 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
3929
3930 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3931   {
3932     0x0000000100000001ull,
3933     0x0001000100010001ull,
3934     0x0101010101010101ull,
3935     0x1111111111111111ull,
3936     0x5555555555555555ull,
3937   };
3938
3939
3940 /* Return true if val is a valid bitmask immediate.  */
3941
3942 bool
3943 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3944 {
3945   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3946   int bits;
3947
3948   /* Check for a single sequence of one bits and return quickly if so.
3949      The special cases of all ones and all zeroes returns false.  */
3950   val = (unsigned HOST_WIDE_INT) val_in;
3951   tmp = val + (val & -val);
3952
3953   if (tmp == (tmp & -tmp))
3954     return (val + 1) > 1;
3955
3956   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
3957   if (mode == SImode)
3958     val = (val << 32) | (val & 0xffffffff);
3959
3960   /* Invert if the immediate doesn't start with a zero bit - this means we
3961      only need to search for sequences of one bits.  */
3962   if (val & 1)
3963     val = ~val;
3964
3965   /* Find the first set bit and set tmp to val with the first sequence of one
3966      bits removed.  Return success if there is a single sequence of ones.  */
3967   first_one = val & -val;
3968   tmp = val & (val + first_one);
3969
3970   if (tmp == 0)
3971     return true;
3972
3973   /* Find the next set bit and compute the difference in bit position.  */
3974   next_one = tmp & -tmp;
3975   bits = clz_hwi (first_one) - clz_hwi (next_one);
3976   mask = val ^ tmp;
3977
3978   /* Check the bit position difference is a power of 2, and that the first
3979      sequence of one bits fits within 'bits' bits.  */
3980   if ((mask >> bits) != 0 || bits != (bits & -bits))
3981     return false;
3982
3983   /* Check the sequence of one bits is repeated 64/bits times.  */
3984   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
3985 }
3986
3987 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
3988    Assumed precondition: VAL_IN Is not zero.  */
3989
3990 unsigned HOST_WIDE_INT
3991 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
3992 {
3993   int lowest_bit_set = ctz_hwi (val_in);
3994   int highest_bit_set = floor_log2 (val_in);
3995   gcc_assert (val_in != 0);
3996
3997   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
3998           (HOST_WIDE_INT_1U << lowest_bit_set));
3999 }
4000
4001 /* Create constant where bits outside of lowest bit set to highest bit set
4002    are set to 1.  */
4003
4004 unsigned HOST_WIDE_INT
4005 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4006 {
4007   return val_in | ~aarch64_and_split_imm1 (val_in);
4008 }
4009
4010 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4011
4012 bool
4013 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4014 {
4015   if (aarch64_bitmask_imm (val_in, mode))
4016     return false;
4017
4018   if (aarch64_move_imm (val_in, mode))
4019     return false;
4020
4021   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4022
4023   return aarch64_bitmask_imm (imm2, mode);
4024 }
4025
4026 /* Return true if val is an immediate that can be loaded into a
4027    register in a single instruction.  */
4028 bool
4029 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4030 {
4031   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4032     return 1;
4033   return aarch64_bitmask_imm (val, mode);
4034 }
4035
4036 static bool
4037 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4038 {
4039   rtx base, offset;
4040
4041   if (GET_CODE (x) == HIGH)
4042     return true;
4043
4044   split_const (x, &base, &offset);
4045   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4046     {
4047       if (aarch64_classify_symbol (base, offset)
4048           != SYMBOL_FORCE_TO_MEM)
4049         return true;
4050       else
4051         /* Avoid generating a 64-bit relocation in ILP32; leave
4052            to aarch64_expand_mov_immediate to handle it properly.  */
4053         return mode != ptr_mode;
4054     }
4055
4056   return aarch64_tls_referenced_p (x);
4057 }
4058
4059 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4060    The expansion for a table switch is quite expensive due to the number
4061    of instructions, the table lookup and hard to predict indirect jump.
4062    When optimizing for speed, and -O3 enabled, use the per-core tuning if
4063    set, otherwise use tables for > 16 cases as a tradeoff between size and
4064    performance.  When optimizing for size, use the default setting.  */
4065
4066 static unsigned int
4067 aarch64_case_values_threshold (void)
4068 {
4069   /* Use the specified limit for the number of cases before using jump
4070      tables at higher optimization levels.  */
4071   if (optimize > 2
4072       && selected_cpu->tune->max_case_values != 0)
4073     return selected_cpu->tune->max_case_values;
4074   else
4075     return optimize_size ? default_case_values_threshold () : 17;
4076 }
4077
4078 /* Return true if register REGNO is a valid index register.
4079    STRICT_P is true if REG_OK_STRICT is in effect.  */
4080
4081 bool
4082 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4083 {
4084   if (!HARD_REGISTER_NUM_P (regno))
4085     {
4086       if (!strict_p)
4087         return true;
4088
4089       if (!reg_renumber)
4090         return false;
4091
4092       regno = reg_renumber[regno];
4093     }
4094   return GP_REGNUM_P (regno);
4095 }
4096
4097 /* Return true if register REGNO is a valid base register for mode MODE.
4098    STRICT_P is true if REG_OK_STRICT is in effect.  */
4099
4100 bool
4101 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4102 {
4103   if (!HARD_REGISTER_NUM_P (regno))
4104     {
4105       if (!strict_p)
4106         return true;
4107
4108       if (!reg_renumber)
4109         return false;
4110
4111       regno = reg_renumber[regno];
4112     }
4113
4114   /* The fake registers will be eliminated to either the stack or
4115      hard frame pointer, both of which are usually valid base registers.
4116      Reload deals with the cases where the eliminated form isn't valid.  */
4117   return (GP_REGNUM_P (regno)
4118           || regno == SP_REGNUM
4119           || regno == FRAME_POINTER_REGNUM
4120           || regno == ARG_POINTER_REGNUM);
4121 }
4122
4123 /* Return true if X is a valid base register for mode MODE.
4124    STRICT_P is true if REG_OK_STRICT is in effect.  */
4125
4126 static bool
4127 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4128 {
4129   if (!strict_p && GET_CODE (x) == SUBREG)
4130     x = SUBREG_REG (x);
4131
4132   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4133 }
4134
4135 /* Return true if address offset is a valid index.  If it is, fill in INFO
4136    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
4137
4138 static bool
4139 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4140                         machine_mode mode, bool strict_p)
4141 {
4142   enum aarch64_address_type type;
4143   rtx index;
4144   int shift;
4145
4146   /* (reg:P) */
4147   if ((REG_P (x) || GET_CODE (x) == SUBREG)
4148       && GET_MODE (x) == Pmode)
4149     {
4150       type = ADDRESS_REG_REG;
4151       index = x;
4152       shift = 0;
4153     }
4154   /* (sign_extend:DI (reg:SI)) */
4155   else if ((GET_CODE (x) == SIGN_EXTEND
4156             || GET_CODE (x) == ZERO_EXTEND)
4157            && GET_MODE (x) == DImode
4158            && GET_MODE (XEXP (x, 0)) == SImode)
4159     {
4160       type = (GET_CODE (x) == SIGN_EXTEND)
4161         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4162       index = XEXP (x, 0);
4163       shift = 0;
4164     }
4165   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4166   else if (GET_CODE (x) == MULT
4167            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4168                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4169            && GET_MODE (XEXP (x, 0)) == DImode
4170            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4171            && CONST_INT_P (XEXP (x, 1)))
4172     {
4173       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4174         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4175       index = XEXP (XEXP (x, 0), 0);
4176       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4177     }
4178   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4179   else if (GET_CODE (x) == ASHIFT
4180            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4181                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4182            && GET_MODE (XEXP (x, 0)) == DImode
4183            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4184            && CONST_INT_P (XEXP (x, 1)))
4185     {
4186       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4187         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4188       index = XEXP (XEXP (x, 0), 0);
4189       shift = INTVAL (XEXP (x, 1));
4190     }
4191   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4192   else if ((GET_CODE (x) == SIGN_EXTRACT
4193             || GET_CODE (x) == ZERO_EXTRACT)
4194            && GET_MODE (x) == DImode
4195            && GET_CODE (XEXP (x, 0)) == MULT
4196            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4197            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4198     {
4199       type = (GET_CODE (x) == SIGN_EXTRACT)
4200         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4201       index = XEXP (XEXP (x, 0), 0);
4202       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4203       if (INTVAL (XEXP (x, 1)) != 32 + shift
4204           || INTVAL (XEXP (x, 2)) != 0)
4205         shift = -1;
4206     }
4207   /* (and:DI (mult:DI (reg:DI) (const_int scale))
4208      (const_int 0xffffffff<<shift)) */
4209   else if (GET_CODE (x) == AND
4210            && GET_MODE (x) == DImode
4211            && GET_CODE (XEXP (x, 0)) == MULT
4212            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4213            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4214            && CONST_INT_P (XEXP (x, 1)))
4215     {
4216       type = ADDRESS_REG_UXTW;
4217       index = XEXP (XEXP (x, 0), 0);
4218       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4219       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4220         shift = -1;
4221     }
4222   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4223   else if ((GET_CODE (x) == SIGN_EXTRACT
4224             || GET_CODE (x) == ZERO_EXTRACT)
4225            && GET_MODE (x) == DImode
4226            && GET_CODE (XEXP (x, 0)) == ASHIFT
4227            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4228            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4229     {
4230       type = (GET_CODE (x) == SIGN_EXTRACT)
4231         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4232       index = XEXP (XEXP (x, 0), 0);
4233       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4234       if (INTVAL (XEXP (x, 1)) != 32 + shift
4235           || INTVAL (XEXP (x, 2)) != 0)
4236         shift = -1;
4237     }
4238   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4239      (const_int 0xffffffff<<shift)) */
4240   else if (GET_CODE (x) == AND
4241            && GET_MODE (x) == DImode
4242            && GET_CODE (XEXP (x, 0)) == ASHIFT
4243            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4244            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4245            && CONST_INT_P (XEXP (x, 1)))
4246     {
4247       type = ADDRESS_REG_UXTW;
4248       index = XEXP (XEXP (x, 0), 0);
4249       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4250       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4251         shift = -1;
4252     }
4253   /* (mult:P (reg:P) (const_int scale)) */
4254   else if (GET_CODE (x) == MULT
4255            && GET_MODE (x) == Pmode
4256            && GET_MODE (XEXP (x, 0)) == Pmode
4257            && CONST_INT_P (XEXP (x, 1)))
4258     {
4259       type = ADDRESS_REG_REG;
4260       index = XEXP (x, 0);
4261       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4262     }
4263   /* (ashift:P (reg:P) (const_int shift)) */
4264   else if (GET_CODE (x) == ASHIFT
4265            && GET_MODE (x) == Pmode
4266            && GET_MODE (XEXP (x, 0)) == Pmode
4267            && CONST_INT_P (XEXP (x, 1)))
4268     {
4269       type = ADDRESS_REG_REG;
4270       index = XEXP (x, 0);
4271       shift = INTVAL (XEXP (x, 1));
4272     }
4273   else
4274     return false;
4275
4276   if (GET_CODE (index) == SUBREG)
4277     index = SUBREG_REG (index);
4278
4279   if ((shift == 0 ||
4280        (shift > 0 && shift <= 3
4281         && (1 << shift) == GET_MODE_SIZE (mode)))
4282       && REG_P (index)
4283       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4284     {
4285       info->type = type;
4286       info->offset = index;
4287       info->shift = shift;
4288       return true;
4289     }
4290
4291   return false;
4292 }
4293
4294 /* Return true if MODE is one of the modes for which we
4295    support LDP/STP operations.  */
4296
4297 static bool
4298 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4299 {
4300   return mode == SImode || mode == DImode
4301          || mode == SFmode || mode == DFmode
4302          || (aarch64_vector_mode_supported_p (mode)
4303              && GET_MODE_SIZE (mode) == 8);
4304 }
4305
4306 /* Return true if REGNO is a virtual pointer register, or an eliminable
4307    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
4308    include stack_pointer or hard_frame_pointer.  */
4309 static bool
4310 virt_or_elim_regno_p (unsigned regno)
4311 {
4312   return ((regno >= FIRST_VIRTUAL_REGISTER
4313            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4314           || regno == FRAME_POINTER_REGNUM
4315           || regno == ARG_POINTER_REGNUM);
4316 }
4317
4318 /* Return true if X is a valid address for machine mode MODE.  If it is,
4319    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
4320    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
4321
4322 static bool
4323 aarch64_classify_address (struct aarch64_address_info *info,
4324                           rtx x, machine_mode mode,
4325                           RTX_CODE outer_code, bool strict_p)
4326 {
4327   enum rtx_code code = GET_CODE (x);
4328   rtx op0, op1;
4329
4330   /* On BE, we use load/store pair for all large int mode load/stores.
4331      TI/TFmode may also use a load/store pair.  */
4332   bool load_store_pair_p = (outer_code == PARALLEL
4333                             || mode == TImode
4334                             || mode == TFmode
4335                             || (BYTES_BIG_ENDIAN
4336                                 && aarch64_vect_struct_mode_p (mode)));
4337
4338   bool allow_reg_index_p =
4339     !load_store_pair_p
4340     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4341     && !aarch64_vect_struct_mode_p (mode);
4342
4343   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4344      REG addressing.  */
4345   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4346       && (code != POST_INC && code != REG))
4347     return false;
4348
4349   switch (code)
4350     {
4351     case REG:
4352     case SUBREG:
4353       info->type = ADDRESS_REG_IMM;
4354       info->base = x;
4355       info->offset = const0_rtx;
4356       return aarch64_base_register_rtx_p (x, strict_p);
4357
4358     case PLUS:
4359       op0 = XEXP (x, 0);
4360       op1 = XEXP (x, 1);
4361
4362       if (! strict_p
4363           && REG_P (op0)
4364           && virt_or_elim_regno_p (REGNO (op0))
4365           && CONST_INT_P (op1))
4366         {
4367           info->type = ADDRESS_REG_IMM;
4368           info->base = op0;
4369           info->offset = op1;
4370
4371           return true;
4372         }
4373
4374       if (GET_MODE_SIZE (mode) != 0
4375           && CONST_INT_P (op1)
4376           && aarch64_base_register_rtx_p (op0, strict_p))
4377         {
4378           HOST_WIDE_INT offset = INTVAL (op1);
4379
4380           info->type = ADDRESS_REG_IMM;
4381           info->base = op0;
4382           info->offset = op1;
4383
4384           /* TImode and TFmode values are allowed in both pairs of X
4385              registers and individual Q registers.  The available
4386              address modes are:
4387              X,X: 7-bit signed scaled offset
4388              Q:   9-bit signed offset
4389              We conservatively require an offset representable in either mode.
4390              When performing the check for pairs of X registers i.e.  LDP/STP
4391              pass down DImode since that is the natural size of the LDP/STP
4392              instruction memory accesses.  */
4393           if (mode == TImode || mode == TFmode)
4394             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4395                     && (offset_9bit_signed_unscaled_p (mode, offset)
4396                         || offset_12bit_unsigned_scaled_p (mode, offset)));
4397
4398           /* A 7bit offset check because OImode will emit a ldp/stp
4399              instruction (only big endian will get here).
4400              For ldp/stp instructions, the offset is scaled for the size of a
4401              single element of the pair.  */
4402           if (mode == OImode)
4403             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4404
4405           /* Three 9/12 bit offsets checks because CImode will emit three
4406              ldr/str instructions (only big endian will get here).  */
4407           if (mode == CImode)
4408             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4409                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4410                         || offset_12bit_unsigned_scaled_p (V16QImode,
4411                                                            offset + 32)));
4412
4413           /* Two 7bit offsets checks because XImode will emit two ldp/stp
4414              instructions (only big endian will get here).  */
4415           if (mode == XImode)
4416             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4417                     && aarch64_offset_7bit_signed_scaled_p (TImode,
4418                                                             offset + 32));
4419
4420           if (load_store_pair_p)
4421             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4422                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4423           else
4424             return (offset_9bit_signed_unscaled_p (mode, offset)
4425                     || offset_12bit_unsigned_scaled_p (mode, offset));
4426         }
4427
4428       if (allow_reg_index_p)
4429         {
4430           /* Look for base + (scaled/extended) index register.  */
4431           if (aarch64_base_register_rtx_p (op0, strict_p)
4432               && aarch64_classify_index (info, op1, mode, strict_p))
4433             {
4434               info->base = op0;
4435               return true;
4436             }
4437           if (aarch64_base_register_rtx_p (op1, strict_p)
4438               && aarch64_classify_index (info, op0, mode, strict_p))
4439             {
4440               info->base = op1;
4441               return true;
4442             }
4443         }
4444
4445       return false;
4446
4447     case POST_INC:
4448     case POST_DEC:
4449     case PRE_INC:
4450     case PRE_DEC:
4451       info->type = ADDRESS_REG_WB;
4452       info->base = XEXP (x, 0);
4453       info->offset = NULL_RTX;
4454       return aarch64_base_register_rtx_p (info->base, strict_p);
4455
4456     case POST_MODIFY:
4457     case PRE_MODIFY:
4458       info->type = ADDRESS_REG_WB;
4459       info->base = XEXP (x, 0);
4460       if (GET_CODE (XEXP (x, 1)) == PLUS
4461           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4462           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4463           && aarch64_base_register_rtx_p (info->base, strict_p))
4464         {
4465           HOST_WIDE_INT offset;
4466           info->offset = XEXP (XEXP (x, 1), 1);
4467           offset = INTVAL (info->offset);
4468
4469           /* TImode and TFmode values are allowed in both pairs of X
4470              registers and individual Q registers.  The available
4471              address modes are:
4472              X,X: 7-bit signed scaled offset
4473              Q:   9-bit signed offset
4474              We conservatively require an offset representable in either mode.
4475            */
4476           if (mode == TImode || mode == TFmode)
4477             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4478                     && offset_9bit_signed_unscaled_p (mode, offset));
4479
4480           if (load_store_pair_p)
4481             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4482                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4483           else
4484             return offset_9bit_signed_unscaled_p (mode, offset);
4485         }
4486       return false;
4487
4488     case CONST:
4489     case SYMBOL_REF:
4490     case LABEL_REF:
4491       /* load literal: pc-relative constant pool entry.  Only supported
4492          for SI mode or larger.  */
4493       info->type = ADDRESS_SYMBOLIC;
4494
4495       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4496         {
4497           rtx sym, addend;
4498
4499           split_const (x, &sym, &addend);
4500           return ((GET_CODE (sym) == LABEL_REF
4501                    || (GET_CODE (sym) == SYMBOL_REF
4502                        && CONSTANT_POOL_ADDRESS_P (sym)
4503                        && aarch64_pcrelative_literal_loads)));
4504         }
4505       return false;
4506
4507     case LO_SUM:
4508       info->type = ADDRESS_LO_SUM;
4509       info->base = XEXP (x, 0);
4510       info->offset = XEXP (x, 1);
4511       if (allow_reg_index_p
4512           && aarch64_base_register_rtx_p (info->base, strict_p))
4513         {
4514           rtx sym, offs;
4515           split_const (info->offset, &sym, &offs);
4516           if (GET_CODE (sym) == SYMBOL_REF
4517               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4518             {
4519               /* The symbol and offset must be aligned to the access size.  */
4520               unsigned int align;
4521               unsigned int ref_size;
4522
4523               if (CONSTANT_POOL_ADDRESS_P (sym))
4524                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4525               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4526                 {
4527                   tree exp = SYMBOL_REF_DECL (sym);
4528                   align = TYPE_ALIGN (TREE_TYPE (exp));
4529                   align = CONSTANT_ALIGNMENT (exp, align);
4530                 }
4531               else if (SYMBOL_REF_DECL (sym))
4532                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4533               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4534                        && SYMBOL_REF_BLOCK (sym) != NULL)
4535                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4536               else
4537                 align = BITS_PER_UNIT;
4538
4539               ref_size = GET_MODE_SIZE (mode);
4540               if (ref_size == 0)
4541                 ref_size = GET_MODE_SIZE (DImode);
4542
4543               return ((INTVAL (offs) & (ref_size - 1)) == 0
4544                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4545             }
4546         }
4547       return false;
4548
4549     default:
4550       return false;
4551     }
4552 }
4553
4554 bool
4555 aarch64_symbolic_address_p (rtx x)
4556 {
4557   rtx offset;
4558
4559   split_const (x, &x, &offset);
4560   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4561 }
4562
4563 /* Classify the base of symbolic expression X.  */
4564
4565 enum aarch64_symbol_type
4566 aarch64_classify_symbolic_expression (rtx x)
4567 {
4568   rtx offset;
4569
4570   split_const (x, &x, &offset);
4571   return aarch64_classify_symbol (x, offset);
4572 }
4573
4574
4575 /* Return TRUE if X is a legitimate address for accessing memory in
4576    mode MODE.  */
4577 static bool
4578 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4579 {
4580   struct aarch64_address_info addr;
4581
4582   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4583 }
4584
4585 /* Return TRUE if X is a legitimate address for accessing memory in
4586    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4587    pair operation.  */
4588 bool
4589 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4590                               RTX_CODE outer_code, bool strict_p)
4591 {
4592   struct aarch64_address_info addr;
4593
4594   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4595 }
4596
4597 /* Split an out-of-range address displacement into a base and offset.
4598    Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4599    to increase opportunities for sharing the base address of different sizes.
4600    For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
4601 static bool
4602 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4603 {
4604   HOST_WIDE_INT offset = INTVAL (*disp);
4605   HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4606
4607   if (mode == TImode || mode == TFmode
4608       || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4609     base = (offset + 0x100) & ~0x1ff;
4610
4611   *off = GEN_INT (base);
4612   *disp = GEN_INT (offset - base);
4613   return true;
4614 }
4615
4616 /* Return TRUE if rtx X is immediate constant 0.0 */
4617 bool
4618 aarch64_float_const_zero_rtx_p (rtx x)
4619 {
4620   if (GET_MODE (x) == VOIDmode)
4621     return false;
4622
4623   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4624     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4625   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4626 }
4627
4628 /* Return the fixed registers used for condition codes.  */
4629
4630 static bool
4631 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4632 {
4633   *p1 = CC_REGNUM;
4634   *p2 = INVALID_REGNUM;
4635   return true;
4636 }
4637
4638 /* Emit call insn with PAT and do aarch64-specific handling.  */
4639
4640 void
4641 aarch64_emit_call_insn (rtx pat)
4642 {
4643   rtx insn = emit_call_insn (pat);
4644
4645   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4646   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4647   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4648 }
4649
4650 machine_mode
4651 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4652 {
4653   /* All floating point compares return CCFP if it is an equality
4654      comparison, and CCFPE otherwise.  */
4655   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4656     {
4657       switch (code)
4658         {
4659         case EQ:
4660         case NE:
4661         case UNORDERED:
4662         case ORDERED:
4663         case UNLT:
4664         case UNLE:
4665         case UNGT:
4666         case UNGE:
4667         case UNEQ:
4668         case LTGT:
4669           return CCFPmode;
4670
4671         case LT:
4672         case LE:
4673         case GT:
4674         case GE:
4675           return CCFPEmode;
4676
4677         default:
4678           gcc_unreachable ();
4679         }
4680     }
4681
4682   /* Equality comparisons of short modes against zero can be performed
4683      using the TST instruction with the appropriate bitmask.  */
4684   if (y == const0_rtx && REG_P (x)
4685       && (code == EQ || code == NE)
4686       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4687     return CC_NZmode;
4688
4689   /* Similarly, comparisons of zero_extends from shorter modes can
4690      be performed using an ANDS with an immediate mask.  */
4691   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4692       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4693       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4694       && (code == EQ || code == NE))
4695     return CC_NZmode;
4696
4697   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4698       && y == const0_rtx
4699       && (code == EQ || code == NE || code == LT || code == GE)
4700       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4701           || GET_CODE (x) == NEG
4702           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4703               && CONST_INT_P (XEXP (x, 2)))))
4704     return CC_NZmode;
4705
4706   /* A compare with a shifted operand.  Because of canonicalization,
4707      the comparison will have to be swapped when we emit the assembly
4708      code.  */
4709   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4710       && (REG_P (y) || GET_CODE (y) == SUBREG)
4711       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4712           || GET_CODE (x) == LSHIFTRT
4713           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4714     return CC_SWPmode;
4715
4716   /* Similarly for a negated operand, but we can only do this for
4717      equalities.  */
4718   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4719       && (REG_P (y) || GET_CODE (y) == SUBREG)
4720       && (code == EQ || code == NE)
4721       && GET_CODE (x) == NEG)
4722     return CC_Zmode;
4723
4724   /* A test for unsigned overflow.  */
4725   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4726       && code == NE
4727       && GET_CODE (x) == PLUS
4728       && GET_CODE (y) == ZERO_EXTEND)
4729     return CC_Cmode;
4730
4731   /* For everything else, return CCmode.  */
4732   return CCmode;
4733 }
4734
4735 static int
4736 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4737
4738 int
4739 aarch64_get_condition_code (rtx x)
4740 {
4741   machine_mode mode = GET_MODE (XEXP (x, 0));
4742   enum rtx_code comp_code = GET_CODE (x);
4743
4744   if (GET_MODE_CLASS (mode) != MODE_CC)
4745     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4746   return aarch64_get_condition_code_1 (mode, comp_code);
4747 }
4748
4749 static int
4750 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4751 {
4752   switch (mode)
4753     {
4754     case CCFPmode:
4755     case CCFPEmode:
4756       switch (comp_code)
4757         {
4758         case GE: return AARCH64_GE;
4759         case GT: return AARCH64_GT;
4760         case LE: return AARCH64_LS;
4761         case LT: return AARCH64_MI;
4762         case NE: return AARCH64_NE;
4763         case EQ: return AARCH64_EQ;
4764         case ORDERED: return AARCH64_VC;
4765         case UNORDERED: return AARCH64_VS;
4766         case UNLT: return AARCH64_LT;
4767         case UNLE: return AARCH64_LE;
4768         case UNGT: return AARCH64_HI;
4769         case UNGE: return AARCH64_PL;
4770         default: return -1;
4771         }
4772       break;
4773
4774     case CCmode:
4775       switch (comp_code)
4776         {
4777         case NE: return AARCH64_NE;
4778         case EQ: return AARCH64_EQ;
4779         case GE: return AARCH64_GE;
4780         case GT: return AARCH64_GT;
4781         case LE: return AARCH64_LE;
4782         case LT: return AARCH64_LT;
4783         case GEU: return AARCH64_CS;
4784         case GTU: return AARCH64_HI;
4785         case LEU: return AARCH64_LS;
4786         case LTU: return AARCH64_CC;
4787         default: return -1;
4788         }
4789       break;
4790
4791     case CC_SWPmode:
4792       switch (comp_code)
4793         {
4794         case NE: return AARCH64_NE;
4795         case EQ: return AARCH64_EQ;
4796         case GE: return AARCH64_LE;
4797         case GT: return AARCH64_LT;
4798         case LE: return AARCH64_GE;
4799         case LT: return AARCH64_GT;
4800         case GEU: return AARCH64_LS;
4801         case GTU: return AARCH64_CC;
4802         case LEU: return AARCH64_CS;
4803         case LTU: return AARCH64_HI;
4804         default: return -1;
4805         }
4806       break;
4807
4808     case CC_NZmode:
4809       switch (comp_code)
4810         {
4811         case NE: return AARCH64_NE;
4812         case EQ: return AARCH64_EQ;
4813         case GE: return AARCH64_PL;
4814         case LT: return AARCH64_MI;
4815         default: return -1;
4816         }
4817       break;
4818
4819     case CC_Zmode:
4820       switch (comp_code)
4821         {
4822         case NE: return AARCH64_NE;
4823         case EQ: return AARCH64_EQ;
4824         default: return -1;
4825         }
4826       break;
4827
4828     case CC_Cmode:
4829       switch (comp_code)
4830         {
4831         case NE: return AARCH64_CS;
4832         case EQ: return AARCH64_CC;
4833         default: return -1;
4834         }
4835       break;
4836
4837     default:
4838       return -1;
4839     }
4840
4841   return -1;
4842 }
4843
4844 bool
4845 aarch64_const_vec_all_same_in_range_p (rtx x,
4846                                   HOST_WIDE_INT minval,
4847                                   HOST_WIDE_INT maxval)
4848 {
4849   HOST_WIDE_INT firstval;
4850   int count, i;
4851
4852   if (GET_CODE (x) != CONST_VECTOR
4853       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4854     return false;
4855
4856   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4857   if (firstval < minval || firstval > maxval)
4858     return false;
4859
4860   count = CONST_VECTOR_NUNITS (x);
4861   for (i = 1; i < count; i++)
4862     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4863       return false;
4864
4865   return true;
4866 }
4867
4868 bool
4869 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4870 {
4871   return aarch64_const_vec_all_same_in_range_p (x, val, val);
4872 }
4873
4874
/* Bit positions of the flags within a 4-bit N Z C V value: V is bit 0,
   C is bit 1, Z is bit 2 and N is bit 3.  */
#define AARCH64_CC_V (1 << 0)
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  Each entry is
   a flag setting under which the condition named beside it does not
   hold.  */
static const int aarch64_nzcv_codes[] =
{
  0,            /* EQ (Z == 1): Z clear.  */
  AARCH64_CC_Z, /* NE (Z == 0): Z set.  */
  0,            /* CS (C == 1): C clear.  */
  AARCH64_CC_C, /* CC (C == 0): C set.  */
  0,            /* MI (N == 1): N clear.  */
  AARCH64_CC_N, /* PL (N == 0): N set.  */
  0,            /* VS (V == 1): V clear.  */
  AARCH64_CC_V, /* VC (V == 0): V set.  */
  0,            /* HI (C == 1 && Z == 0): C clear.  */
  AARCH64_CC_C, /* LS (!(C == 1 && Z == 0)): C set, Z clear.  */
  AARCH64_CC_V, /* GE (N == V): V set, N clear.  */
  0,            /* LT (N != V): N and V both clear.  */
  AARCH64_CC_Z, /* GT (Z == 0 && N == V): Z set.  */
  0,            /* LE (!(Z == 0 && N == V)): all clear.  */
  0,            /* AL, any.  */
  0             /* NV, any.  */
};
4901
4902 static void
4903 aarch64_print_operand (FILE *f, rtx x, int code)
4904 {
4905   switch (code)
4906     {
4907     /* An integer or symbol address without a preceding # sign.  */
4908     case 'c':
4909       switch (GET_CODE (x))
4910         {
4911         case CONST_INT:
4912           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4913           break;
4914
4915         case SYMBOL_REF:
4916           output_addr_const (f, x);
4917           break;
4918
4919         case CONST:
4920           if (GET_CODE (XEXP (x, 0)) == PLUS
4921               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4922             {
4923               output_addr_const (f, x);
4924               break;
4925             }
4926           /* Fall through.  */
4927
4928         default:
4929           output_operand_lossage ("Unsupported operand for code '%c'", code);
4930         }
4931       break;
4932
4933     case 'e':
4934       /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w.  */
4935       {
4936         int n;
4937
4938         if (!CONST_INT_P (x)
4939             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4940           {
4941             output_operand_lossage ("invalid operand for '%%%c'", code);
4942             return;
4943           }
4944
4945         switch (n)
4946           {
4947           case 3:
4948             fputc ('b', f);
4949             break;
4950           case 4:
4951             fputc ('h', f);
4952             break;
4953           case 5:
4954             fputc ('w', f);
4955             break;
4956           default:
4957             output_operand_lossage ("invalid operand for '%%%c'", code);
4958             return;
4959           }
4960       }
4961       break;
4962
4963     case 'p':
4964       {
4965         int n;
4966
4967         /* Print N such that 2^N == X.  */
4968         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4969           {
4970             output_operand_lossage ("invalid operand for '%%%c'", code);
4971             return;
4972           }
4973
4974         asm_fprintf (f, "%d", n);
4975       }
4976       break;
4977
4978     case 'P':
4979       /* Print the number of non-zero bits in X (a const_int).  */
4980       if (!CONST_INT_P (x))
4981         {
4982           output_operand_lossage ("invalid operand for '%%%c'", code);
4983           return;
4984         }
4985
4986       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4987       break;
4988
4989     case 'H':
4990       /* Print the higher numbered register of a pair (TImode) of regs.  */
4991       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4992         {
4993           output_operand_lossage ("invalid operand for '%%%c'", code);
4994           return;
4995         }
4996
4997       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4998       break;
4999
5000     case 'M':
5001     case 'm':
5002       {
5003         int cond_code;
5004         /* Print a condition (eq, ne, etc) or its inverse.  */
5005
5006         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
5007         if (x == const_true_rtx)
5008           {
5009             if (code == 'M')
5010               fputs ("nv", f);
5011             return;
5012           }
5013
5014         if (!COMPARISON_P (x))
5015           {
5016             output_operand_lossage ("invalid operand for '%%%c'", code);
5017             return;
5018           }
5019
5020         cond_code = aarch64_get_condition_code (x);
5021         gcc_assert (cond_code >= 0);
5022         if (code == 'M')
5023           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5024         fputs (aarch64_condition_codes[cond_code], f);
5025       }
5026       break;
5027
5028     case 'b':
5029     case 'h':
5030     case 's':
5031     case 'd':
5032     case 'q':
5033       /* Print a scalar FP/SIMD register name.  */
5034       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5035         {
5036           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5037           return;
5038         }
5039       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5040       break;
5041
5042     case 'S':
5043     case 'T':
5044     case 'U':
5045     case 'V':
5046       /* Print the first FP/SIMD register name in a list.  */
5047       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5048         {
5049           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5050           return;
5051         }
5052       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5053       break;
5054
5055     case 'R':
5056       /* Print a scalar FP/SIMD register name + 1.  */
5057       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5058         {
5059           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5060           return;
5061         }
5062       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5063       break;
5064
5065     case 'X':
5066       /* Print bottom 16 bits of integer constant in hex.  */
5067       if (!CONST_INT_P (x))
5068         {
5069           output_operand_lossage ("invalid operand for '%%%c'", code);
5070           return;
5071         }
5072       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5073       break;
5074
5075     case 'w':
5076     case 'x':
5077       /* Print a general register name or the zero register (32-bit or
5078          64-bit).  */
5079       if (x == const0_rtx
5080           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5081         {
5082           asm_fprintf (f, "%czr", code);
5083           break;
5084         }
5085
5086       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5087         {
5088           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5089           break;
5090         }
5091
5092       if (REG_P (x) && REGNO (x) == SP_REGNUM)
5093         {
5094           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5095           break;
5096         }
5097
5098       /* Fall through */
5099
5100     case 0:
5101       /* Print a normal operand, if it's a general register, then we
5102          assume DImode.  */
5103       if (x == NULL)
5104         {
5105           output_operand_lossage ("missing operand");
5106           return;
5107         }
5108
5109       switch (GET_CODE (x))
5110         {
5111         case REG:
5112           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5113           break;
5114
5115         case MEM:
5116           output_address (GET_MODE (x), XEXP (x, 0));
5117           break;
5118
5119         case CONST:
5120         case LABEL_REF:
5121         case SYMBOL_REF:
5122           output_addr_const (asm_out_file, x);
5123           break;
5124
5125         case CONST_INT:
5126           asm_fprintf (f, "%wd", INTVAL (x));
5127           break;
5128
5129         case CONST_VECTOR:
5130           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5131             {
5132               gcc_assert (
5133                   aarch64_const_vec_all_same_in_range_p (x,
5134                                                          HOST_WIDE_INT_MIN,
5135                                                          HOST_WIDE_INT_MAX));
5136               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5137             }
5138           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5139             {
5140               fputc ('0', f);
5141             }
5142           else
5143             gcc_unreachable ();
5144           break;
5145
5146         case CONST_DOUBLE:
5147           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5148              be getting CONST_DOUBLEs holding integers.  */
5149           gcc_assert (GET_MODE (x) != VOIDmode);
5150           if (aarch64_float_const_zero_rtx_p (x))
5151             {
5152               fputc ('0', f);
5153               break;
5154             }
5155           else if (aarch64_float_const_representable_p (x))
5156             {
5157 #define buf_size 20
5158               char float_buf[buf_size] = {'\0'};
5159               real_to_decimal_for_mode (float_buf,
5160                                         CONST_DOUBLE_REAL_VALUE (x),
5161                                         buf_size, buf_size,
5162                                         1, GET_MODE (x));
5163               asm_fprintf (asm_out_file, "%s", float_buf);
5164               break;
5165 #undef buf_size
5166             }
5167           output_operand_lossage ("invalid constant");
5168           return;
5169         default:
5170           output_operand_lossage ("invalid operand");
5171           return;
5172         }
5173       break;
5174
5175     case 'A':
5176       if (GET_CODE (x) == HIGH)
5177         x = XEXP (x, 0);
5178
5179       switch (aarch64_classify_symbolic_expression (x))
5180         {
5181         case SYMBOL_SMALL_GOT_4G:
5182           asm_fprintf (asm_out_file, ":got:");
5183           break;
5184
5185         case SYMBOL_SMALL_TLSGD:
5186           asm_fprintf (asm_out_file, ":tlsgd:");
5187           break;
5188
5189         case SYMBOL_SMALL_TLSDESC:
5190           asm_fprintf (asm_out_file, ":tlsdesc:");
5191           break;
5192
5193         case SYMBOL_SMALL_TLSIE:
5194           asm_fprintf (asm_out_file, ":gottprel:");
5195           break;
5196
5197         case SYMBOL_TLSLE24:
5198           asm_fprintf (asm_out_file, ":tprel:");
5199           break;
5200
5201         case SYMBOL_TINY_GOT:
5202           gcc_unreachable ();
5203           break;
5204
5205         default:
5206           break;
5207         }
5208       output_addr_const (asm_out_file, x);
5209       break;
5210
5211     case 'L':
5212       switch (aarch64_classify_symbolic_expression (x))
5213         {
5214         case SYMBOL_SMALL_GOT_4G:
5215           asm_fprintf (asm_out_file, ":lo12:");
5216           break;
5217
5218         case SYMBOL_SMALL_TLSGD:
5219           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5220           break;
5221
5222         case SYMBOL_SMALL_TLSDESC:
5223           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5224           break;
5225
5226         case SYMBOL_SMALL_TLSIE:
5227           asm_fprintf (asm_out_file, ":gottprel_lo12:");
5228           break;
5229
5230         case SYMBOL_TLSLE12:
5231           asm_fprintf (asm_out_file, ":tprel_lo12:");
5232           break;
5233
5234         case SYMBOL_TLSLE24:
5235           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5236           break;
5237
5238         case SYMBOL_TINY_GOT:
5239           asm_fprintf (asm_out_file, ":got:");
5240           break;
5241
5242         case SYMBOL_TINY_TLSIE:
5243           asm_fprintf (asm_out_file, ":gottprel:");
5244           break;
5245
5246         default:
5247           break;
5248         }
5249       output_addr_const (asm_out_file, x);
5250       break;
5251
5252     case 'G':
5253
5254       switch (aarch64_classify_symbolic_expression (x))
5255         {
5256         case SYMBOL_TLSLE24:
5257           asm_fprintf (asm_out_file, ":tprel_hi12:");
5258           break;
5259         default:
5260           break;
5261         }
5262       output_addr_const (asm_out_file, x);
5263       break;
5264
5265     case 'k':
5266       {
5267         HOST_WIDE_INT cond_code;
5268         /* Print nzcv.  */
5269
5270         if (!CONST_INT_P (x))
5271           {
5272             output_operand_lossage ("invalid operand for '%%%c'", code);
5273             return;
5274           }
5275
5276         cond_code = INTVAL (x);
5277         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5278         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5279       }
5280       break;
5281
5282     default:
5283       output_operand_lossage ("invalid operand prefix '%%%c'", code);
5284       return;
5285     }
5286 }
5287
5288 static void
5289 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5290 {
5291   struct aarch64_address_info addr;
5292
5293   if (aarch64_classify_address (&addr, x, mode, MEM, true))
5294     switch (addr.type)
5295       {
5296       case ADDRESS_REG_IMM:
5297         if (addr.offset == const0_rtx)
5298           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5299         else
5300           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5301                        INTVAL (addr.offset));
5302         return;
5303
5304       case ADDRESS_REG_REG:
5305         if (addr.shift == 0)
5306           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5307                        reg_names [REGNO (addr.offset)]);
5308         else
5309           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5310                        reg_names [REGNO (addr.offset)], addr.shift);
5311         return;
5312
5313       case ADDRESS_REG_UXTW:
5314         if (addr.shift == 0)
5315           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5316                        REGNO (addr.offset) - R0_REGNUM);
5317         else
5318           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5319                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5320         return;
5321
5322       case ADDRESS_REG_SXTW:
5323         if (addr.shift == 0)
5324           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5325                        REGNO (addr.offset) - R0_REGNUM);
5326         else
5327           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5328                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5329         return;
5330
5331       case ADDRESS_REG_WB:
5332         switch (GET_CODE (x))
5333           {
5334           case PRE_INC:
5335             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5336                          GET_MODE_SIZE (mode));
5337             return;
5338           case POST_INC:
5339             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5340                          GET_MODE_SIZE (mode));
5341             return;
5342           case PRE_DEC:
5343             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5344                          GET_MODE_SIZE (mode));
5345             return;
5346           case POST_DEC:
5347             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5348                          GET_MODE_SIZE (mode));
5349             return;
5350           case PRE_MODIFY:
5351             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5352                          INTVAL (addr.offset));
5353             return;
5354           case POST_MODIFY:
5355             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5356                          INTVAL (addr.offset));
5357             return;
5358           default:
5359             break;
5360           }
5361         break;
5362
5363       case ADDRESS_LO_SUM:
5364         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5365         output_addr_const (f, addr.offset);
5366         asm_fprintf (f, "]");
5367         return;
5368
5369       case ADDRESS_SYMBOLIC:
5370         break;
5371       }
5372
5373   output_addr_const (f, x);
5374 }
5375
5376 bool
5377 aarch64_label_mentioned_p (rtx x)
5378 {
5379   const char *fmt;
5380   int i;
5381
5382   if (GET_CODE (x) == LABEL_REF)
5383     return true;
5384
5385   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5386      referencing instruction, but they are constant offsets, not
5387      symbols.  */
5388   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5389     return false;
5390
5391   fmt = GET_RTX_FORMAT (GET_CODE (x));
5392   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5393     {
5394       if (fmt[i] == 'E')
5395         {
5396           int j;
5397
5398           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5399             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5400               return 1;
5401         }
5402       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5403         return 1;
5404     }
5405
5406   return 0;
5407 }
5408
5409 /* Implement REGNO_REG_CLASS.  */
5410
5411 enum reg_class
5412 aarch64_regno_regclass (unsigned regno)
5413 {
5414   if (GP_REGNUM_P (regno))
5415     return GENERAL_REGS;
5416
5417   if (regno == SP_REGNUM)
5418     return STACK_REG;
5419
5420   if (regno == FRAME_POINTER_REGNUM
5421       || regno == ARG_POINTER_REGNUM)
5422     return POINTER_REGS;
5423
5424   if (FP_REGNUM_P (regno))
5425     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5426
5427   return NO_REGS;
5428 }
5429
5430 static rtx
5431 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5432 {
5433   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5434      where mask is selected by alignment and size of the offset.
5435      We try to pick as large a range for the offset as possible to
5436      maximize the chance of a CSE.  However, for aligned addresses
5437      we limit the range to 4k so that structures with different sized
5438      elements are likely to use the same base.  We need to be careful
5439      not to split a CONST for some forms of address expression, otherwise
5440      it will generate sub-optimal code.  */
5441
5442   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5443     {
5444       rtx base = XEXP (x, 0);
5445       rtx offset_rtx = XEXP (x, 1);
5446       HOST_WIDE_INT offset = INTVAL (offset_rtx);
5447
5448       if (GET_CODE (base) == PLUS)
5449         {
5450           rtx op0 = XEXP (base, 0);
5451           rtx op1 = XEXP (base, 1);
5452
5453           /* Force any scaling into a temp for CSE.  */
5454           op0 = force_reg (Pmode, op0);
5455           op1 = force_reg (Pmode, op1);
5456
5457           /* Let the pointer register be in op0.  */
5458           if (REG_POINTER (op1))
5459             std::swap (op0, op1);
5460
5461           /* If the pointer is virtual or frame related, then we know that
5462              virtual register instantiation or register elimination is going
5463              to apply a second constant.  We want the two constants folded
5464              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
5465           if (virt_or_elim_regno_p (REGNO (op0)))
5466             {
5467               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5468                                    NULL_RTX, true, OPTAB_DIRECT);
5469               return gen_rtx_PLUS (Pmode, base, op1);
5470             }
5471
5472           /* Otherwise, in order to encourage CSE (and thence loop strength
5473              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
5474           base = expand_binop (Pmode, add_optab, op0, op1,
5475                                NULL_RTX, true, OPTAB_DIRECT);
5476           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5477         }
5478
5479       /* Does it look like we'll need a 16-byte load/store-pair operation?  */
5480       HOST_WIDE_INT base_offset;
5481       if (GET_MODE_SIZE (mode) > 16)
5482         base_offset = (offset + 0x400) & ~0x7f0;
5483       /* For offsets aren't a multiple of the access size, the limit is
5484          -256...255.  */
5485       else if (offset & (GET_MODE_SIZE (mode) - 1))
5486         {
5487           base_offset = (offset + 0x100) & ~0x1ff;
5488
5489           /* BLKmode typically uses LDP of X-registers.  */
5490           if (mode == BLKmode)
5491             base_offset = (offset + 512) & ~0x3ff;
5492         }
5493       /* Small negative offsets are supported.  */
5494       else if (IN_RANGE (offset, -256, 0))
5495         base_offset = 0;
5496       else if (mode == TImode || mode == TFmode)
5497         base_offset = (offset + 0x100) & ~0x1ff;
5498       /* Use 12-bit offset by access size.  */
5499       else
5500         base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5501
5502       if (base_offset != 0)
5503         {
5504           base = plus_constant (Pmode, base, base_offset);
5505           base = force_operand (base, NULL_RTX);
5506           return plus_constant (Pmode, base, offset - base_offset);
5507         }
5508     }
5509
5510   return x;
5511 }
5512
5513 /* Return the reload icode required for a constant pool in mode.  */
5514 static enum insn_code
5515 aarch64_constant_pool_reload_icode (machine_mode mode)
5516 {
5517   switch (mode)
5518     {
5519     case SFmode:
5520       return CODE_FOR_aarch64_reload_movcpsfdi;
5521
5522     case DFmode:
5523       return CODE_FOR_aarch64_reload_movcpdfdi;
5524
5525     case TFmode:
5526       return CODE_FOR_aarch64_reload_movcptfdi;
5527
5528     case V8QImode:
5529       return CODE_FOR_aarch64_reload_movcpv8qidi;
5530
5531     case V16QImode:
5532       return CODE_FOR_aarch64_reload_movcpv16qidi;
5533
5534     case V4HImode:
5535       return CODE_FOR_aarch64_reload_movcpv4hidi;
5536
5537     case V8HImode:
5538       return CODE_FOR_aarch64_reload_movcpv8hidi;
5539
5540     case V2SImode:
5541       return CODE_FOR_aarch64_reload_movcpv2sidi;
5542
5543     case V4SImode:
5544       return CODE_FOR_aarch64_reload_movcpv4sidi;
5545
5546     case V2DImode:
5547       return CODE_FOR_aarch64_reload_movcpv2didi;
5548
5549     case V2DFmode:
5550       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5551
5552     default:
5553       gcc_unreachable ();
5554     }
5555
5556   gcc_unreachable ();
5557 }
5558 static reg_class_t
5559 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5560                           reg_class_t rclass,
5561                           machine_mode mode,
5562                           secondary_reload_info *sri)
5563 {
5564
5565   /* If we have to disable direct literal pool loads and stores because the
5566      function is too big, then we need a scratch register.  */
5567   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5568       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5569           || targetm.vector_mode_supported_p (GET_MODE (x)))
5570       && !aarch64_pcrelative_literal_loads)
5571     {
5572       sri->icode = aarch64_constant_pool_reload_icode (mode);
5573       return NO_REGS;
5574     }
5575
5576   /* Without the TARGET_SIMD instructions we cannot move a Q register
5577      to a Q register directly.  We need a scratch.  */
5578   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5579       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5580       && reg_class_subset_p (rclass, FP_REGS))
5581     {
5582       if (mode == TFmode)
5583         sri->icode = CODE_FOR_aarch64_reload_movtf;
5584       else if (mode == TImode)
5585         sri->icode = CODE_FOR_aarch64_reload_movti;
5586       return NO_REGS;
5587     }
5588
5589   /* A TFmode or TImode memory access should be handled via an FP_REGS
5590      because AArch64 has richer addressing modes for LDR/STR instructions
5591      than LDP/STP instructions.  */
5592   if (TARGET_FLOAT && rclass == GENERAL_REGS
5593       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5594     return FP_REGS;
5595
5596   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5597       return GENERAL_REGS;
5598
5599   return NO_REGS;
5600 }
5601
5602 static bool
5603 aarch64_can_eliminate (const int from, const int to)
5604 {
5605   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5606      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5607
5608   if (frame_pointer_needed)
5609     {
5610       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5611         return true;
5612       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5613         return false;
5614       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5615           && !cfun->calls_alloca)
5616         return true;
5617       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5618         return true;
5619
5620       return false;
5621     }
5622   else
5623     {
5624       /* If we decided that we didn't need a leaf frame pointer but then used
5625          LR in the function, then we'll want a frame pointer after all, so
5626          prevent this elimination to ensure a frame pointer is used.  */
5627       if (to == STACK_POINTER_REGNUM
5628           && flag_omit_leaf_frame_pointer
5629           && df_regs_ever_live_p (LR_REGNUM))
5630         return false;
5631     }
5632
5633   return true;
5634 }
5635
5636 HOST_WIDE_INT
5637 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5638 {
5639   aarch64_layout_frame ();
5640
5641   if (to == HARD_FRAME_POINTER_REGNUM)
5642     {
5643       if (from == ARG_POINTER_REGNUM)
5644         return cfun->machine->frame.hard_fp_offset;
5645
5646       if (from == FRAME_POINTER_REGNUM)
5647         return cfun->machine->frame.hard_fp_offset
5648                - cfun->machine->frame.locals_offset;
5649     }
5650
5651   if (to == STACK_POINTER_REGNUM)
5652     {
5653       if (from == FRAME_POINTER_REGNUM)
5654           return cfun->machine->frame.frame_size
5655                  - cfun->machine->frame.locals_offset;
5656     }
5657
5658   return cfun->machine->frame.frame_size;
5659 }
5660
5661 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5662    previous frame.  */
5663
5664 rtx
5665 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5666 {
5667   if (count != 0)
5668     return const0_rtx;
5669   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5670 }
5671
5672
5673 static void
5674 aarch64_asm_trampoline_template (FILE *f)
5675 {
5676   if (TARGET_ILP32)
5677     {
5678       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5679       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5680     }
5681   else
5682     {
5683       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5684       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5685     }
5686   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5687   assemble_aligned_integer (4, const0_rtx);
5688   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5689   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5690 }
5691
5692 static void
5693 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5694 {
5695   rtx fnaddr, mem, a_tramp;
5696   const int tramp_code_sz = 16;
5697
5698   /* Don't need to copy the trailing D-words, we fill those in below.  */
5699   emit_block_move (m_tramp, assemble_trampoline_template (),
5700                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5701   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5702   fnaddr = XEXP (DECL_RTL (fndecl), 0);
5703   if (GET_MODE (fnaddr) != ptr_mode)
5704     fnaddr = convert_memory_address (ptr_mode, fnaddr);
5705   emit_move_insn (mem, fnaddr);
5706
5707   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5708   emit_move_insn (mem, chain_value);
5709
5710   /* XXX We should really define a "clear_cache" pattern and use
5711      gen_clear_cache().  */
5712   a_tramp = XEXP (m_tramp, 0);
5713   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5714                      LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5715                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5716                      ptr_mode);
5717 }
5718
5719 static unsigned char
5720 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5721 {
5722   switch (regclass)
5723     {
5724     case CALLER_SAVE_REGS:
5725     case POINTER_REGS:
5726     case GENERAL_REGS:
5727     case ALL_REGS:
5728     case FP_REGS:
5729     case FP_LO_REGS:
5730       return
5731         aarch64_vector_mode_p (mode)
5732           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5733           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5734     case STACK_REG:
5735       return 1;
5736
5737     case NO_REGS:
5738       return 0;
5739
5740     default:
5741       break;
5742     }
5743   gcc_unreachable ();
5744 }
5745
5746 static reg_class_t
5747 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5748 {
5749   if (regclass == POINTER_REGS)
5750     return GENERAL_REGS;
5751
5752   if (regclass == STACK_REG)
5753     {
5754       if (REG_P(x)
5755           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5756           return regclass;
5757
5758       return NO_REGS;
5759     }
5760
5761   /* If it's an integer immediate that MOVI can't handle, then
5762      FP_REGS is not an option, so we return NO_REGS instead.  */
5763   if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5764       && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5765     return NO_REGS;
5766
5767   /* Register eliminiation can result in a request for
5768      SP+constant->FP_REGS.  We cannot support such operations which
5769      use SP as source and an FP_REG as destination, so reject out
5770      right now.  */
5771   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5772     {
5773       rtx lhs = XEXP (x, 0);
5774
5775       /* Look through a possible SUBREG introduced by ILP32.  */
5776       if (GET_CODE (lhs) == SUBREG)
5777         lhs = SUBREG_REG (lhs);
5778
5779       gcc_assert (REG_P (lhs));
5780       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5781                                       POINTER_REGS));
5782       return NO_REGS;
5783     }
5784
5785   return regclass;
5786 }
5787
5788 void
5789 aarch64_asm_output_labelref (FILE* f, const char *name)
5790 {
5791   asm_fprintf (f, "%U%s", name);
5792 }
5793
5794 static void
5795 aarch64_elf_asm_constructor (rtx symbol, int priority)
5796 {
5797   if (priority == DEFAULT_INIT_PRIORITY)
5798     default_ctor_section_asm_out_constructor (symbol, priority);
5799   else
5800     {
5801       section *s;
5802       /* While priority is known to be in range [0, 65535], so 18 bytes
5803          would be enough, the compiler might not know that.  To avoid
5804          -Wformat-truncation false positive, use a larger size.  */
5805       char buf[23];
5806       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5807       s = get_section (buf, SECTION_WRITE, NULL);
5808       switch_to_section (s);
5809       assemble_align (POINTER_SIZE);
5810       assemble_aligned_integer (POINTER_BYTES, symbol);
5811     }
5812 }
5813
5814 static void
5815 aarch64_elf_asm_destructor (rtx symbol, int priority)
5816 {
5817   if (priority == DEFAULT_INIT_PRIORITY)
5818     default_dtor_section_asm_out_destructor (symbol, priority);
5819   else
5820     {
5821       section *s;
5822       /* While priority is known to be in range [0, 65535], so 18 bytes
5823          would be enough, the compiler might not know that.  To avoid
5824          -Wformat-truncation false positive, use a larger size.  */
5825       char buf[23];
5826       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5827       s = get_section (buf, SECTION_WRITE, NULL);
5828       switch_to_section (s);
5829       assemble_align (POINTER_SIZE);
5830       assemble_aligned_integer (POINTER_BYTES, symbol);
5831     }
5832 }
5833
5834 const char*
5835 aarch64_output_casesi (rtx *operands)
5836 {
5837   char buf[100];
5838   char label[100];
5839   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5840   int index;
5841   static const char *const patterns[4][2] =
5842   {
5843     {
5844       "ldrb\t%w3, [%0,%w1,uxtw]",
5845       "add\t%3, %4, %w3, sxtb #2"
5846     },
5847     {
5848       "ldrh\t%w3, [%0,%w1,uxtw #1]",
5849       "add\t%3, %4, %w3, sxth #2"
5850     },
5851     {
5852       "ldr\t%w3, [%0,%w1,uxtw #2]",
5853       "add\t%3, %4, %w3, sxtw #2"
5854     },
5855     /* We assume that DImode is only generated when not optimizing and
5856        that we don't really need 64-bit address offsets.  That would
5857        imply an object file with 8GB of code in a single function!  */
5858     {
5859       "ldr\t%w3, [%0,%w1,uxtw #2]",
5860       "add\t%3, %4, %w3, sxtw #2"
5861     }
5862   };
5863
5864   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5865
5866   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5867
5868   gcc_assert (index >= 0 && index <= 3);
5869
5870   /* Need to implement table size reduction, by chaning the code below.  */
5871   output_asm_insn (patterns[index][0], operands);
5872   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5873   snprintf (buf, sizeof (buf),
5874             "adr\t%%4, %s", targetm.strip_name_encoding (label));
5875   output_asm_insn (buf, operands);
5876   output_asm_insn (patterns[index][1], operands);
5877   output_asm_insn ("br\t%3", operands);
5878   assemble_label (asm_out_file, label);
5879   return "";
5880 }
5881
5882
5883 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5884    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5885    operator.  */
5886
5887 int
5888 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5889 {
5890   if (shift >= 0 && shift <= 3)
5891     {
5892       int size;
5893       for (size = 8; size <= 32; size *= 2)
5894         {
5895           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5896           if (mask == bits << shift)
5897             return size;
5898         }
5899     }
5900   return 0;
5901 }
5902
5903 /* Constant pools are per function only when PC relative
5904    literal loads are true or we are in the large memory
5905    model.  */
5906
5907 static inline bool
5908 aarch64_can_use_per_function_literal_pools_p (void)
5909 {
5910   return (aarch64_pcrelative_literal_loads
5911           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5912 }
5913
5914 static bool
5915 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5916 {
5917   /* Fixme:: In an ideal world this would work similar
5918      to the logic in aarch64_select_rtx_section but this
5919      breaks bootstrap in gcc go.  For now we workaround
5920      this by returning false here.  */
5921   return false;
5922 }
5923
5924 /* Select appropriate section for constants depending
5925    on where we place literal pools.  */
5926
5927 static section *
5928 aarch64_select_rtx_section (machine_mode mode,
5929                             rtx x,
5930                             unsigned HOST_WIDE_INT align)
5931 {
5932   if (aarch64_can_use_per_function_literal_pools_p ())
5933     return function_section (current_function_decl);
5934
5935   return default_elf_select_rtx_section (mode, x, align);
5936 }
5937
5938 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
5939 void
5940 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5941                                   HOST_WIDE_INT offset)
5942 {
5943   /* When using per-function literal pools, we must ensure that any code
5944      section is aligned to the minimal instruction length, lest we get
5945      errors from the assembler re "unaligned instructions".  */
5946   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5947     ASM_OUTPUT_ALIGN (f, 2);
5948 }
5949
5950 /* Costs.  */
5951
5952 /* Helper function for rtx cost calculation.  Strip a shift expression
5953    from X.  Returns the inner operand if successful, or the original
5954    expression on failure.  */
5955 static rtx
5956 aarch64_strip_shift (rtx x)
5957 {
5958   rtx op = x;
5959
5960   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5961      we can convert both to ROR during final output.  */
5962   if ((GET_CODE (op) == ASHIFT
5963        || GET_CODE (op) == ASHIFTRT
5964        || GET_CODE (op) == LSHIFTRT
5965        || GET_CODE (op) == ROTATERT
5966        || GET_CODE (op) == ROTATE)
5967       && CONST_INT_P (XEXP (op, 1)))
5968     return XEXP (op, 0);
5969
5970   if (GET_CODE (op) == MULT
5971       && CONST_INT_P (XEXP (op, 1))
5972       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5973     return XEXP (op, 0);
5974
5975   return x;
5976 }
5977
5978 /* Helper function for rtx cost calculation.  Strip an extend
5979    expression from X.  Returns the inner operand if successful, or the
5980    original expression on failure.  We deal with a number of possible
5981    canonicalization variations here.  */
5982 static rtx
5983 aarch64_strip_extend (rtx x)
5984 {
5985   rtx op = x;
5986
5987   /* Zero and sign extraction of a widened value.  */
5988   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5989       && XEXP (op, 2) == const0_rtx
5990       && GET_CODE (XEXP (op, 0)) == MULT
5991       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5992                                          XEXP (op, 1)))
5993     return XEXP (XEXP (op, 0), 0);
5994
5995   /* It can also be represented (for zero-extend) as an AND with an
5996      immediate.  */
5997   if (GET_CODE (op) == AND
5998       && GET_CODE (XEXP (op, 0)) == MULT
5999       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6000       && CONST_INT_P (XEXP (op, 1))
6001       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6002                            INTVAL (XEXP (op, 1))) != 0)
6003     return XEXP (XEXP (op, 0), 0);
6004
6005   /* Now handle extended register, as this may also have an optional
6006      left shift by 1..4.  */
6007   if (GET_CODE (op) == ASHIFT
6008       && CONST_INT_P (XEXP (op, 1))
6009       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6010     op = XEXP (op, 0);
6011
6012   if (GET_CODE (op) == ZERO_EXTEND
6013       || GET_CODE (op) == SIGN_EXTEND)
6014     op = XEXP (op, 0);
6015
6016   if (op != x)
6017     return op;
6018
6019   return x;
6020 }
6021
6022 /* Return true iff CODE is a shift supported in combination
6023    with arithmetic instructions.  */
6024
6025 static bool
6026 aarch64_shift_p (enum rtx_code code)
6027 {
6028   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6029 }
6030
6031 /* Helper function for rtx cost calculation.  Calculate the cost of
6032    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6033    Return the calculated cost of the expression, recursing manually in to
6034    operands where needed.  */
6035
6036 static int
6037 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6038 {
6039   rtx op0, op1;
6040   const struct cpu_cost_table *extra_cost
6041     = aarch64_tune_params.insn_extra_cost;
6042   int cost = 0;
6043   bool compound_p = (outer == PLUS || outer == MINUS);
6044   machine_mode mode = GET_MODE (x);
6045
6046   gcc_checking_assert (code == MULT);
6047
6048   op0 = XEXP (x, 0);
6049   op1 = XEXP (x, 1);
6050
6051   if (VECTOR_MODE_P (mode))
6052     mode = GET_MODE_INNER (mode);
6053
6054   /* Integer multiply/fma.  */
6055   if (GET_MODE_CLASS (mode) == MODE_INT)
6056     {
6057       /* The multiply will be canonicalized as a shift, cost it as such.  */
6058       if (aarch64_shift_p (GET_CODE (x))
6059           || (CONST_INT_P (op1)
6060               && exact_log2 (INTVAL (op1)) > 0))
6061         {
6062           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6063                            || GET_CODE (op0) == SIGN_EXTEND;
6064           if (speed)
6065             {
6066               if (compound_p)
6067                 {
6068                   if (REG_P (op1))
6069                     /* ARITH + shift-by-register.  */
6070                     cost += extra_cost->alu.arith_shift_reg;
6071                   else if (is_extend)
6072                     /* ARITH + extended register.  We don't have a cost field
6073                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
6074                     cost += extra_cost->alu.extend_arith;
6075                   else
6076                     /* ARITH + shift-by-immediate.  */
6077                     cost += extra_cost->alu.arith_shift;
6078                 }
6079               else
6080                 /* LSL (immediate).  */
6081                 cost += extra_cost->alu.shift;
6082
6083             }
6084           /* Strip extends as we will have costed them in the case above.  */
6085           if (is_extend)
6086             op0 = aarch64_strip_extend (op0);
6087
6088           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6089
6090           return cost;
6091         }
6092
6093       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
6094          compound and let the below cases handle it.  After all, MNEG is a
6095          special-case alias of MSUB.  */
6096       if (GET_CODE (op0) == NEG)
6097         {
6098           op0 = XEXP (op0, 0);
6099           compound_p = true;
6100         }
6101
6102       /* Integer multiplies or FMAs have zero/sign extending variants.  */
6103       if ((GET_CODE (op0) == ZERO_EXTEND
6104            && GET_CODE (op1) == ZERO_EXTEND)
6105           || (GET_CODE (op0) == SIGN_EXTEND
6106               && GET_CODE (op1) == SIGN_EXTEND))
6107         {
6108           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6109           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6110
6111           if (speed)
6112             {
6113               if (compound_p)
6114                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
6115                 cost += extra_cost->mult[0].extend_add;
6116               else
6117                 /* MUL/SMULL/UMULL.  */
6118                 cost += extra_cost->mult[0].extend;
6119             }
6120
6121           return cost;
6122         }
6123
6124       /* This is either an integer multiply or a MADD.  In both cases
6125          we want to recurse and cost the operands.  */
6126       cost += rtx_cost (op0, mode, MULT, 0, speed);
6127       cost += rtx_cost (op1, mode, MULT, 1, speed);
6128
6129       if (speed)
6130         {
6131           if (compound_p)
6132             /* MADD/MSUB.  */
6133             cost += extra_cost->mult[mode == DImode].add;
6134           else
6135             /* MUL.  */
6136             cost += extra_cost->mult[mode == DImode].simple;
6137         }
6138
6139       return cost;
6140     }
6141   else
6142     {
6143       if (speed)
6144         {
6145           /* Floating-point FMA/FMUL can also support negations of the
6146              operands, unless the rounding mode is upward or downward in
6147              which case FNMUL is different than FMUL with operand negation.  */
6148           bool neg0 = GET_CODE (op0) == NEG;
6149           bool neg1 = GET_CODE (op1) == NEG;
6150           if (compound_p || !flag_rounding_math || (neg0 && neg1))
6151             {
6152               if (neg0)
6153                 op0 = XEXP (op0, 0);
6154               if (neg1)
6155                 op1 = XEXP (op1, 0);
6156             }
6157
6158           if (compound_p)
6159             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
6160             cost += extra_cost->fp[mode == DFmode].fma;
6161           else
6162             /* FMUL/FNMUL.  */
6163             cost += extra_cost->fp[mode == DFmode].mult;
6164         }
6165
6166       cost += rtx_cost (op0, mode, MULT, 0, speed);
6167       cost += rtx_cost (op1, mode, MULT, 1, speed);
6168       return cost;
6169     }
6170 }
6171
6172 static int
6173 aarch64_address_cost (rtx x,
6174                       machine_mode mode,
6175                       addr_space_t as ATTRIBUTE_UNUSED,
6176                       bool speed)
6177 {
6178   enum rtx_code c = GET_CODE (x);
6179   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6180   struct aarch64_address_info info;
6181   int cost = 0;
6182   info.shift = 0;
6183
6184   if (!aarch64_classify_address (&info, x, mode, c, false))
6185     {
6186       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6187         {
6188           /* This is a CONST or SYMBOL ref which will be split
6189              in a different way depending on the code model in use.
6190              Cost it through the generic infrastructure.  */
6191           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6192           /* Divide through by the cost of one instruction to
6193              bring it to the same units as the address costs.  */
6194           cost_symbol_ref /= COSTS_N_INSNS (1);
6195           /* The cost is then the cost of preparing the address,
6196              followed by an immediate (possibly 0) offset.  */
6197           return cost_symbol_ref + addr_cost->imm_offset;
6198         }
6199       else
6200         {
6201           /* This is most likely a jump table from a case
6202              statement.  */
6203           return addr_cost->register_offset;
6204         }
6205     }
6206
6207   switch (info.type)
6208     {
6209       case ADDRESS_LO_SUM:
6210       case ADDRESS_SYMBOLIC:
6211       case ADDRESS_REG_IMM:
6212         cost += addr_cost->imm_offset;
6213         break;
6214
6215       case ADDRESS_REG_WB:
6216         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6217           cost += addr_cost->pre_modify;
6218         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6219           cost += addr_cost->post_modify;
6220         else
6221           gcc_unreachable ();
6222
6223         break;
6224
6225       case ADDRESS_REG_REG:
6226         cost += addr_cost->register_offset;
6227         break;
6228
6229       case ADDRESS_REG_SXTW:
6230         cost += addr_cost->register_sextend;
6231         break;
6232
6233       case ADDRESS_REG_UXTW:
6234         cost += addr_cost->register_zextend;
6235         break;
6236
6237       default:
6238         gcc_unreachable ();
6239     }
6240
6241
6242   if (info.shift > 0)
6243     {
6244       /* For the sake of calculating the cost of the shifted register
6245          component, we can treat same sized modes in the same way.  */
6246       switch (GET_MODE_BITSIZE (mode))
6247         {
6248           case 16:
6249             cost += addr_cost->addr_scale_costs.hi;
6250             break;
6251
6252           case 32:
6253             cost += addr_cost->addr_scale_costs.si;
6254             break;
6255
6256           case 64:
6257             cost += addr_cost->addr_scale_costs.di;
6258             break;
6259
6260           /* We can't tell, or this is a 128-bit vector.  */
6261           default:
6262             cost += addr_cost->addr_scale_costs.ti;
6263             break;
6264         }
6265     }
6266
6267   return cost;
6268 }
6269
6270 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
6271    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
6272    to be taken.  */
6273
6274 int
6275 aarch64_branch_cost (bool speed_p, bool predictable_p)
6276 {
6277   /* When optimizing for speed, use the cost of unpredictable branches.  */
6278   const struct cpu_branch_cost *branch_costs =
6279     aarch64_tune_params.branch_costs;
6280
6281   if (!speed_p || predictable_p)
6282     return branch_costs->predictable;
6283   else
6284     return branch_costs->unpredictable;
6285 }
6286
6287 /* Return true if the RTX X in mode MODE is a zero or sign extract
6288    usable in an ADD or SUB (extended register) instruction.  */
6289 static bool
6290 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6291 {
6292   /* Catch add with a sign extract.
6293      This is add_<optab><mode>_multp2.  */
6294   if (GET_CODE (x) == SIGN_EXTRACT
6295       || GET_CODE (x) == ZERO_EXTRACT)
6296     {
6297       rtx op0 = XEXP (x, 0);
6298       rtx op1 = XEXP (x, 1);
6299       rtx op2 = XEXP (x, 2);
6300
6301       if (GET_CODE (op0) == MULT
6302           && CONST_INT_P (op1)
6303           && op2 == const0_rtx
6304           && CONST_INT_P (XEXP (op0, 1))
6305           && aarch64_is_extend_from_extract (mode,
6306                                              XEXP (op0, 1),
6307                                              op1))
6308         {
6309           return true;
6310         }
6311     }
6312   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6313      No shift.  */
6314   else if (GET_CODE (x) == SIGN_EXTEND
6315            || GET_CODE (x) == ZERO_EXTEND)
6316     return REG_P (XEXP (x, 0));
6317
6318   return false;
6319 }
6320
6321 static bool
6322 aarch64_frint_unspec_p (unsigned int u)
6323 {
6324   switch (u)
6325     {
6326       case UNSPEC_FRINTZ:
6327       case UNSPEC_FRINTP:
6328       case UNSPEC_FRINTM:
6329       case UNSPEC_FRINTA:
6330       case UNSPEC_FRINTN:
6331       case UNSPEC_FRINTX:
6332       case UNSPEC_FRINTI:
6333         return true;
6334
6335       default:
6336         return false;
6337     }
6338 }
6339
6340 /* Return true iff X is an rtx that will match an extr instruction
6341    i.e. as described in the *extr<mode>5_insn family of patterns.
6342    OP0 and OP1 will be set to the operands of the shifts involved
6343    on success and will be NULL_RTX otherwise.  */
6344
6345 static bool
6346 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6347 {
6348   rtx op0, op1;
6349   machine_mode mode = GET_MODE (x);
6350
6351   *res_op0 = NULL_RTX;
6352   *res_op1 = NULL_RTX;
6353
6354   if (GET_CODE (x) != IOR)
6355     return false;
6356
6357   op0 = XEXP (x, 0);
6358   op1 = XEXP (x, 1);
6359
6360   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6361       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6362     {
6363      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6364       if (GET_CODE (op1) == ASHIFT)
6365         std::swap (op0, op1);
6366
6367       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6368         return false;
6369
6370       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6371       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6372
6373       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6374           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6375         {
6376           *res_op0 = XEXP (op0, 0);
6377           *res_op1 = XEXP (op1, 0);
6378           return true;
6379         }
6380     }
6381
6382   return false;
6383 }
6384
6385 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6386    storing it in *COST.  Result is true if the total cost of the operation
6387    has now been calculated.  */
6388 static bool
6389 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6390 {
6391   rtx inner;
6392   rtx comparator;
6393   enum rtx_code cmpcode;
6394
6395   if (COMPARISON_P (op0))
6396     {
6397       inner = XEXP (op0, 0);
6398       comparator = XEXP (op0, 1);
6399       cmpcode = GET_CODE (op0);
6400     }
6401   else
6402     {
6403       inner = op0;
6404       comparator = const0_rtx;
6405       cmpcode = NE;
6406     }
6407
6408   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6409     {
6410       /* Conditional branch.  */
6411       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6412         return true;
6413       else
6414         {
6415           if (cmpcode == NE || cmpcode == EQ)
6416             {
6417               if (comparator == const0_rtx)
6418                 {
6419                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6420                   if (GET_CODE (inner) == ZERO_EXTRACT)
6421                     /* TBZ/TBNZ.  */
6422                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6423                                        ZERO_EXTRACT, 0, speed);
6424                   else
6425                     /* CBZ/CBNZ.  */
6426                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6427
6428                 return true;
6429               }
6430             }
6431           else if (cmpcode == LT || cmpcode == GE)
6432             {
6433               /* TBZ/TBNZ.  */
6434               if (comparator == const0_rtx)
6435                 return true;
6436             }
6437         }
6438     }
6439   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6440     {
6441       /* CCMP.  */
6442       if (GET_CODE (op1) == COMPARE)
6443         {
6444           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
6445           if (XEXP (op1, 1) == const0_rtx)
6446             *cost += 1;
6447           if (speed)
6448             {
6449               machine_mode mode = GET_MODE (XEXP (op1, 0));
6450               const struct cpu_cost_table *extra_cost
6451                 = aarch64_tune_params.insn_extra_cost;
6452
6453               if (GET_MODE_CLASS (mode) == MODE_INT)
6454                 *cost += extra_cost->alu.arith;
6455               else
6456                 *cost += extra_cost->fp[mode == DFmode].compare;
6457             }
6458           return true;
6459         }
6460
6461       /* It's a conditional operation based on the status flags,
6462          so it must be some flavor of CSEL.  */
6463
6464       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6465       if (GET_CODE (op1) == NEG
6466           || GET_CODE (op1) == NOT
6467           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6468         op1 = XEXP (op1, 0);
6469       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6470         {
6471           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6472           op1 = XEXP (op1, 0);
6473           op2 = XEXP (op2, 0);
6474         }
6475
6476       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6477       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6478       return true;
6479     }
6480
6481   /* We don't know what this is, cost all operands.  */
6482   return false;
6483 }
6484
6485 /* Check whether X is a bitfield operation of the form shift + extend that
6486    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6487    operand to which the bitfield operation is applied.  Otherwise return
6488    NULL_RTX.  */
6489
6490 static rtx
6491 aarch64_extend_bitfield_pattern_p (rtx x)
6492 {
6493   rtx_code outer_code = GET_CODE (x);
6494   machine_mode outer_mode = GET_MODE (x);
6495
6496   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6497       && outer_mode != SImode && outer_mode != DImode)
6498     return NULL_RTX;
6499
6500   rtx inner = XEXP (x, 0);
6501   rtx_code inner_code = GET_CODE (inner);
6502   machine_mode inner_mode = GET_MODE (inner);
6503   rtx op = NULL_RTX;
6504
6505   switch (inner_code)
6506     {
6507       case ASHIFT:
6508         if (CONST_INT_P (XEXP (inner, 1))
6509             && (inner_mode == QImode || inner_mode == HImode))
6510           op = XEXP (inner, 0);
6511         break;
6512       case LSHIFTRT:
6513         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6514             && (inner_mode == QImode || inner_mode == HImode))
6515           op = XEXP (inner, 0);
6516         break;
6517       case ASHIFTRT:
6518         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6519             && (inner_mode == QImode || inner_mode == HImode))
6520           op = XEXP (inner, 0);
6521         break;
6522       default:
6523         break;
6524     }
6525
6526   return op;
6527 }
6528
6529 /* Return true if the mask and a shift amount from an RTX of the form
6530    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6531    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
6532
6533 bool
6534 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6535 {
6536   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6537          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6538          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6539          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6540 }
6541
6542 /* Calculate the cost of calculating X, storing it in *COST.  Result
6543    is true if the total cost of the operation has now been calculated.  */
6544 static bool
6545 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6546                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6547 {
6548   rtx op0, op1, op2;
6549   const struct cpu_cost_table *extra_cost
6550     = aarch64_tune_params.insn_extra_cost;
6551   int code = GET_CODE (x);
6552
6553   /* By default, assume that everything has equivalent cost to the
6554      cheapest instruction.  Any additional costs are applied as a delta
6555      above this default.  */
6556   *cost = COSTS_N_INSNS (1);
6557
6558   switch (code)
6559     {
6560     case SET:
6561       /* The cost depends entirely on the operands to SET.  */
6562       *cost = 0;
6563       op0 = SET_DEST (x);
6564       op1 = SET_SRC (x);
6565
6566       switch (GET_CODE (op0))
6567         {
6568         case MEM:
6569           if (speed)
6570             {
6571               rtx address = XEXP (op0, 0);
6572               if (VECTOR_MODE_P (mode))
6573                 *cost += extra_cost->ldst.storev;
6574               else if (GET_MODE_CLASS (mode) == MODE_INT)
6575                 *cost += extra_cost->ldst.store;
6576               else if (mode == SFmode)
6577                 *cost += extra_cost->ldst.storef;
6578               else if (mode == DFmode)
6579                 *cost += extra_cost->ldst.stored;
6580
6581               *cost +=
6582                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6583                                                      0, speed));
6584             }
6585
6586           *cost += rtx_cost (op1, mode, SET, 1, speed);
6587           return true;
6588
6589         case SUBREG:
6590           if (! REG_P (SUBREG_REG (op0)))
6591             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6592
6593           /* Fall through.  */
6594         case REG:
6595           /* The cost is one per vector-register copied.  */
6596           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6597             {
6598               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6599                               / GET_MODE_SIZE (V4SImode);
6600               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6601             }
6602           /* const0_rtx is in general free, but we will use an
6603              instruction to set a register to 0.  */
6604           else if (REG_P (op1) || op1 == const0_rtx)
6605             {
6606               /* The cost is 1 per register copied.  */
6607               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6608                               / UNITS_PER_WORD;
6609               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6610             }
6611           else
6612             /* Cost is just the cost of the RHS of the set.  */
6613             *cost += rtx_cost (op1, mode, SET, 1, speed);
6614           return true;
6615
6616         case ZERO_EXTRACT:
6617         case SIGN_EXTRACT:
6618           /* Bit-field insertion.  Strip any redundant widening of
6619              the RHS to meet the width of the target.  */
6620           if (GET_CODE (op1) == SUBREG)
6621             op1 = SUBREG_REG (op1);
6622           if ((GET_CODE (op1) == ZERO_EXTEND
6623                || GET_CODE (op1) == SIGN_EXTEND)
6624               && CONST_INT_P (XEXP (op0, 1))
6625               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6626                   >= INTVAL (XEXP (op0, 1))))
6627             op1 = XEXP (op1, 0);
6628
6629           if (CONST_INT_P (op1))
6630             {
6631               /* MOV immediate is assumed to always be cheap.  */
6632               *cost = COSTS_N_INSNS (1);
6633             }
6634           else
6635             {
6636               /* BFM.  */
6637               if (speed)
6638                 *cost += extra_cost->alu.bfi;
6639               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6640             }
6641
6642           return true;
6643
6644         default:
6645           /* We can't make sense of this, assume default cost.  */
6646           *cost = COSTS_N_INSNS (1);
6647           return false;
6648         }
6649       return false;
6650
6651     case CONST_INT:
6652       /* If an instruction can incorporate a constant within the
6653          instruction, the instruction's expression avoids calling
6654          rtx_cost() on the constant.  If rtx_cost() is called on a
6655          constant, then it is usually because the constant must be
6656          moved into a register by one or more instructions.
6657
6658          The exception is constant 0, which can be expressed
6659          as XZR/WZR and is therefore free.  The exception to this is
6660          if we have (set (reg) (const0_rtx)) in which case we must cost
6661          the move.  However, we can catch that when we cost the SET, so
6662          we don't need to consider that here.  */
6663       if (x == const0_rtx)
6664         *cost = 0;
6665       else
6666         {
6667           /* To an approximation, building any other constant is
6668              proportionally expensive to the number of instructions
6669              required to build that constant.  This is true whether we
6670              are compiling for SPEED or otherwise.  */
6671           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6672                                  (NULL_RTX, x, false, mode));
6673         }
6674       return true;
6675
6676     case CONST_DOUBLE:
6677       if (speed)
6678         {
6679           /* mov[df,sf]_aarch64.  */
6680           if (aarch64_float_const_representable_p (x))
6681             /* FMOV (scalar immediate).  */
6682             *cost += extra_cost->fp[mode == DFmode].fpconst;
6683           else if (!aarch64_float_const_zero_rtx_p (x))
6684             {
6685               /* This will be a load from memory.  */
6686               if (mode == DFmode)
6687                 *cost += extra_cost->ldst.loadd;
6688               else
6689                 *cost += extra_cost->ldst.loadf;
6690             }
6691           else
6692             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
6693                or MOV v0.s[0], wzr - neither of which are modeled by the
6694                cost tables.  Just use the default cost.  */
6695             {
6696             }
6697         }
6698
6699       return true;
6700
6701     case MEM:
6702       if (speed)
6703         {
6704           /* For loads we want the base cost of a load, plus an
6705              approximation for the additional cost of the addressing
6706              mode.  */
6707           rtx address = XEXP (x, 0);
6708           if (VECTOR_MODE_P (mode))
6709             *cost += extra_cost->ldst.loadv;
6710           else if (GET_MODE_CLASS (mode) == MODE_INT)
6711             *cost += extra_cost->ldst.load;
6712           else if (mode == SFmode)
6713             *cost += extra_cost->ldst.loadf;
6714           else if (mode == DFmode)
6715             *cost += extra_cost->ldst.loadd;
6716
6717           *cost +=
6718                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6719                                                      0, speed));
6720         }
6721
6722       return true;
6723
6724     case NEG:
6725       op0 = XEXP (x, 0);
6726
6727       if (VECTOR_MODE_P (mode))
6728         {
6729           if (speed)
6730             {
6731               /* FNEG.  */
6732               *cost += extra_cost->vect.alu;
6733             }
6734           return false;
6735         }
6736
6737       if (GET_MODE_CLASS (mode) == MODE_INT)
6738         {
6739           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6740               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6741             {
6742               /* CSETM.  */
6743               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6744               return true;
6745             }
6746
6747           /* Cost this as SUB wzr, X.  */
6748           op0 = CONST0_RTX (mode);
6749           op1 = XEXP (x, 0);
6750           goto cost_minus;
6751         }
6752
6753       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6754         {
6755           /* Support (neg(fma...)) as a single instruction only if
6756              sign of zeros is unimportant.  This matches the decision
6757              making in aarch64.md.  */
6758           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6759             {
6760               /* FNMADD.  */
6761               *cost = rtx_cost (op0, mode, NEG, 0, speed);
6762               return true;
6763             }
6764           if (GET_CODE (op0) == MULT)
6765             {
6766               /* FNMUL.  */
6767               *cost = rtx_cost (op0, mode, NEG, 0, speed);
6768               return true;
6769             }
6770           if (speed)
6771             /* FNEG.  */
6772             *cost += extra_cost->fp[mode == DFmode].neg;
6773           return false;
6774         }
6775
6776       return false;
6777
6778     case CLRSB:
6779     case CLZ:
6780       if (speed)
6781         {
6782           if (VECTOR_MODE_P (mode))
6783             *cost += extra_cost->vect.alu;
6784           else
6785             *cost += extra_cost->alu.clz;
6786         }
6787
6788       return false;
6789
6790     case COMPARE:
6791       op0 = XEXP (x, 0);
6792       op1 = XEXP (x, 1);
6793
6794       if (op1 == const0_rtx
6795           && GET_CODE (op0) == AND)
6796         {
6797           x = op0;
6798           mode = GET_MODE (op0);
6799           goto cost_logic;
6800         }
6801
6802       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6803         {
6804           /* TODO: A write to the CC flags possibly costs extra, this
6805              needs encoding in the cost tables.  */
6806
6807           mode = GET_MODE (op0);
6808           /* ANDS.  */
6809           if (GET_CODE (op0) == AND)
6810             {
6811               x = op0;
6812               goto cost_logic;
6813             }
6814
6815           if (GET_CODE (op0) == PLUS)
6816             {
6817               /* ADDS (and CMN alias).  */
6818               x = op0;
6819               goto cost_plus;
6820             }
6821
6822           if (GET_CODE (op0) == MINUS)
6823             {
6824               /* SUBS.  */
6825               x = op0;
6826               goto cost_minus;
6827             }
6828
6829           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6830               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6831               && CONST_INT_P (XEXP (op0, 2)))
6832             {
6833               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6834                  Handle it here directly rather than going to cost_logic
6835                  since we know the immediate generated for the TST is valid
6836                  so we can avoid creating an intermediate rtx for it only
6837                  for costing purposes.  */
6838               if (speed)
6839                 *cost += extra_cost->alu.logical;
6840
6841               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6842                                  ZERO_EXTRACT, 0, speed);
6843               return true;
6844             }
6845
6846           if (GET_CODE (op1) == NEG)
6847             {
6848               /* CMN.  */
6849               if (speed)
6850                 *cost += extra_cost->alu.arith;
6851
6852               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6853               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6854               return true;
6855             }
6856
6857           /* CMP.
6858
6859              Compare can freely swap the order of operands, and
6860              canonicalization puts the more complex operation first.
6861              But the integer MINUS logic expects the shift/extend
6862              operation in op1.  */
6863           if (! (REG_P (op0)
6864                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6865           {
6866             op0 = XEXP (x, 1);
6867             op1 = XEXP (x, 0);
6868           }
6869           goto cost_minus;
6870         }
6871
6872       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6873         {
6874           /* FCMP.  */
6875           if (speed)
6876             *cost += extra_cost->fp[mode == DFmode].compare;
6877
6878           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6879             {
6880               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6881               /* FCMP supports constant 0.0 for no extra cost. */
6882               return true;
6883             }
6884           return false;
6885         }
6886
6887       if (VECTOR_MODE_P (mode))
6888         {
6889           /* Vector compare.  */
6890           if (speed)
6891             *cost += extra_cost->vect.alu;
6892
6893           if (aarch64_float_const_zero_rtx_p (op1))
6894             {
6895               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6896                  cost.  */
6897               return true;
6898             }
6899           return false;
6900         }
6901       return false;
6902
6903     case MINUS:
6904       {
6905         op0 = XEXP (x, 0);
6906         op1 = XEXP (x, 1);
6907
6908 cost_minus:
6909         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6910
6911         /* Detect valid immediates.  */
6912         if ((GET_MODE_CLASS (mode) == MODE_INT
6913              || (GET_MODE_CLASS (mode) == MODE_CC
6914                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6915             && CONST_INT_P (op1)
6916             && aarch64_uimm12_shift (INTVAL (op1)))
6917           {
6918             if (speed)
6919               /* SUB(S) (immediate).  */
6920               *cost += extra_cost->alu.arith;
6921             return true;
6922           }
6923
6924         /* Look for SUB (extended register).  */
6925         if (aarch64_rtx_arith_op_extract_p (op1, mode))
6926           {
6927             if (speed)
6928               *cost += extra_cost->alu.extend_arith;
6929
6930             op1 = aarch64_strip_extend (op1);
6931             *cost += rtx_cost (op1, VOIDmode,
6932                                (enum rtx_code) GET_CODE (op1), 0, speed);
6933             return true;
6934           }
6935
6936         rtx new_op1 = aarch64_strip_extend (op1);
6937
6938         /* Cost this as an FMA-alike operation.  */
6939         if ((GET_CODE (new_op1) == MULT
6940              || aarch64_shift_p (GET_CODE (new_op1)))
6941             && code != COMPARE)
6942           {
6943             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6944                                             (enum rtx_code) code,
6945                                             speed);
6946             return true;
6947           }
6948
6949         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6950
6951         if (speed)
6952           {
6953             if (VECTOR_MODE_P (mode))
6954               {
6955                 /* Vector SUB.  */
6956                 *cost += extra_cost->vect.alu;
6957               }
6958             else if (GET_MODE_CLASS (mode) == MODE_INT)
6959               {
6960                 /* SUB(S).  */
6961                 *cost += extra_cost->alu.arith;
6962               }
6963             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6964               {
6965                 /* FSUB.  */
6966                 *cost += extra_cost->fp[mode == DFmode].addsub;
6967               }
6968           }
6969         return true;
6970       }
6971
6972     case PLUS:
6973       {
6974         rtx new_op0;
6975
6976         op0 = XEXP (x, 0);
6977         op1 = XEXP (x, 1);
6978
6979 cost_plus:
6980         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6981             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6982           {
6983             /* CSINC.  */
6984             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6985             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6986             return true;
6987           }
6988
6989         if (GET_MODE_CLASS (mode) == MODE_INT
6990             && CONST_INT_P (op1)
6991             && aarch64_uimm12_shift (INTVAL (op1)))
6992           {
6993             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6994
6995             if (speed)
6996               /* ADD (immediate).  */
6997               *cost += extra_cost->alu.arith;
6998             return true;
6999           }
7000
7001         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7002
7003         /* Look for ADD (extended register).  */
7004         if (aarch64_rtx_arith_op_extract_p (op0, mode))
7005           {
7006             if (speed)
7007               *cost += extra_cost->alu.extend_arith;
7008
7009             op0 = aarch64_strip_extend (op0);
7010             *cost += rtx_cost (op0, VOIDmode,
7011                                (enum rtx_code) GET_CODE (op0), 0, speed);
7012             return true;
7013           }
7014
7015         /* Strip any extend, leave shifts behind as we will
7016            cost them through mult_cost.  */
7017         new_op0 = aarch64_strip_extend (op0);
7018
7019         if (GET_CODE (new_op0) == MULT
7020             || aarch64_shift_p (GET_CODE (new_op0)))
7021           {
7022             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7023                                             speed);
7024             return true;
7025           }
7026
7027         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7028
7029         if (speed)
7030           {
7031             if (VECTOR_MODE_P (mode))
7032               {
7033                 /* Vector ADD.  */
7034                 *cost += extra_cost->vect.alu;
7035               }
7036             else if (GET_MODE_CLASS (mode) == MODE_INT)
7037               {
7038                 /* ADD.  */
7039                 *cost += extra_cost->alu.arith;
7040               }
7041             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7042               {
7043                 /* FADD.  */
7044                 *cost += extra_cost->fp[mode == DFmode].addsub;
7045               }
7046           }
7047         return true;
7048       }
7049
7050     case BSWAP:
7051       *cost = COSTS_N_INSNS (1);
7052
7053       if (speed)
7054         {
7055           if (VECTOR_MODE_P (mode))
7056             *cost += extra_cost->vect.alu;
7057           else
7058             *cost += extra_cost->alu.rev;
7059         }
7060       return false;
7061
7062     case IOR:
7063       if (aarch_rev16_p (x))
7064         {
7065           *cost = COSTS_N_INSNS (1);
7066
7067           if (speed)
7068             {
7069               if (VECTOR_MODE_P (mode))
7070                 *cost += extra_cost->vect.alu;
7071               else
7072                 *cost += extra_cost->alu.rev;
7073             }
7074           return true;
7075         }
7076
7077       if (aarch64_extr_rtx_p (x, &op0, &op1))
7078         {
7079           *cost += rtx_cost (op0, mode, IOR, 0, speed);
7080           *cost += rtx_cost (op1, mode, IOR, 1, speed);
7081           if (speed)
7082             *cost += extra_cost->alu.shift;
7083
7084           return true;
7085         }
7086     /* Fall through.  */
7087     case XOR:
7088     case AND:
7089     cost_logic:
7090       op0 = XEXP (x, 0);
7091       op1 = XEXP (x, 1);
7092
7093       if (VECTOR_MODE_P (mode))
7094         {
7095           if (speed)
7096             *cost += extra_cost->vect.alu;
7097           return true;
7098         }
7099
7100       if (code == AND
7101           && GET_CODE (op0) == MULT
7102           && CONST_INT_P (XEXP (op0, 1))
7103           && CONST_INT_P (op1)
7104           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7105                                INTVAL (op1)) != 0)
7106         {
7107           /* This is a UBFM/SBFM.  */
7108           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7109           if (speed)
7110             *cost += extra_cost->alu.bfx;
7111           return true;
7112         }
7113
7114       if (GET_MODE_CLASS (mode) == MODE_INT)
7115         {
7116           if (CONST_INT_P (op1))
7117             {
7118               /* We have a mask + shift version of a UBFIZ
7119                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
7120               if (GET_CODE (op0) == ASHIFT
7121                   && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7122                                                           XEXP (op0, 1)))
7123                 {
7124                   *cost += rtx_cost (XEXP (op0, 0), mode,
7125                                      (enum rtx_code) code, 0, speed);
7126                   if (speed)
7127                     *cost += extra_cost->alu.bfx;
7128
7129                   return true;
7130                 }
7131               else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7132                 {
7133                 /* We possibly get the immediate for free, this is not
7134                    modelled.  */
7135                   *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7136                   if (speed)
7137                     *cost += extra_cost->alu.logical;
7138
7139                   return true;
7140                 }
7141             }
7142           else
7143             {
7144               rtx new_op0 = op0;
7145
7146               /* Handle ORN, EON, or BIC.  */
7147               if (GET_CODE (op0) == NOT)
7148                 op0 = XEXP (op0, 0);
7149
7150               new_op0 = aarch64_strip_shift (op0);
7151
7152               /* If we had a shift on op0 then this is a logical-shift-
7153                  by-register/immediate operation.  Otherwise, this is just
7154                  a logical operation.  */
7155               if (speed)
7156                 {
7157                   if (new_op0 != op0)
7158                     {
7159                       /* Shift by immediate.  */
7160                       if (CONST_INT_P (XEXP (op0, 1)))
7161                         *cost += extra_cost->alu.log_shift;
7162                       else
7163                         *cost += extra_cost->alu.log_shift_reg;
7164                     }
7165                   else
7166                     *cost += extra_cost->alu.logical;
7167                 }
7168
7169               /* In both cases we want to cost both operands.  */
7170               *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7171               *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7172
7173               return true;
7174             }
7175         }
7176       return false;
7177
7178     case NOT:
7179       x = XEXP (x, 0);
7180       op0 = aarch64_strip_shift (x);
7181
7182       if (VECTOR_MODE_P (mode))
7183         {
7184           /* Vector NOT.  */
7185           *cost += extra_cost->vect.alu;
7186           return false;
7187         }
7188
7189       /* MVN-shifted-reg.  */
7190       if (op0 != x)
7191         {
7192           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7193
7194           if (speed)
7195             *cost += extra_cost->alu.log_shift;
7196
7197           return true;
7198         }
7199       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7200          Handle the second form here taking care that 'a' in the above can
7201          be a shift.  */
7202       else if (GET_CODE (op0) == XOR)
7203         {
7204           rtx newop0 = XEXP (op0, 0);
7205           rtx newop1 = XEXP (op0, 1);
7206           rtx op0_stripped = aarch64_strip_shift (newop0);
7207
7208           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7209           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7210
7211           if (speed)
7212             {
7213               if (op0_stripped != newop0)
7214                 *cost += extra_cost->alu.log_shift;
7215               else
7216                 *cost += extra_cost->alu.logical;
7217             }
7218
7219           return true;
7220         }
7221       /* MVN.  */
7222       if (speed)
7223         *cost += extra_cost->alu.logical;
7224
7225       return false;
7226
7227     case ZERO_EXTEND:
7228
7229       op0 = XEXP (x, 0);
7230       /* If a value is written in SI mode, then zero extended to DI
7231          mode, the operation will in general be free as a write to
7232          a 'w' register implicitly zeroes the upper bits of an 'x'
7233          register.  However, if this is
7234
7235            (set (reg) (zero_extend (reg)))
7236
7237          we must cost the explicit register move.  */
7238       if (mode == DImode
7239           && GET_MODE (op0) == SImode
7240           && outer == SET)
7241         {
7242           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7243
7244         /* If OP_COST is non-zero, then the cost of the zero extend
7245            is effectively the cost of the inner operation.  Otherwise
7246            we have a MOV instruction and we take the cost from the MOV
7247            itself.  This is true independently of whether we are
7248            optimizing for space or time.  */
7249           if (op_cost)
7250             *cost = op_cost;
7251
7252           return true;
7253         }
7254       else if (MEM_P (op0))
7255         {
7256           /* All loads can zero extend to any size for free.  */
7257           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7258           return true;
7259         }
7260
7261       op0 = aarch64_extend_bitfield_pattern_p (x);
7262       if (op0)
7263         {
7264           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7265           if (speed)
7266             *cost += extra_cost->alu.bfx;
7267           return true;
7268         }
7269
7270       if (speed)
7271         {
7272           if (VECTOR_MODE_P (mode))
7273             {
7274               /* UMOV.  */
7275               *cost += extra_cost->vect.alu;
7276             }
7277           else
7278             {
7279               /* We generate an AND instead of UXTB/UXTH.  */
7280               *cost += extra_cost->alu.logical;
7281             }
7282         }
7283       return false;
7284
7285     case SIGN_EXTEND:
7286       if (MEM_P (XEXP (x, 0)))
7287         {
7288           /* LDRSH.  */
7289           if (speed)
7290             {
7291               rtx address = XEXP (XEXP (x, 0), 0);
7292               *cost += extra_cost->ldst.load_sign_extend;
7293
7294               *cost +=
7295                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7296                                                      0, speed));
7297             }
7298           return true;
7299         }
7300
7301       op0 = aarch64_extend_bitfield_pattern_p (x);
7302       if (op0)
7303         {
7304           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7305           if (speed)
7306             *cost += extra_cost->alu.bfx;
7307           return true;
7308         }
7309
7310       if (speed)
7311         {
7312           if (VECTOR_MODE_P (mode))
7313             *cost += extra_cost->vect.alu;
7314           else
7315             *cost += extra_cost->alu.extend;
7316         }
7317       return false;
7318
7319     case ASHIFT:
7320       op0 = XEXP (x, 0);
7321       op1 = XEXP (x, 1);
7322
7323       if (CONST_INT_P (op1))
7324         {
7325           if (speed)
7326             {
7327               if (VECTOR_MODE_P (mode))
7328                 {
7329                   /* Vector shift (immediate).  */
7330                   *cost += extra_cost->vect.alu;
7331                 }
7332               else
7333                 {
7334                   /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
7335                      aliases.  */
7336                   *cost += extra_cost->alu.shift;
7337                 }
7338             }
7339
7340           /* We can incorporate zero/sign extend for free.  */
7341           if (GET_CODE (op0) == ZERO_EXTEND
7342               || GET_CODE (op0) == SIGN_EXTEND)
7343             op0 = XEXP (op0, 0);
7344
7345           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7346           return true;
7347         }
7348       else
7349         {
7350           if (speed)
7351             {
7352               if (VECTOR_MODE_P (mode))
7353                 {
7354                   /* Vector shift (register).  */
7355                   *cost += extra_cost->vect.alu;
7356                 }
7357               else
7358                 {
7359                   /* LSLV.  */
7360                   *cost += extra_cost->alu.shift_reg;
7361                 }
7362             }
7363           return false;  /* All arguments need to be in registers.  */
7364         }
7365
7366     case ROTATE:
7367     case ROTATERT:
7368     case LSHIFTRT:
7369     case ASHIFTRT:
7370       op0 = XEXP (x, 0);
7371       op1 = XEXP (x, 1);
7372
7373       if (CONST_INT_P (op1))
7374         {
7375           /* ASR (immediate) and friends.  */
7376           if (speed)
7377             {
7378               if (VECTOR_MODE_P (mode))
7379                 *cost += extra_cost->vect.alu;
7380               else
7381                 *cost += extra_cost->alu.shift;
7382             }
7383
7384           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7385           return true;
7386         }
7387       else
7388         {
7389
7390           /* ASR (register) and friends.  */
7391           if (speed)
7392             {
7393               if (VECTOR_MODE_P (mode))
7394                 *cost += extra_cost->vect.alu;
7395               else
7396                 *cost += extra_cost->alu.shift_reg;
7397             }
7398           return false;  /* All arguments need to be in registers.  */
7399         }
7400
7401     case SYMBOL_REF:
7402
7403       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7404           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7405         {
7406           /* LDR.  */
7407           if (speed)
7408             *cost += extra_cost->ldst.load;
7409         }
7410       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7411                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7412         {
7413           /* ADRP, followed by ADD.  */
7414           *cost += COSTS_N_INSNS (1);
7415           if (speed)
7416             *cost += 2 * extra_cost->alu.arith;
7417         }
7418       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7419                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7420         {
7421           /* ADR.  */
7422           if (speed)
7423             *cost += extra_cost->alu.arith;
7424         }
7425
7426       if (flag_pic)
7427         {
7428           /* One extra load instruction, after accessing the GOT.  */
7429           *cost += COSTS_N_INSNS (1);
7430           if (speed)
7431             *cost += extra_cost->ldst.load;
7432         }
7433       return true;
7434
7435     case HIGH:
7436     case LO_SUM:
7437       /* ADRP/ADD (immediate).  */
7438       if (speed)
7439         *cost += extra_cost->alu.arith;
7440       return true;
7441
7442     case ZERO_EXTRACT:
7443     case SIGN_EXTRACT:
7444       /* UBFX/SBFX.  */
7445       if (speed)
7446         {
7447           if (VECTOR_MODE_P (mode))
7448             *cost += extra_cost->vect.alu;
7449           else
7450             *cost += extra_cost->alu.bfx;
7451         }
7452
7453       /* We can trust that the immediates used will be correct (there
7454          are no by-register forms), so we need only cost op0.  */
7455       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7456       return true;
7457
7458     case MULT:
7459       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7460       /* aarch64_rtx_mult_cost always handles recursion to its
7461          operands.  */
7462       return true;
7463
7464     case MOD:
7465     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7466        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7467        an unconditional negate.  This case should only ever be reached through
7468        the set_smod_pow2_cheap check in expmed.c.  */
7469       if (CONST_INT_P (XEXP (x, 1))
7470           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7471           && (mode == SImode || mode == DImode))
7472         {
7473           /* We expand to 4 instructions.  Reset the baseline.  */
7474           *cost = COSTS_N_INSNS (4);
7475
7476           if (speed)
7477             *cost += 2 * extra_cost->alu.logical
7478                      + 2 * extra_cost->alu.arith;
7479
7480           return true;
7481         }
7482
7483     /* Fall-through.  */
7484     case UMOD:
7485       if (speed)
7486         {
7487           if (VECTOR_MODE_P (mode))
7488             *cost += extra_cost->vect.alu;
7489           else if (GET_MODE_CLASS (mode) == MODE_INT)
7490             *cost += (extra_cost->mult[mode == DImode].add
7491                       + extra_cost->mult[mode == DImode].idiv);
7492           else if (mode == DFmode)
7493             *cost += (extra_cost->fp[1].mult
7494                       + extra_cost->fp[1].div);
7495           else if (mode == SFmode)
7496             *cost += (extra_cost->fp[0].mult
7497                       + extra_cost->fp[0].div);
7498         }
7499       return false;  /* All arguments need to be in registers.  */
7500
7501     case DIV:
7502     case UDIV:
7503     case SQRT:
7504       if (speed)
7505         {
7506           if (VECTOR_MODE_P (mode))
7507             *cost += extra_cost->vect.alu;
7508           else if (GET_MODE_CLASS (mode) == MODE_INT)
7509             /* There is no integer SQRT, so only DIV and UDIV can get
7510                here.  */
7511             *cost += extra_cost->mult[mode == DImode].idiv;
7512           else
7513             *cost += extra_cost->fp[mode == DFmode].div;
7514         }
7515       return false;  /* All arguments need to be in registers.  */
7516
7517     case IF_THEN_ELSE:
7518       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7519                                          XEXP (x, 2), cost, speed);
7520
7521     case EQ:
7522     case NE:
7523     case GT:
7524     case GTU:
7525     case LT:
7526     case LTU:
7527     case GE:
7528     case GEU:
7529     case LE:
7530     case LEU:
7531
7532       return false; /* All arguments must be in registers.  */
7533
7534     case FMA:
7535       op0 = XEXP (x, 0);
7536       op1 = XEXP (x, 1);
7537       op2 = XEXP (x, 2);
7538
7539       if (speed)
7540         {
7541           if (VECTOR_MODE_P (mode))
7542             *cost += extra_cost->vect.alu;
7543           else
7544             *cost += extra_cost->fp[mode == DFmode].fma;
7545         }
7546
7547       /* FMSUB, FNMADD, and FNMSUB are free.  */
7548       if (GET_CODE (op0) == NEG)
7549         op0 = XEXP (op0, 0);
7550
7551       if (GET_CODE (op2) == NEG)
7552         op2 = XEXP (op2, 0);
7553
7554       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7555          and the by-element operand as operand 0.  */
7556       if (GET_CODE (op1) == NEG)
7557         op1 = XEXP (op1, 0);
7558
7559       /* Catch vector-by-element operations.  The by-element operand can
7560          either be (vec_duplicate (vec_select (x))) or just
7561          (vec_select (x)), depending on whether we are multiplying by
7562          a vector or a scalar.
7563
7564          Canonicalization is not very good in these cases, FMA4 will put the
7565          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7566       if (GET_CODE (op0) == VEC_DUPLICATE)
7567         op0 = XEXP (op0, 0);
7568       else if (GET_CODE (op1) == VEC_DUPLICATE)
7569         op1 = XEXP (op1, 0);
7570
7571       if (GET_CODE (op0) == VEC_SELECT)
7572         op0 = XEXP (op0, 0);
7573       else if (GET_CODE (op1) == VEC_SELECT)
7574         op1 = XEXP (op1, 0);
7575
7576       /* If the remaining parameters are not registers,
7577          get the cost to put them into registers.  */
7578       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7579       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7580       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7581       return true;
7582
7583     case FLOAT:
7584     case UNSIGNED_FLOAT:
7585       if (speed)
7586         *cost += extra_cost->fp[mode == DFmode].fromint;
7587       return false;
7588
7589     case FLOAT_EXTEND:
7590       if (speed)
7591         {
7592           if (VECTOR_MODE_P (mode))
7593             {
7594               /* Vector widening conversion.  */
7595               *cost += extra_cost->vect.alu;
7596             }
7597           else
7598             *cost += extra_cost->fp[mode == DFmode].widen;
7599         }
7600       return false;
7601
7602     case FLOAT_TRUNCATE:
7603       if (speed)
7604         {
7605           if (VECTOR_MODE_P (mode))
7606             {
7607               /*Vector conversion.  */
7608               *cost += extra_cost->vect.alu;
7609             }
7610           else
7611             *cost += extra_cost->fp[mode == DFmode].narrow;
7612         }
7613       return false;
7614
7615     case FIX:
7616     case UNSIGNED_FIX:
7617       x = XEXP (x, 0);
7618       /* Strip the rounding part.  They will all be implemented
7619          by the fcvt* family of instructions anyway.  */
7620       if (GET_CODE (x) == UNSPEC)
7621         {
7622           unsigned int uns_code = XINT (x, 1);
7623
7624           if (uns_code == UNSPEC_FRINTA
7625               || uns_code == UNSPEC_FRINTM
7626               || uns_code == UNSPEC_FRINTN
7627               || uns_code == UNSPEC_FRINTP
7628               || uns_code == UNSPEC_FRINTZ)
7629             x = XVECEXP (x, 0, 0);
7630         }
7631
7632       if (speed)
7633         {
7634           if (VECTOR_MODE_P (mode))
7635             *cost += extra_cost->vect.alu;
7636           else
7637             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7638         }
7639
7640       /* We can combine fmul by a power of 2 followed by a fcvt into a single
7641          fixed-point fcvt.  */
7642       if (GET_CODE (x) == MULT
7643           && ((VECTOR_MODE_P (mode)
7644                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7645               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7646         {
7647           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7648                              0, speed);
7649           return true;
7650         }
7651
7652       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7653       return true;
7654
7655     case ABS:
7656       if (VECTOR_MODE_P (mode))
7657         {
7658           /* ABS (vector).  */
7659           if (speed)
7660             *cost += extra_cost->vect.alu;
7661         }
7662       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7663         {
7664           op0 = XEXP (x, 0);
7665
7666           /* FABD, which is analogous to FADD.  */
7667           if (GET_CODE (op0) == MINUS)
7668             {
7669               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7670               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7671               if (speed)
7672                 *cost += extra_cost->fp[mode == DFmode].addsub;
7673
7674               return true;
7675             }
7676           /* Simple FABS is analogous to FNEG.  */
7677           if (speed)
7678             *cost += extra_cost->fp[mode == DFmode].neg;
7679         }
7680       else
7681         {
7682           /* Integer ABS will either be split to
7683              two arithmetic instructions, or will be an ABS
7684              (scalar), which we don't model.  */
7685           *cost = COSTS_N_INSNS (2);
7686           if (speed)
7687             *cost += 2 * extra_cost->alu.arith;
7688         }
7689       return false;
7690
7691     case SMAX:
7692     case SMIN:
7693       if (speed)
7694         {
7695           if (VECTOR_MODE_P (mode))
7696             *cost += extra_cost->vect.alu;
7697           else
7698             {
7699               /* FMAXNM/FMINNM/FMAX/FMIN.
7700                  TODO: This may not be accurate for all implementations, but
7701                  we do not model this in the cost tables.  */
7702               *cost += extra_cost->fp[mode == DFmode].addsub;
7703             }
7704         }
7705       return false;
7706
7707     case UNSPEC:
7708       /* The floating point round to integer frint* instructions.  */
7709       if (aarch64_frint_unspec_p (XINT (x, 1)))
7710         {
7711           if (speed)
7712             *cost += extra_cost->fp[mode == DFmode].roundint;
7713
7714           return false;
7715         }
7716
7717       if (XINT (x, 1) == UNSPEC_RBIT)
7718         {
7719           if (speed)
7720             *cost += extra_cost->alu.rev;
7721
7722           return false;
7723         }
7724       break;
7725
7726     case TRUNCATE:
7727
7728       /* Decompose <su>muldi3_highpart.  */
7729       if (/* (truncate:DI  */
7730           mode == DImode
7731           /*   (lshiftrt:TI  */
7732           && GET_MODE (XEXP (x, 0)) == TImode
7733           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7734           /*      (mult:TI  */
7735           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7736           /*        (ANY_EXTEND:TI (reg:DI))
7737                     (ANY_EXTEND:TI (reg:DI)))  */
7738           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7739                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7740               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7741                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7742           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7743           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7744           /*     (const_int 64)  */
7745           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7746           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7747         {
7748           /* UMULH/SMULH.  */
7749           if (speed)
7750             *cost += extra_cost->mult[mode == DImode].extend;
7751           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7752                              mode, MULT, 0, speed);
7753           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7754                              mode, MULT, 1, speed);
7755           return true;
7756         }
7757
7758       /* Fall through.  */
7759     default:
7760       break;
7761     }
7762
7763   if (dump_file
7764       && flag_aarch64_verbose_cost)
7765     fprintf (dump_file,
7766       "\nFailed to cost RTX.  Assuming default cost.\n");
7767
7768   return true;
7769 }
7770
7771 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7772    calculated for X.  This cost is stored in *COST.  Returns true
7773    if the total cost of X was calculated.  */
7774 static bool
7775 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7776                    int param, int *cost, bool speed)
7777 {
7778   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7779
7780   if (dump_file
7781       && flag_aarch64_verbose_cost)
7782     {
7783       print_rtl_single (dump_file, x);
7784       fprintf (dump_file, "\n%s cost: %d (%s)\n",
7785                speed ? "Hot" : "Cold",
7786                *cost, result ? "final" : "partial");
7787     }
7788
7789   return result;
7790 }
7791
7792 static int
7793 aarch64_register_move_cost (machine_mode mode,
7794                             reg_class_t from_i, reg_class_t to_i)
7795 {
7796   enum reg_class from = (enum reg_class) from_i;
7797   enum reg_class to = (enum reg_class) to_i;
7798   const struct cpu_regmove_cost *regmove_cost
7799     = aarch64_tune_params.regmove_cost;
7800
7801   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
7802   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7803     to = GENERAL_REGS;
7804
7805   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7806     from = GENERAL_REGS;
7807
7808   /* Moving between GPR and stack cost is the same as GP2GP.  */
7809   if ((from == GENERAL_REGS && to == STACK_REG)
7810       || (to == GENERAL_REGS && from == STACK_REG))
7811     return regmove_cost->GP2GP;
7812
7813   /* To/From the stack register, we move via the gprs.  */
7814   if (to == STACK_REG || from == STACK_REG)
7815     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7816             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7817
7818   if (GET_MODE_SIZE (mode) == 16)
7819     {
7820       /* 128-bit operations on general registers require 2 instructions.  */
7821       if (from == GENERAL_REGS && to == GENERAL_REGS)
7822         return regmove_cost->GP2GP * 2;
7823       else if (from == GENERAL_REGS)
7824         return regmove_cost->GP2FP * 2;
7825       else if (to == GENERAL_REGS)
7826         return regmove_cost->FP2GP * 2;
7827
7828       /* When AdvSIMD instructions are disabled it is not possible to move
7829          a 128-bit value directly between Q registers.  This is handled in
7830          secondary reload.  A general register is used as a scratch to move
7831          the upper DI value and the lower DI value is moved directly,
7832          hence the cost is the sum of three moves. */
7833       if (! TARGET_SIMD)
7834         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7835
7836       return regmove_cost->FP2FP;
7837     }
7838
7839   if (from == GENERAL_REGS && to == GENERAL_REGS)
7840     return regmove_cost->GP2GP;
7841   else if (from == GENERAL_REGS)
7842     return regmove_cost->GP2FP;
7843   else if (to == GENERAL_REGS)
7844     return regmove_cost->FP2GP;
7845
7846   return regmove_cost->FP2FP;
7847 }
7848
7849 static int
7850 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7851                           reg_class_t rclass ATTRIBUTE_UNUSED,
7852                           bool in ATTRIBUTE_UNUSED)
7853 {
7854   return aarch64_tune_params.memmov_cost;
7855 }
7856
7857 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7858    to optimize 1.0/sqrt.  */
7859
7860 static bool
7861 use_rsqrt_p (machine_mode mode)
7862 {
7863   return (!flag_trapping_math
7864           && flag_unsafe_math_optimizations
7865           && ((aarch64_tune_params.approx_modes->recip_sqrt
7866                & AARCH64_APPROX_MODE (mode))
7867               || flag_mrecip_low_precision_sqrt));
7868 }
7869
7870 /* Function to decide when to use the approximate reciprocal square root
7871    builtin.  */
7872
7873 static tree
7874 aarch64_builtin_reciprocal (tree fndecl)
7875 {
7876   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7877
7878   if (!use_rsqrt_p (mode))
7879     return NULL_TREE;
7880   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7881 }
7882
7883 typedef rtx (*rsqrte_type) (rtx, rtx);
7884
7885 /* Select reciprocal square root initial estimate insn depending on machine
7886    mode.  */
7887
7888 static rsqrte_type
7889 get_rsqrte_type (machine_mode mode)
7890 {
7891   switch (mode)
7892   {
7893     case DFmode:   return gen_aarch64_rsqrtedf;
7894     case SFmode:   return gen_aarch64_rsqrtesf;
7895     case V2DFmode: return gen_aarch64_rsqrtev2df;
7896     case V2SFmode: return gen_aarch64_rsqrtev2sf;
7897     case V4SFmode: return gen_aarch64_rsqrtev4sf;
7898     default: gcc_unreachable ();
7899   }
7900 }
7901
7902 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7903
7904 /* Select reciprocal square root series step insn depending on machine mode.  */
7905
7906 static rsqrts_type
7907 get_rsqrts_type (machine_mode mode)
7908 {
7909   switch (mode)
7910   {
7911     case DFmode:   return gen_aarch64_rsqrtsdf;
7912     case SFmode:   return gen_aarch64_rsqrtssf;
7913     case V2DFmode: return gen_aarch64_rsqrtsv2df;
7914     case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7915     case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7916     default: gcc_unreachable ();
7917   }
7918 }
7919
7920 /* Emit instruction sequence to compute either the approximate square root
7921    or its approximate reciprocal, depending on the flag RECP, and return
7922    whether the sequence was emitted or not.  */
7923
7924 bool
7925 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7926 {
7927   machine_mode mode = GET_MODE (dst);
7928
7929   if (GET_MODE_INNER (mode) == HFmode)
7930     return false;
7931
7932   machine_mode mmsk = mode_for_vector
7933                         (int_mode_for_mode (GET_MODE_INNER (mode)),
7934                          GET_MODE_NUNITS (mode));
7935   bool use_approx_sqrt_p = (!recp
7936                             && (flag_mlow_precision_sqrt
7937                                 || (aarch64_tune_params.approx_modes->sqrt
7938                                     & AARCH64_APPROX_MODE (mode))));
7939   bool use_approx_rsqrt_p = (recp
7940                              && (flag_mrecip_low_precision_sqrt
7941                                  || (aarch64_tune_params.approx_modes->recip_sqrt
7942                                      & AARCH64_APPROX_MODE (mode))));
7943
7944   if (!flag_finite_math_only
7945       || flag_trapping_math
7946       || !flag_unsafe_math_optimizations
7947       || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7948       || optimize_function_for_size_p (cfun))
7949     return false;
7950
7951   rtx xmsk = gen_reg_rtx (mmsk);
7952   if (!recp)
7953     /* When calculating the approximate square root, compare the argument with
7954        0.0 and create a mask.  */
7955     emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7956                                                           CONST0_RTX (mode)))));
7957
7958   /* Estimate the approximate reciprocal square root.  */
7959   rtx xdst = gen_reg_rtx (mode);
7960   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7961
7962   /* Iterate over the series twice for SF and thrice for DF.  */
7963   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7964
7965   /* Optionally iterate over the series once less for faster performance
7966      while sacrificing the accuracy.  */
7967   if ((recp && flag_mrecip_low_precision_sqrt)
7968       || (!recp && flag_mlow_precision_sqrt))
7969     iterations--;
7970
7971   /* Iterate over the series to calculate the approximate reciprocal square
7972      root.  */
7973   rtx x1 = gen_reg_rtx (mode);
7974   while (iterations--)
7975     {
7976       rtx x2 = gen_reg_rtx (mode);
7977       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7978
7979       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7980
7981       if (iterations > 0)
7982         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7983     }
7984
7985   if (!recp)
7986     {
7987       /* Qualify the approximate reciprocal square root when the argument is
7988          0.0 by squashing the intermediary result to 0.0.  */
7989       rtx xtmp = gen_reg_rtx (mmsk);
7990       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7991                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
7992       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7993
7994       /* Calculate the approximate square root.  */
7995       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7996     }
7997
7998   /* Finalize the approximation.  */
7999   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8000
8001   return true;
8002 }
8003
8004 typedef rtx (*recpe_type) (rtx, rtx);
8005
8006 /* Select reciprocal initial estimate insn depending on machine mode.  */
8007
8008 static recpe_type
8009 get_recpe_type (machine_mode mode)
8010 {
8011   switch (mode)
8012   {
8013     case SFmode:   return (gen_aarch64_frecpesf);
8014     case V2SFmode: return (gen_aarch64_frecpev2sf);
8015     case V4SFmode: return (gen_aarch64_frecpev4sf);
8016     case DFmode:   return (gen_aarch64_frecpedf);
8017     case V2DFmode: return (gen_aarch64_frecpev2df);
8018     default:       gcc_unreachable ();
8019   }
8020 }
8021
8022 typedef rtx (*recps_type) (rtx, rtx, rtx);
8023
8024 /* Select reciprocal series step insn depending on machine mode.  */
8025
8026 static recps_type
8027 get_recps_type (machine_mode mode)
8028 {
8029   switch (mode)
8030   {
8031     case SFmode:   return (gen_aarch64_frecpssf);
8032     case V2SFmode: return (gen_aarch64_frecpsv2sf);
8033     case V4SFmode: return (gen_aarch64_frecpsv4sf);
8034     case DFmode:   return (gen_aarch64_frecpsdf);
8035     case V2DFmode: return (gen_aarch64_frecpsv2df);
8036     default:       gcc_unreachable ();
8037   }
8038 }
8039
8040 /* Emit the instruction sequence to compute the approximation for the division
8041    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
8042
8043 bool
8044 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8045 {
8046   machine_mode mode = GET_MODE (quo);
8047
8048   if (GET_MODE_INNER (mode) == HFmode)
8049     return false;
8050
8051   bool use_approx_division_p = (flag_mlow_precision_div
8052                                 || (aarch64_tune_params.approx_modes->division
8053                                     & AARCH64_APPROX_MODE (mode)));
8054
8055   if (!flag_finite_math_only
8056       || flag_trapping_math
8057       || !flag_unsafe_math_optimizations
8058       || optimize_function_for_size_p (cfun)
8059       || !use_approx_division_p)
8060     return false;
8061
8062   /* Estimate the approximate reciprocal.  */
8063   rtx xrcp = gen_reg_rtx (mode);
8064   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8065
8066   /* Iterate over the series twice for SF and thrice for DF.  */
8067   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8068
8069   /* Optionally iterate over the series once less for faster performance,
8070      while sacrificing the accuracy.  */
8071   if (flag_mlow_precision_div)
8072     iterations--;
8073
8074   /* Iterate over the series to calculate the approximate reciprocal.  */
8075   rtx xtmp = gen_reg_rtx (mode);
8076   while (iterations--)
8077     {
8078       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8079
8080       if (iterations > 0)
8081         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8082     }
8083
8084   if (num != CONST1_RTX (mode))
8085     {
8086       /* As the approximate reciprocal of DEN is already calculated, only
8087          calculate the approximate division when NUM is not 1.0.  */
8088       rtx xnum = force_reg (mode, num);
8089       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8090     }
8091
8092   /* Finalize the approximation.  */
8093   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8094   return true;
8095 }
8096
8097 /* Return the number of instructions that can be issued per cycle.  */
8098 static int
8099 aarch64_sched_issue_rate (void)
8100 {
8101   return aarch64_tune_params.issue_rate;
8102 }
8103
8104 static int
8105 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8106 {
8107   int issue_rate = aarch64_sched_issue_rate ();
8108
8109   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8110 }
8111
8112
8113 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8114    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
8115    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
8116
8117 static int
8118 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8119                                                     int ready_index)
8120 {
8121   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8122 }
8123
8124
8125 /* Vectorizer cost model target hooks.  */
8126
8127 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
8128 static int
8129 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8130                                     tree vectype,
8131                                     int misalign ATTRIBUTE_UNUSED)
8132 {
8133   unsigned elements;
8134   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8135   bool fp = false;
8136
8137   if (vectype != NULL)
8138     fp = FLOAT_TYPE_P (vectype);
8139
8140   switch (type_of_cost)
8141     {
8142       case scalar_stmt:
8143         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8144
8145       case scalar_load:
8146         return costs->scalar_load_cost;
8147
8148       case scalar_store:
8149         return costs->scalar_store_cost;
8150
8151       case vector_stmt:
8152         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8153
8154       case vector_load:
8155         return costs->vec_align_load_cost;
8156
8157       case vector_store:
8158         return costs->vec_store_cost;
8159
8160       case vec_to_scalar:
8161         return costs->vec_to_scalar_cost;
8162
8163       case scalar_to_vec:
8164         return costs->scalar_to_vec_cost;
8165
8166       case unaligned_load:
8167         return costs->vec_unalign_load_cost;
8168
8169       case unaligned_store:
8170         return costs->vec_unalign_store_cost;
8171
8172       case cond_branch_taken:
8173         return costs->cond_taken_branch_cost;
8174
8175       case cond_branch_not_taken:
8176         return costs->cond_not_taken_branch_cost;
8177
8178       case vec_perm:
8179         return costs->vec_permute_cost;
8180
8181       case vec_promote_demote:
8182         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8183
8184       case vec_construct:
8185         elements = TYPE_VECTOR_SUBPARTS (vectype);
8186         return elements / 2 + 1;
8187
8188       default:
8189         gcc_unreachable ();
8190     }
8191 }
8192
8193 /* Implement targetm.vectorize.add_stmt_cost.  */
8194 static unsigned
8195 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8196                        struct _stmt_vec_info *stmt_info, int misalign,
8197                        enum vect_cost_model_location where)
8198 {
8199   unsigned *cost = (unsigned *) data;
8200   unsigned retval = 0;
8201
8202   if (flag_vect_cost_model)
8203     {
8204       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8205       int stmt_cost =
8206             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8207
8208       /* Statements in an inner loop relative to the loop being
8209          vectorized are weighted more heavily.  The value here is
8210          arbitrary and could potentially be improved with analysis.  */
8211       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8212         count *= 50; /*  FIXME  */
8213
8214       retval = (unsigned) (count * stmt_cost);
8215       cost[where] += retval;
8216     }
8217
8218   return retval;
8219 }
8220
8221 static void initialize_aarch64_code_model (struct gcc_options *);
8222
8223 /* Parse the TO_PARSE string and put the architecture struct that it
8224    selects into RES and the architectural features into ISA_FLAGS.
8225    Return an aarch64_parse_opt_result describing the parse result.
8226    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
8227
8228 static enum aarch64_parse_opt_result
8229 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8230                     unsigned long *isa_flags)
8231 {
8232   char *ext;
8233   const struct processor *arch;
8234   char *str = (char *) alloca (strlen (to_parse) + 1);
8235   size_t len;
8236
8237   strcpy (str, to_parse);
8238
8239   ext = strchr (str, '+');
8240
8241   if (ext != NULL)
8242     len = ext - str;
8243   else
8244     len = strlen (str);
8245
8246   if (len == 0)
8247     return AARCH64_PARSE_MISSING_ARG;
8248
8249
8250   /* Loop through the list of supported ARCHes to find a match.  */
8251   for (arch = all_architectures; arch->name != NULL; arch++)
8252     {
8253       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8254         {
8255           unsigned long isa_temp = arch->flags;
8256
8257           if (ext != NULL)
8258             {
8259               /* TO_PARSE string contains at least one extension.  */
8260               enum aarch64_parse_opt_result ext_res
8261                 = aarch64_parse_extension (ext, &isa_temp);
8262
8263               if (ext_res != AARCH64_PARSE_OK)
8264                 return ext_res;
8265             }
8266           /* Extension parsing was successful.  Confirm the result
8267              arch and ISA flags.  */
8268           *res = arch;
8269           *isa_flags = isa_temp;
8270           return AARCH64_PARSE_OK;
8271         }
8272     }
8273
8274   /* ARCH name not found in list.  */
8275   return AARCH64_PARSE_INVALID_ARG;
8276 }
8277
8278 /* Parse the TO_PARSE string and put the result tuning in RES and the
8279    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
8280    describing the parse result.  If there is an error parsing, RES and
8281    ISA_FLAGS are left unchanged.  */
8282
8283 static enum aarch64_parse_opt_result
8284 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8285                    unsigned long *isa_flags)
8286 {
8287   char *ext;
8288   const struct processor *cpu;
8289   char *str = (char *) alloca (strlen (to_parse) + 1);
8290   size_t len;
8291
8292   strcpy (str, to_parse);
8293
8294   ext = strchr (str, '+');
8295
8296   if (ext != NULL)
8297     len = ext - str;
8298   else
8299     len = strlen (str);
8300
8301   if (len == 0)
8302     return AARCH64_PARSE_MISSING_ARG;
8303
8304
8305   /* Loop through the list of supported CPUs to find a match.  */
8306   for (cpu = all_cores; cpu->name != NULL; cpu++)
8307     {
8308       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8309         {
8310           unsigned long isa_temp = cpu->flags;
8311
8312
8313           if (ext != NULL)
8314             {
8315               /* TO_PARSE string contains at least one extension.  */
8316               enum aarch64_parse_opt_result ext_res
8317                 = aarch64_parse_extension (ext, &isa_temp);
8318
8319               if (ext_res != AARCH64_PARSE_OK)
8320                 return ext_res;
8321             }
8322           /* Extension parsing was successfull.  Confirm the result
8323              cpu and ISA flags.  */
8324           *res = cpu;
8325           *isa_flags = isa_temp;
8326           return AARCH64_PARSE_OK;
8327         }
8328     }
8329
8330   /* CPU name not found in list.  */
8331   return AARCH64_PARSE_INVALID_ARG;
8332 }
8333
8334 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8335    Return an aarch64_parse_opt_result describing the parse result.
8336    If the parsing fails the RES does not change.  */
8337
8338 static enum aarch64_parse_opt_result
8339 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8340 {
8341   const struct processor *cpu;
8342   char *str = (char *) alloca (strlen (to_parse) + 1);
8343
8344   strcpy (str, to_parse);
8345
8346   /* Loop through the list of supported CPUs to find a match.  */
8347   for (cpu = all_cores; cpu->name != NULL; cpu++)
8348     {
8349       if (strcmp (cpu->name, str) == 0)
8350         {
8351           *res = cpu;
8352           return AARCH64_PARSE_OK;
8353         }
8354     }
8355
8356   /* CPU name not found in list.  */
8357   return AARCH64_PARSE_INVALID_ARG;
8358 }
8359
8360 /* Parse TOKEN, which has length LENGTH to see if it is an option
8361    described in FLAG.  If it is, return the index bit for that fusion type.
8362    If not, error (printing OPTION_NAME) and return zero.  */
8363
8364 static unsigned int
8365 aarch64_parse_one_option_token (const char *token,
8366                                 size_t length,
8367                                 const struct aarch64_flag_desc *flag,
8368                                 const char *option_name)
8369 {
8370   for (; flag->name != NULL; flag++)
8371     {
8372       if (length == strlen (flag->name)
8373           && !strncmp (flag->name, token, length))
8374         return flag->flag;
8375     }
8376
8377   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8378   return 0;
8379 }
8380
8381 /* Parse OPTION which is a comma-separated list of flags to enable.
8382    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8383    default state we inherit from the CPU tuning structures.  OPTION_NAME
8384    gives the top-level option we are parsing in the -moverride string,
8385    for use in error messages.  */
8386
8387 static unsigned int
8388 aarch64_parse_boolean_options (const char *option,
8389                                const struct aarch64_flag_desc *flags,
8390                                unsigned int initial_state,
8391                                const char *option_name)
8392 {
8393   const char separator = '.';
8394   const char* specs = option;
8395   const char* ntoken = option;
8396   unsigned int found_flags = initial_state;
8397
8398   while ((ntoken = strchr (specs, separator)))
8399     {
8400       size_t token_length = ntoken - specs;
8401       unsigned token_ops = aarch64_parse_one_option_token (specs,
8402                                                            token_length,
8403                                                            flags,
8404                                                            option_name);
8405       /* If we find "none" (or, for simplicity's sake, an error) anywhere
8406          in the token stream, reset the supported operations.  So:
8407
8408            adrp+add.cmp+branch.none.adrp+add
8409
8410            would have the result of turning on only adrp+add fusion.  */
8411       if (!token_ops)
8412         found_flags = 0;
8413
8414       found_flags |= token_ops;
8415       specs = ++ntoken;
8416     }
8417
8418   /* We ended with a comma, print something.  */
8419   if (!(*specs))
8420     {
8421       error ("%s string ill-formed\n", option_name);
8422       return 0;
8423     }
8424
8425   /* We still have one more token to parse.  */
8426   size_t token_length = strlen (specs);
8427   unsigned token_ops = aarch64_parse_one_option_token (specs,
8428                                                        token_length,
8429                                                        flags,
8430                                                        option_name);
8431    if (!token_ops)
8432      found_flags = 0;
8433
8434   found_flags |= token_ops;
8435   return found_flags;
8436 }
8437
8438 /* Support for overriding instruction fusion.  */
8439
8440 static void
8441 aarch64_parse_fuse_string (const char *fuse_string,
8442                             struct tune_params *tune)
8443 {
8444   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8445                                                      aarch64_fusible_pairs,
8446                                                      tune->fusible_ops,
8447                                                      "fuse=");
8448 }
8449
8450 /* Support for overriding other tuning flags.  */
8451
8452 static void
8453 aarch64_parse_tune_string (const char *tune_string,
8454                             struct tune_params *tune)
8455 {
8456   tune->extra_tuning_flags
8457     = aarch64_parse_boolean_options (tune_string,
8458                                      aarch64_tuning_flags,
8459                                      tune->extra_tuning_flags,
8460                                      "tune=");
8461 }
8462
8463 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8464    we understand.  If it is, extract the option string and handoff to
8465    the appropriate function.  */
8466
8467 void
8468 aarch64_parse_one_override_token (const char* token,
8469                                   size_t length,
8470                                   struct tune_params *tune)
8471 {
8472   const struct aarch64_tuning_override_function *fn
8473     = aarch64_tuning_override_functions;
8474
8475   const char *option_part = strchr (token, '=');
8476   if (!option_part)
8477     {
8478       error ("tuning string missing in option (%s)", token);
8479       return;
8480     }
8481
8482   /* Get the length of the option name.  */
8483   length = option_part - token;
8484   /* Skip the '=' to get to the option string.  */
8485   option_part++;
8486
8487   for (; fn->name != NULL; fn++)
8488     {
8489       if (!strncmp (fn->name, token, length))
8490         {
8491           fn->parse_override (option_part, tune);
8492           return;
8493         }
8494     }
8495
8496   error ("unknown tuning option (%s)",token);
8497   return;
8498 }
8499
8500 /* A checking mechanism for the implementation of the tls size.  */
8501
8502 static void
8503 initialize_aarch64_tls_size (struct gcc_options *opts)
8504 {
8505   if (aarch64_tls_size == 0)
8506     aarch64_tls_size = 24;
8507
8508   switch (opts->x_aarch64_cmodel_var)
8509     {
8510     case AARCH64_CMODEL_TINY:
8511       /* Both the default and maximum TLS size allowed under tiny is 1M which
8512          needs two instructions to address, so we clamp the size to 24.  */
8513       if (aarch64_tls_size > 24)
8514         aarch64_tls_size = 24;
8515       break;
8516     case AARCH64_CMODEL_SMALL:
8517       /* The maximum TLS size allowed under small is 4G.  */
8518       if (aarch64_tls_size > 32)
8519         aarch64_tls_size = 32;
8520       break;
8521     case AARCH64_CMODEL_LARGE:
8522       /* The maximum TLS size allowed under large is 16E.
8523          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8524       if (aarch64_tls_size > 48)
8525         aarch64_tls_size = 48;
8526       break;
8527     default:
8528       gcc_unreachable ();
8529     }
8530
8531   return;
8532 }
8533
8534 /* Parse STRING looking for options in the format:
8535      string     :: option:string
8536      option     :: name=substring
8537      name       :: {a-z}
8538      substring  :: defined by option.  */
8539
8540 static void
8541 aarch64_parse_override_string (const char* input_string,
8542                                struct tune_params* tune)
8543 {
8544   const char separator = ':';
8545   size_t string_length = strlen (input_string) + 1;
8546   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8547   char *string = string_root;
8548   strncpy (string, input_string, string_length);
8549   string[string_length - 1] = '\0';
8550
8551   char* ntoken = string;
8552
8553   while ((ntoken = strchr (string, separator)))
8554     {
8555       size_t token_length = ntoken - string;
8556       /* Make this substring look like a string.  */
8557       *ntoken = '\0';
8558       aarch64_parse_one_override_token (string, token_length, tune);
8559       string = ++ntoken;
8560     }
8561
8562   /* One last option to parse.  */
8563   aarch64_parse_one_override_token (string, strlen (string), tune);
8564   free (string_root);
8565 }
8566
8567
8568 static void
8569 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8570 {
8571   /* The logic here is that if we are disabling all frame pointer generation
8572      then we do not need to disable leaf frame pointer generation as a
8573      separate operation.  But if we are *only* disabling leaf frame pointer
8574      generation then we set flag_omit_frame_pointer to true, but in
8575      aarch64_frame_pointer_required we return false only for leaf functions.
8576
8577      PR 70044: We have to be careful about being called multiple times for the
8578      same function.  Once we have decided to set flag_omit_frame_pointer just
8579      so that we can omit leaf frame pointers, we must then not interpret a
8580      second call as meaning that all frame pointer generation should be
8581      omitted.  We do this by setting flag_omit_frame_pointer to a special,
8582      non-zero value.  */
8583   if (opts->x_flag_omit_frame_pointer == 2)
8584     opts->x_flag_omit_frame_pointer = 0;
8585
8586   if (opts->x_flag_omit_frame_pointer)
8587     opts->x_flag_omit_leaf_frame_pointer = false;
8588   else if (opts->x_flag_omit_leaf_frame_pointer)
8589     opts->x_flag_omit_frame_pointer = 2;
8590
8591   /* If not optimizing for size, set the default
8592      alignment to what the target wants.  */
8593   if (!opts->x_optimize_size)
8594     {
8595       if (opts->x_align_loops <= 0)
8596         opts->x_align_loops = aarch64_tune_params.loop_align;
8597       if (opts->x_align_jumps <= 0)
8598         opts->x_align_jumps = aarch64_tune_params.jump_align;
8599       if (opts->x_align_functions <= 0)
8600         opts->x_align_functions = aarch64_tune_params.function_align;
8601     }
8602
8603   /* We default to no pc-relative literal loads.  */
8604
8605   aarch64_pcrelative_literal_loads = false;
8606
8607   /* If -mpc-relative-literal-loads is set on the command line, this
8608      implies that the user asked for PC relative literal loads.  */
8609   if (opts->x_pcrelative_literal_loads == 1)
8610     aarch64_pcrelative_literal_loads = true;
8611
8612   /* This is PR70113. When building the Linux kernel with
8613      CONFIG_ARM64_ERRATUM_843419, support for relocations
8614      R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8615      removed from the kernel to avoid loading objects with possibly
8616      offending sequences.  Without -mpc-relative-literal-loads we would
8617      generate such relocations, preventing the kernel build from
8618      succeeding.  */
8619   if (opts->x_pcrelative_literal_loads == 2
8620       && TARGET_FIX_ERR_A53_843419)
8621     aarch64_pcrelative_literal_loads = true;
8622
8623   /* In the tiny memory model it makes no sense to disallow PC relative
8624      literal pool loads.  */
8625   if (aarch64_cmodel == AARCH64_CMODEL_TINY
8626       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8627     aarch64_pcrelative_literal_loads = true;
8628
8629   /* When enabling the lower precision Newton series for the square root, also
8630      enable it for the reciprocal square root, since the latter is an
8631      intermediary step for the former.  */
8632   if (flag_mlow_precision_sqrt)
8633     flag_mrecip_low_precision_sqrt = true;
8634 }
8635
8636 /* 'Unpack' up the internal tuning structs and update the options
8637     in OPTS.  The caller must have set up selected_tune and selected_arch
8638     as all the other target-specific codegen decisions are
8639     derived from them.  */
8640
8641 void
8642 aarch64_override_options_internal (struct gcc_options *opts)
8643 {
8644   aarch64_tune_flags = selected_tune->flags;
8645   aarch64_tune = selected_tune->sched_core;
8646   /* Make a copy of the tuning parameters attached to the core, which
8647      we may later overwrite.  */
8648   aarch64_tune_params = *(selected_tune->tune);
8649   aarch64_architecture_version = selected_arch->architecture_version;
8650
8651   if (opts->x_aarch64_override_tune_string)
8652     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8653                                   &aarch64_tune_params);
8654
8655   /* This target defaults to strict volatile bitfields.  */
8656   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8657     opts->x_flag_strict_volatile_bitfields = 1;
8658
8659   initialize_aarch64_code_model (opts);
8660   initialize_aarch64_tls_size (opts);
8661
8662   int queue_depth = 0;
8663   switch (aarch64_tune_params.autoprefetcher_model)
8664     {
8665       case tune_params::AUTOPREFETCHER_OFF:
8666         queue_depth = -1;
8667         break;
8668       case tune_params::AUTOPREFETCHER_WEAK:
8669         queue_depth = 0;
8670         break;
8671       case tune_params::AUTOPREFETCHER_STRONG:
8672         queue_depth = max_insn_queue_index + 1;
8673         break;
8674       default:
8675         gcc_unreachable ();
8676     }
8677
8678   /* We don't mind passing in global_options_set here as we don't use
8679      the *options_set structs anyway.  */
8680   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8681                          queue_depth,
8682                          opts->x_param_values,
8683                          global_options_set.x_param_values);
8684
8685   /* Set the L1 cache line size.  */
8686   if (selected_cpu->tune->cache_line_size != 0)
8687     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8688                            selected_cpu->tune->cache_line_size,
8689                            opts->x_param_values,
8690                            global_options_set.x_param_values);
8691
8692   aarch64_override_options_after_change_1 (opts);
8693 }
8694
8695 /* Print a hint with a suggestion for a core or architecture name that
8696    most closely resembles what the user passed in STR.  ARCH is true if
8697    the user is asking for an architecture name.  ARCH is false if the user
8698    is asking for a core name.  */
8699
8700 static void
8701 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8702 {
8703   auto_vec<const char *> candidates;
8704   const struct processor *entry = arch ? all_architectures : all_cores;
8705   for (; entry->name != NULL; entry++)
8706     candidates.safe_push (entry->name);
8707   char *s;
8708   const char *hint = candidates_list_and_hint (str, s, candidates);
8709   if (hint)
8710     inform (input_location, "valid arguments are: %s;"
8711                              " did you mean %qs?", s, hint);
8712   XDELETEVEC (s);
8713 }
8714
8715 /* Print a hint with a suggestion for a core name that most closely resembles
8716    what the user passed in STR.  */
8717
8718 inline static void
8719 aarch64_print_hint_for_core (const char *str)
8720 {
8721   aarch64_print_hint_for_core_or_arch (str, false);
8722 }
8723
8724 /* Print a hint with a suggestion for an architecture name that most closely
8725    resembles what the user passed in STR.  */
8726
8727 inline static void
8728 aarch64_print_hint_for_arch (const char *str)
8729 {
8730   aarch64_print_hint_for_core_or_arch (str, true);
8731 }
8732
8733 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
8734    specified in STR and throw errors if appropriate.  Put the results if
8735    they are valid in RES and ISA_FLAGS.  Return whether the option is
8736    valid.  */
8737
8738 static bool
8739 aarch64_validate_mcpu (const char *str, const struct processor **res,
8740                        unsigned long *isa_flags)
8741 {
8742   enum aarch64_parse_opt_result parse_res
8743     = aarch64_parse_cpu (str, res, isa_flags);
8744
8745   if (parse_res == AARCH64_PARSE_OK)
8746     return true;
8747
8748   switch (parse_res)
8749     {
8750       case AARCH64_PARSE_MISSING_ARG:
8751         error ("missing cpu name in -mcpu=%qs", str);
8752         break;
8753       case AARCH64_PARSE_INVALID_ARG:
8754         error ("unknown value %qs for -mcpu", str);
8755         aarch64_print_hint_for_core (str);
8756         break;
8757       case AARCH64_PARSE_INVALID_FEATURE:
8758         error ("invalid feature modifier in -mcpu=%qs", str);
8759         break;
8760       default:
8761         gcc_unreachable ();
8762     }
8763
8764   return false;
8765 }
8766
8767 /* Validate a command-line -march option.  Parse the arch and extensions
8768    (if any) specified in STR and throw errors if appropriate.  Put the
8769    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
8770    option is valid.  */
8771
8772 static bool
8773 aarch64_validate_march (const char *str, const struct processor **res,
8774                          unsigned long *isa_flags)
8775 {
8776   enum aarch64_parse_opt_result parse_res
8777     = aarch64_parse_arch (str, res, isa_flags);
8778
8779   if (parse_res == AARCH64_PARSE_OK)
8780     return true;
8781
8782   switch (parse_res)
8783     {
8784       case AARCH64_PARSE_MISSING_ARG:
8785         error ("missing arch name in -march=%qs", str);
8786         break;
8787       case AARCH64_PARSE_INVALID_ARG:
8788         error ("unknown value %qs for -march", str);
8789         aarch64_print_hint_for_arch (str);
8790         break;
8791       case AARCH64_PARSE_INVALID_FEATURE:
8792         error ("invalid feature modifier in -march=%qs", str);
8793         break;
8794       default:
8795         gcc_unreachable ();
8796     }
8797
8798   return false;
8799 }
8800
8801 /* Validate a command-line -mtune option.  Parse the cpu
8802    specified in STR and throw errors if appropriate.  Put the
8803    result, if it is valid, in RES.  Return whether the option is
8804    valid.  */
8805
8806 static bool
8807 aarch64_validate_mtune (const char *str, const struct processor **res)
8808 {
8809   enum aarch64_parse_opt_result parse_res
8810     = aarch64_parse_tune (str, res);
8811
8812   if (parse_res == AARCH64_PARSE_OK)
8813     return true;
8814
8815   switch (parse_res)
8816     {
8817       case AARCH64_PARSE_MISSING_ARG:
8818         error ("missing cpu name in -mtune=%qs", str);
8819         break;
8820       case AARCH64_PARSE_INVALID_ARG:
8821         error ("unknown value %qs for -mtune", str);
8822         aarch64_print_hint_for_core (str);
8823         break;
8824       default:
8825         gcc_unreachable ();
8826     }
8827   return false;
8828 }
8829
8830 /* Return the CPU corresponding to the enum CPU.
8831    If it doesn't specify a cpu, return the default.  */
8832
8833 static const struct processor *
8834 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8835 {
8836   if (cpu != aarch64_none)
8837     return &all_cores[cpu];
8838
8839   /* The & 0x3f is to extract the bottom 6 bits that encode the
8840      default cpu as selected by the --with-cpu GCC configure option
8841      in config.gcc.
8842      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8843      flags mechanism should be reworked to make it more sane.  */
8844   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8845 }
8846
8847 /* Return the architecture corresponding to the enum ARCH.
8848    If it doesn't specify a valid architecture, return the default.  */
8849
8850 static const struct processor *
8851 aarch64_get_arch (enum aarch64_arch arch)
8852 {
8853   if (arch != aarch64_no_arch)
8854     return &all_architectures[arch];
8855
8856   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8857
8858   return &all_architectures[cpu->arch];
8859 }
8860
8861 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
8862    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8863    tuning structs.  In particular it must set selected_tune and
8864    aarch64_isa_flags that define the available ISA features and tuning
8865    decisions.  It must also set selected_arch as this will be used to
8866    output the .arch asm tags for each function.  */
8867
8868 static void
8869 aarch64_override_options (void)
8870 {
8871   unsigned long cpu_isa = 0;
8872   unsigned long arch_isa = 0;
8873   aarch64_isa_flags = 0;
8874
8875   bool valid_cpu = true;
8876   bool valid_tune = true;
8877   bool valid_arch = true;
8878
8879   selected_cpu = NULL;
8880   selected_arch = NULL;
8881   selected_tune = NULL;
8882
8883   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8884      If either of -march or -mtune is given, they override their
8885      respective component of -mcpu.  */
8886   if (aarch64_cpu_string)
8887     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8888                                         &cpu_isa);
8889
8890   if (aarch64_arch_string)
8891     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8892                                           &arch_isa);
8893
8894   if (aarch64_tune_string)
8895     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8896
8897   /* If the user did not specify a processor, choose the default
8898      one for them.  This will be the CPU set during configuration using
8899      --with-cpu, otherwise it is "generic".  */
8900   if (!selected_cpu)
8901     {
8902       if (selected_arch)
8903         {
8904           selected_cpu = &all_cores[selected_arch->ident];
8905           aarch64_isa_flags = arch_isa;
8906           explicit_arch = selected_arch->arch;
8907         }
8908       else
8909         {
8910           /* Get default configure-time CPU.  */
8911           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8912           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8913         }
8914
8915       if (selected_tune)
8916         explicit_tune_core = selected_tune->ident;
8917     }
8918   /* If both -mcpu and -march are specified check that they are architecturally
8919      compatible, warn if they're not and prefer the -march ISA flags.  */
8920   else if (selected_arch)
8921     {
8922       if (selected_arch->arch != selected_cpu->arch)
8923         {
8924           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8925                        all_architectures[selected_cpu->arch].name,
8926                        selected_arch->name);
8927         }
8928       aarch64_isa_flags = arch_isa;
8929       explicit_arch = selected_arch->arch;
8930       explicit_tune_core = selected_tune ? selected_tune->ident
8931                                           : selected_cpu->ident;
8932     }
8933   else
8934     {
8935       /* -mcpu but no -march.  */
8936       aarch64_isa_flags = cpu_isa;
8937       explicit_tune_core = selected_tune ? selected_tune->ident
8938                                           : selected_cpu->ident;
8939       gcc_assert (selected_cpu);
8940       selected_arch = &all_architectures[selected_cpu->arch];
8941       explicit_arch = selected_arch->arch;
8942     }
8943
8944   /* Set the arch as well as we will need it when outputing
8945      the .arch directive in assembly.  */
8946   if (!selected_arch)
8947     {
8948       gcc_assert (selected_cpu);
8949       selected_arch = &all_architectures[selected_cpu->arch];
8950     }
8951
8952   if (!selected_tune)
8953     selected_tune = selected_cpu;
8954
8955 #ifndef HAVE_AS_MABI_OPTION
8956   /* The compiler may have been configured with 2.23.* binutils, which does
8957      not have support for ILP32.  */
8958   if (TARGET_ILP32)
8959     error ("Assembler does not support -mabi=ilp32");
8960 #endif
8961
8962   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
8963     sorry ("Return address signing is only supported for -mabi=lp64");
8964
8965   /* Make sure we properly set up the explicit options.  */
8966   if ((aarch64_cpu_string && valid_cpu)
8967        || (aarch64_tune_string && valid_tune))
8968     gcc_assert (explicit_tune_core != aarch64_none);
8969
8970   if ((aarch64_cpu_string && valid_cpu)
8971        || (aarch64_arch_string && valid_arch))
8972     gcc_assert (explicit_arch != aarch64_no_arch);
8973
8974   aarch64_override_options_internal (&global_options);
8975
8976   /* Save these options as the default ones in case we push and pop them later
8977      while processing functions with potential target attributes.  */
8978   target_option_default_node = target_option_current_node
8979       = build_target_option_node (&global_options);
8980 }
8981
8982 /* Implement targetm.override_options_after_change.  */
8983
8984 static void
8985 aarch64_override_options_after_change (void)
8986 {
8987   aarch64_override_options_after_change_1 (&global_options);
8988 }
8989
8990 static struct machine_function *
8991 aarch64_init_machine_status (void)
8992 {
8993   struct machine_function *machine;
8994   machine = ggc_cleared_alloc<machine_function> ();
8995   return machine;
8996 }
8997
8998 void
8999 aarch64_init_expanders (void)
9000 {
9001   init_machine_status = aarch64_init_machine_status;
9002 }
9003
9004 /* A checking mechanism for the implementation of the various code models.  */
9005 static void
9006 initialize_aarch64_code_model (struct gcc_options *opts)
9007 {
9008    if (opts->x_flag_pic)
9009      {
9010        switch (opts->x_aarch64_cmodel_var)
9011          {
9012          case AARCH64_CMODEL_TINY:
9013            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9014            break;
9015          case AARCH64_CMODEL_SMALL:
9016 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9017            aarch64_cmodel = (flag_pic == 2
9018                              ? AARCH64_CMODEL_SMALL_PIC
9019                              : AARCH64_CMODEL_SMALL_SPIC);
9020 #else
9021            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9022 #endif
9023            break;
9024          case AARCH64_CMODEL_LARGE:
9025            sorry ("code model %qs with -f%s", "large",
9026                   opts->x_flag_pic > 1 ? "PIC" : "pic");
9027            break;
9028          default:
9029            gcc_unreachable ();
9030          }
9031      }
9032    else
9033      aarch64_cmodel = opts->x_aarch64_cmodel_var;
9034 }
9035
9036 /* Implement TARGET_OPTION_SAVE.  */
9037
9038 static void
9039 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9040 {
9041   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9042 }
9043
9044 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
9045    using the information saved in PTR.  */
9046
9047 static void
9048 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9049 {
9050   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9051   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9052   opts->x_explicit_arch = ptr->x_explicit_arch;
9053   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9054   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9055
9056   aarch64_override_options_internal (opts);
9057 }
9058
9059 /* Implement TARGET_OPTION_PRINT.  */
9060
9061 static void
9062 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9063 {
9064   const struct processor *cpu
9065     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9066   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9067   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9068   std::string extension
9069     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9070
9071   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9072   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9073            arch->name, extension.c_str ());
9074 }
9075
9076 static GTY(()) tree aarch64_previous_fndecl;
9077
9078 void
9079 aarch64_reset_previous_fndecl (void)
9080 {
9081   aarch64_previous_fndecl = NULL;
9082 }
9083
9084 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9085    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9086    make sure optab availability predicates are recomputed when necessary.  */
9087
9088 void
9089 aarch64_save_restore_target_globals (tree new_tree)
9090 {
9091   if (TREE_TARGET_GLOBALS (new_tree))
9092     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9093   else if (new_tree == target_option_default_node)
9094     restore_target_globals (&default_target_globals);
9095   else
9096     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9097 }
9098
9099 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
9100    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9101    of the function, if such exists.  This function may be called multiple
9102    times on a single function so use aarch64_previous_fndecl to avoid
9103    setting up identical state.  */
9104
9105 static void
9106 aarch64_set_current_function (tree fndecl)
9107 {
9108   if (!fndecl || fndecl == aarch64_previous_fndecl)
9109     return;
9110
9111   tree old_tree = (aarch64_previous_fndecl
9112                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9113                    : NULL_TREE);
9114
9115   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9116
9117   /* If current function has no attributes but the previous one did,
9118      use the default node.  */
9119   if (!new_tree && old_tree)
9120     new_tree = target_option_default_node;
9121
9122   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
9123      the default have been handled by aarch64_save_restore_target_globals from
9124      aarch64_pragma_target_parse.  */
9125   if (old_tree == new_tree)
9126     return;
9127
9128   aarch64_previous_fndecl = fndecl;
9129
9130   /* First set the target options.  */
9131   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9132
9133   aarch64_save_restore_target_globals (new_tree);
9134 }
9135
/* The different ways a target attribute can be handled.  Most of them
   reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  /* Attribute should set a bit in target_flags.  */
  aarch64_attr_mask,
  /* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_bool,
  /* Attribute sets an enum variable.  */
  aarch64_attr_enum,
  /* Attribute requires a custom handling function.  */
  aarch64_attr_custom
};
9146
9147 /* All the information needed to handle a target attribute.
9148    NAME is the name of the attribute.
9149    ATTR_TYPE specifies the type of behavior of the attribute as described
9150    in the definition of enum aarch64_attr_opt_type.
9151    ALLOW_NEG is true if the attribute supports a "no-" form.
9152    HANDLER is the function that takes the attribute string and whether
9153    it is a pragma or attribute and handles the option.  It is needed only
9154    when the ATTR_TYPE is aarch64_attr_custom.
9155    OPT_NUM is the enum specifying the option that the attribute modifies.
9156    This is needed for attributes that mirror the behavior of a command-line
9157    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9158    aarch64_attr_enum.  */
9159
9160 struct aarch64_attribute_info
9161 {
9162   const char *name;
9163   enum aarch64_attr_opt_type attr_type;
9164   bool allow_neg;
9165   bool (*handler) (const char *, const char *);
9166   enum opt_code opt_num;
9167 };
9168
9169 /* Handle the ARCH_STR argument to the arch= target attribute.
9170    PRAGMA_OR_ATTR is used in potential error messages.  */
9171
9172 static bool
9173 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9174 {
9175   const struct processor *tmp_arch = NULL;
9176   enum aarch64_parse_opt_result parse_res
9177     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9178
9179   if (parse_res == AARCH64_PARSE_OK)
9180     {
9181       gcc_assert (tmp_arch);
9182       selected_arch = tmp_arch;
9183       explicit_arch = selected_arch->arch;
9184       return true;
9185     }
9186
9187   switch (parse_res)
9188     {
9189       case AARCH64_PARSE_MISSING_ARG:
9190         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9191         break;
9192       case AARCH64_PARSE_INVALID_ARG:
9193         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9194         aarch64_print_hint_for_arch (str);
9195         break;
9196       case AARCH64_PARSE_INVALID_FEATURE:
9197         error ("invalid feature modifier %qs for 'arch' target %s",
9198                str, pragma_or_attr);
9199         break;
9200       default:
9201         gcc_unreachable ();
9202     }
9203
9204   return false;
9205 }
9206
9207 /* Handle the argument CPU_STR to the cpu= target attribute.
9208    PRAGMA_OR_ATTR is used in potential error messages.  */
9209
9210 static bool
9211 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9212 {
9213   const struct processor *tmp_cpu = NULL;
9214   enum aarch64_parse_opt_result parse_res
9215     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9216
9217   if (parse_res == AARCH64_PARSE_OK)
9218     {
9219       gcc_assert (tmp_cpu);
9220       selected_tune = tmp_cpu;
9221       explicit_tune_core = selected_tune->ident;
9222
9223       selected_arch = &all_architectures[tmp_cpu->arch];
9224       explicit_arch = selected_arch->arch;
9225       return true;
9226     }
9227
9228   switch (parse_res)
9229     {
9230       case AARCH64_PARSE_MISSING_ARG:
9231         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9232         break;
9233       case AARCH64_PARSE_INVALID_ARG:
9234         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9235         aarch64_print_hint_for_core (str);
9236         break;
9237       case AARCH64_PARSE_INVALID_FEATURE:
9238         error ("invalid feature modifier %qs for 'cpu' target %s",
9239                str, pragma_or_attr);
9240         break;
9241       default:
9242         gcc_unreachable ();
9243     }
9244
9245   return false;
9246 }
9247
9248 /* Handle the argument STR to the tune= target attribute.
9249    PRAGMA_OR_ATTR is used in potential error messages.  */
9250
9251 static bool
9252 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9253 {
9254   const struct processor *tmp_tune = NULL;
9255   enum aarch64_parse_opt_result parse_res
9256     = aarch64_parse_tune (str, &tmp_tune);
9257
9258   if (parse_res == AARCH64_PARSE_OK)
9259     {
9260       gcc_assert (tmp_tune);
9261       selected_tune = tmp_tune;
9262       explicit_tune_core = selected_tune->ident;
9263       return true;
9264     }
9265
9266   switch (parse_res)
9267     {
9268       case AARCH64_PARSE_INVALID_ARG:
9269         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9270         aarch64_print_hint_for_core (str);
9271         break;
9272       default:
9273         gcc_unreachable ();
9274     }
9275
9276   return false;
9277 }
9278
9279 /* Parse an architecture extensions target attribute string specified in STR.
9280    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
9281    if successful.  Update aarch64_isa_flags to reflect the ISA features
9282    modified.
9283    PRAGMA_OR_ATTR is used in potential error messages.  */
9284
9285 static bool
9286 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9287 {
9288   enum aarch64_parse_opt_result parse_res;
9289   unsigned long isa_flags = aarch64_isa_flags;
9290
9291   /* We allow "+nothing" in the beginning to clear out all architectural
9292      features if the user wants to handpick specific features.  */
9293   if (strncmp ("+nothing", str, 8) == 0)
9294     {
9295       isa_flags = 0;
9296       str += 8;
9297     }
9298
9299   parse_res = aarch64_parse_extension (str, &isa_flags);
9300
9301   if (parse_res == AARCH64_PARSE_OK)
9302     {
9303       aarch64_isa_flags = isa_flags;
9304       return true;
9305     }
9306
9307   switch (parse_res)
9308     {
9309       case AARCH64_PARSE_MISSING_ARG:
9310         error ("missing feature modifier in target %s %qs",
9311                pragma_or_attr, str);
9312         break;
9313
9314       case AARCH64_PARSE_INVALID_FEATURE:
9315         error ("invalid feature modifier in target %s %qs",
9316                pragma_or_attr, str);
9317         break;
9318
9319       default:
9320         gcc_unreachable ();
9321     }
9322
9323  return false;
9324 }
9325
9326 /* The target attributes that we support.  On top of these we also support just
9327    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
9328    handled explicitly in aarch64_process_one_target_attr.  */
9329
9330 static const struct aarch64_attribute_info aarch64_attributes[] =
9331 {
9332   { "general-regs-only", aarch64_attr_mask, false, NULL,
9333      OPT_mgeneral_regs_only },
9334   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9335      OPT_mfix_cortex_a53_835769 },
9336   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9337      OPT_mfix_cortex_a53_843419 },
9338   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9339   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9340   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9341      OPT_momit_leaf_frame_pointer },
9342   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9343   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9344      OPT_march_ },
9345   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9346   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9347      OPT_mtune_ },
9348   { "sign-return-address", aarch64_attr_enum, false, NULL,
9349      OPT_msign_return_address_ },
9350   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9351 };
9352
9353 /* Parse ARG_STR which contains the definition of one target attribute.
9354    Show appropriate errors if any or return true if the attribute is valid.
9355    PRAGMA_OR_ATTR holds the string to use in error messages about whether
9356    we're processing a target attribute or pragma.  */
9357
9358 static bool
9359 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9360 {
9361   bool invert = false;
9362
9363   size_t len = strlen (arg_str);
9364
9365   if (len == 0)
9366     {
9367       error ("malformed target %s", pragma_or_attr);
9368       return false;
9369     }
9370
9371   char *str_to_check = (char *) alloca (len + 1);
9372   strcpy (str_to_check, arg_str);
9373
9374   /* Skip leading whitespace.  */
9375   while (*str_to_check == ' ' || *str_to_check == '\t')
9376     str_to_check++;
9377
9378   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9379      It is easier to detect and handle it explicitly here rather than going
9380      through the machinery for the rest of the target attributes in this
9381      function.  */
9382   if (*str_to_check == '+')
9383     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9384
9385   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9386     {
9387       invert = true;
9388       str_to_check += 3;
9389     }
9390   char *arg = strchr (str_to_check, '=');
9391
9392   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9393      and point ARG to "foo".  */
9394   if (arg)
9395     {
9396       *arg = '\0';
9397       arg++;
9398     }
9399   const struct aarch64_attribute_info *p_attr;
9400   bool found = false;
9401   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9402     {
9403       /* If the names don't match up, or the user has given an argument
9404          to an attribute that doesn't accept one, or didn't give an argument
9405          to an attribute that expects one, fail to match.  */
9406       if (strcmp (str_to_check, p_attr->name) != 0)
9407         continue;
9408
9409       found = true;
9410       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9411                               || p_attr->attr_type == aarch64_attr_enum;
9412
9413       if (attr_need_arg_p ^ (arg != NULL))
9414         {
9415           error ("target %s %qs does not accept an argument",
9416                   pragma_or_attr, str_to_check);
9417           return false;
9418         }
9419
9420       /* If the name matches but the attribute does not allow "no-" versions
9421          then we can't match.  */
9422       if (invert && !p_attr->allow_neg)
9423         {
9424           error ("target %s %qs does not allow a negated form",
9425                   pragma_or_attr, str_to_check);
9426           return false;
9427         }
9428
9429       switch (p_attr->attr_type)
9430         {
9431         /* Has a custom handler registered.
9432            For example, cpu=, arch=, tune=.  */
9433           case aarch64_attr_custom:
9434             gcc_assert (p_attr->handler);
9435             if (!p_attr->handler (arg, pragma_or_attr))
9436               return false;
9437             break;
9438
9439           /* Either set or unset a boolean option.  */
9440           case aarch64_attr_bool:
9441             {
9442               struct cl_decoded_option decoded;
9443
9444               generate_option (p_attr->opt_num, NULL, !invert,
9445                                CL_TARGET, &decoded);
9446               aarch64_handle_option (&global_options, &global_options_set,
9447                                       &decoded, input_location);
9448               break;
9449             }
9450           /* Set or unset a bit in the target_flags.  aarch64_handle_option
9451              should know what mask to apply given the option number.  */
9452           case aarch64_attr_mask:
9453             {
9454               struct cl_decoded_option decoded;
9455               /* We only need to specify the option number.
9456                  aarch64_handle_option will know which mask to apply.  */
9457               decoded.opt_index = p_attr->opt_num;
9458               decoded.value = !invert;
9459               aarch64_handle_option (&global_options, &global_options_set,
9460                                       &decoded, input_location);
9461               break;
9462             }
9463           /* Use the option setting machinery to set an option to an enum.  */
9464           case aarch64_attr_enum:
9465             {
9466               gcc_assert (arg);
9467               bool valid;
9468               int value;
9469               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9470                                               &value, CL_TARGET);
9471               if (valid)
9472                 {
9473                   set_option (&global_options, NULL, p_attr->opt_num, value,
9474                               NULL, DK_UNSPECIFIED, input_location,
9475                               global_dc);
9476                 }
9477               else
9478                 {
9479                   error ("target %s %s=%s is not valid",
9480                          pragma_or_attr, str_to_check, arg);
9481                 }
9482               break;
9483             }
9484           default:
9485             gcc_unreachable ();
9486         }
9487     }
9488
9489   /* If we reached here we either have found an attribute and validated
9490      it or didn't match any.  If we matched an attribute but its arguments
9491      were malformed we will have returned false already.  */
9492   return found;
9493 }
9494
/* Return the number of characters equal to C in the NUL-terminated
   string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int count = 0;

  for (const char *p = str; *p != '\0'; p++)
    if (*p == c)
      count++;

  return count;
}
9512
9513 /* Parse the tree in ARGS that contains the target attribute information
9514    and update the global target options space.  PRAGMA_OR_ATTR is a string
9515    to be used in error messages, specifying whether this is processing
9516    a target attribute or a target pragma.  */
9517
9518 bool
9519 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9520 {
9521   if (TREE_CODE (args) == TREE_LIST)
9522     {
9523       do
9524         {
9525           tree head = TREE_VALUE (args);
9526           if (head)
9527             {
9528               if (!aarch64_process_target_attr (head, pragma_or_attr))
9529                 return false;
9530             }
9531           args = TREE_CHAIN (args);
9532         } while (args);
9533
9534       return true;
9535     }
9536   /* We expect to find a string to parse.  */
9537   gcc_assert (TREE_CODE (args) == STRING_CST);
9538
9539   size_t len = strlen (TREE_STRING_POINTER (args));
9540   char *str_to_check = (char *) alloca (len + 1);
9541   strcpy (str_to_check, TREE_STRING_POINTER (args));
9542
9543   if (len == 0)
9544     {
9545       error ("malformed target %s value", pragma_or_attr);
9546       return false;
9547     }
9548
9549   /* Used to catch empty spaces between commas i.e.
9550      attribute ((target ("attr1,,attr2"))).  */
9551   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9552
9553   /* Handle multiple target attributes separated by ','.  */
9554   char *token = strtok (str_to_check, ",");
9555
9556   unsigned int num_attrs = 0;
9557   while (token)
9558     {
9559       num_attrs++;
9560       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9561         {
9562           error ("target %s %qs is invalid", pragma_or_attr, token);
9563           return false;
9564         }
9565
9566       token = strtok (NULL, ",");
9567     }
9568
9569   if (num_attrs != num_commas + 1)
9570     {
9571       error ("malformed target %s list %qs",
9572               pragma_or_attr, TREE_STRING_POINTER (args));
9573       return false;
9574     }
9575
9576   return true;
9577 }
9578
9579 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9580    process attribute ((target ("..."))).  */
9581
9582 static bool
9583 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9584 {
9585   struct cl_target_option cur_target;
9586   bool ret;
9587   tree old_optimize;
9588   tree new_target, new_optimize;
9589   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9590
9591   /* If what we're processing is the current pragma string then the
9592      target option node is already stored in target_option_current_node
9593      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
9594      having to re-parse the string.  This is especially useful to keep
9595      arm_neon.h compile times down since that header contains a lot
9596      of intrinsics enclosed in pragmas.  */
9597   if (!existing_target && args == current_target_pragma)
9598     {
9599       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9600       return true;
9601     }
9602   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9603
9604   old_optimize = build_optimization_node (&global_options);
9605   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9606
9607   /* If the function changed the optimization levels as well as setting
9608      target options, start with the optimizations specified.  */
9609   if (func_optimize && func_optimize != old_optimize)
9610     cl_optimization_restore (&global_options,
9611                              TREE_OPTIMIZATION (func_optimize));
9612
9613   /* Save the current target options to restore at the end.  */
9614   cl_target_option_save (&cur_target, &global_options);
9615
9616   /* If fndecl already has some target attributes applied to it, unpack
9617      them so that we add this attribute on top of them, rather than
9618      overwriting them.  */
9619   if (existing_target)
9620     {
9621       struct cl_target_option *existing_options
9622         = TREE_TARGET_OPTION (existing_target);
9623
9624       if (existing_options)
9625         cl_target_option_restore (&global_options, existing_options);
9626     }
9627   else
9628     cl_target_option_restore (&global_options,
9629                         TREE_TARGET_OPTION (target_option_current_node));
9630
9631
9632   ret = aarch64_process_target_attr (args, "attribute");
9633
9634   /* Set up any additional state.  */
9635   if (ret)
9636     {
9637       aarch64_override_options_internal (&global_options);
9638       /* Initialize SIMD builtins if we haven't already.
9639          Set current_target_pragma to NULL for the duration so that
9640          the builtin initialization code doesn't try to tag the functions
9641          being built with the attributes specified by any current pragma, thus
9642          going into an infinite recursion.  */
9643       if (TARGET_SIMD)
9644         {
9645           tree saved_current_target_pragma = current_target_pragma;
9646           current_target_pragma = NULL;
9647           aarch64_init_simd_builtins ();
9648           current_target_pragma = saved_current_target_pragma;
9649         }
9650       new_target = build_target_option_node (&global_options);
9651     }
9652   else
9653     new_target = NULL;
9654
9655   new_optimize = build_optimization_node (&global_options);
9656
9657   if (fndecl && ret)
9658     {
9659       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9660
9661       if (old_optimize != new_optimize)
9662         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9663     }
9664
9665   cl_target_option_restore (&global_options, &cur_target);
9666
9667   if (old_optimize != new_optimize)
9668     cl_optimization_restore (&global_options,
9669                              TREE_OPTIMIZATION (old_optimize));
9670   return ret;
9671 }
9672
/* Helper for aarch64_can_inline_p.  CALLER and CALLEE are the values of
   a tri-bool option (yes, no, don't care), DONT_CARE is the value that
   encodes "don't care" and DEF is the option's default value.  Return
   true if the option does not prevent inlining, that is if either side
   doesn't care, if both sides agree, or if the callee uses the default.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
                                     int dont_care, int def)
{
  /* An option that either side doesn't care about never blocks
     inlining.  */
  if (callee == dont_care || caller == dont_care)
    return true;

  /* Both sides explicitly agree.  */
  if (callee == caller)
    return true;

  /* They disagree: only acceptable if the callee is using the default.  */
  return callee == def;
}
9693
9694 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
9695    to inline CALLEE into CALLER based on target-specific info.
9696    Make sure that the caller and callee have compatible architectural
9697    features.  Then go through the other possible target attributes
9698    and see if they can block inlining.  Try not to reject always_inline
9699    callees unless they are incompatible architecturally.  */
9700
9701 static bool
9702 aarch64_can_inline_p (tree caller, tree callee)
9703 {
9704   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9705   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9706
9707   /* If callee has no option attributes, then it is ok to inline.  */
9708   if (!callee_tree)
9709     return true;
9710
9711   struct cl_target_option *caller_opts
9712         = TREE_TARGET_OPTION (caller_tree ? caller_tree
9713                                            : target_option_default_node);
9714
9715   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9716
9717
9718   /* Callee's ISA flags should be a subset of the caller's.  */
9719   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9720        != callee_opts->x_aarch64_isa_flags)
9721     return false;
9722
9723   /* Allow non-strict aligned functions inlining into strict
9724      aligned ones.  */
9725   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9726        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9727       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9728            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9729     return false;
9730
9731   bool always_inline = lookup_attribute ("always_inline",
9732                                           DECL_ATTRIBUTES (callee));
9733
9734   /* If the architectural features match up and the callee is always_inline
9735      then the other attributes don't matter.  */
9736   if (always_inline)
9737     return true;
9738
9739   if (caller_opts->x_aarch64_cmodel_var
9740       != callee_opts->x_aarch64_cmodel_var)
9741     return false;
9742
9743   if (caller_opts->x_aarch64_tls_dialect
9744       != callee_opts->x_aarch64_tls_dialect)
9745     return false;
9746
9747   /* Honour explicit requests to workaround errata.  */
9748   if (!aarch64_tribools_ok_for_inlining_p (
9749           caller_opts->x_aarch64_fix_a53_err835769,
9750           callee_opts->x_aarch64_fix_a53_err835769,
9751           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9752     return false;
9753
9754   if (!aarch64_tribools_ok_for_inlining_p (
9755           caller_opts->x_aarch64_fix_a53_err843419,
9756           callee_opts->x_aarch64_fix_a53_err843419,
9757           2, TARGET_FIX_ERR_A53_843419))
9758     return false;
9759
9760   /* If the user explicitly specified -momit-leaf-frame-pointer for the
9761      caller and calle and they don't match up, reject inlining.  */
9762   if (!aarch64_tribools_ok_for_inlining_p (
9763           caller_opts->x_flag_omit_leaf_frame_pointer,
9764           callee_opts->x_flag_omit_leaf_frame_pointer,
9765           2, 1))
9766     return false;
9767
9768   /* If the callee has specific tuning overrides, respect them.  */
9769   if (callee_opts->x_aarch64_override_tune_string != NULL
9770       && caller_opts->x_aarch64_override_tune_string == NULL)
9771     return false;
9772
9773   /* If the user specified tuning override strings for the
9774      caller and callee and they don't match up, reject inlining.
9775      We just do a string compare here, we don't analyze the meaning
9776      of the string, as it would be too costly for little gain.  */
9777   if (callee_opts->x_aarch64_override_tune_string
9778       && caller_opts->x_aarch64_override_tune_string
9779       && (strcmp (callee_opts->x_aarch64_override_tune_string,
9780                   caller_opts->x_aarch64_override_tune_string) != 0))
9781     return false;
9782
9783   return true;
9784 }
9785
9786 /* Return true if SYMBOL_REF X binds locally.  */
9787
9788 static bool
9789 aarch64_symbol_binds_local_p (const_rtx x)
9790 {
9791   return (SYMBOL_REF_DECL (x)
9792           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9793           : SYMBOL_REF_LOCAL_P (x));
9794 }
9795
9796 /* Return true if SYMBOL_REF X is thread local */
9797 static bool
9798 aarch64_tls_symbol_p (rtx x)
9799 {
9800   if (! TARGET_HAVE_TLS)
9801     return false;
9802
9803   if (GET_CODE (x) != SYMBOL_REF)
9804     return false;
9805
9806   return SYMBOL_REF_TLS_MODEL (x) != 0;
9807 }
9808
9809 /* Classify a TLS symbol into one of the TLS kinds.  */
9810 enum aarch64_symbol_type
9811 aarch64_classify_tls_symbol (rtx x)
9812 {
9813   enum tls_model tls_kind = tls_symbolic_operand_type (x);
9814
9815   switch (tls_kind)
9816     {
9817     case TLS_MODEL_GLOBAL_DYNAMIC:
9818     case TLS_MODEL_LOCAL_DYNAMIC:
9819       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9820
9821     case TLS_MODEL_INITIAL_EXEC:
9822       switch (aarch64_cmodel)
9823         {
9824         case AARCH64_CMODEL_TINY:
9825         case AARCH64_CMODEL_TINY_PIC:
9826           return SYMBOL_TINY_TLSIE;
9827         default:
9828           return SYMBOL_SMALL_TLSIE;
9829         }
9830
9831     case TLS_MODEL_LOCAL_EXEC:
9832       if (aarch64_tls_size == 12)
9833         return SYMBOL_TLSLE12;
9834       else if (aarch64_tls_size == 24)
9835         return SYMBOL_TLSLE24;
9836       else if (aarch64_tls_size == 32)
9837         return SYMBOL_TLSLE32;
9838       else if (aarch64_tls_size == 48)
9839         return SYMBOL_TLSLE48;
9840       else
9841         gcc_unreachable ();
9842
9843     case TLS_MODEL_EMULATED:
9844     case TLS_MODEL_NONE:
9845       return SYMBOL_FORCE_TO_MEM;
9846
9847     default:
9848       gcc_unreachable ();
9849     }
9850 }
9851
9852 /* Return the method that should be used to access SYMBOL_REF or
9853    LABEL_REF X.  */
9854
9855 enum aarch64_symbol_type
9856 aarch64_classify_symbol (rtx x, rtx offset)
9857 {
9858   if (GET_CODE (x) == LABEL_REF)
9859     {
9860       switch (aarch64_cmodel)
9861         {
9862         case AARCH64_CMODEL_LARGE:
9863           return SYMBOL_FORCE_TO_MEM;
9864
9865         case AARCH64_CMODEL_TINY_PIC:
9866         case AARCH64_CMODEL_TINY:
9867           return SYMBOL_TINY_ABSOLUTE;
9868
9869         case AARCH64_CMODEL_SMALL_SPIC:
9870         case AARCH64_CMODEL_SMALL_PIC:
9871         case AARCH64_CMODEL_SMALL:
9872           return SYMBOL_SMALL_ABSOLUTE;
9873
9874         default:
9875           gcc_unreachable ();
9876         }
9877     }
9878
9879   if (GET_CODE (x) == SYMBOL_REF)
9880     {
9881       if (aarch64_tls_symbol_p (x))
9882         return aarch64_classify_tls_symbol (x);
9883
9884       switch (aarch64_cmodel)
9885         {
9886         case AARCH64_CMODEL_TINY:
9887           /* When we retrieve symbol + offset address, we have to make sure
9888              the offset does not cause overflow of the final address.  But
9889              we have no way of knowing the address of symbol at compile time
9890              so we can't accurately say if the distance between the PC and
9891              symbol + offset is outside the addressible range of +/-1M in the
9892              TINY code model.  So we rely on images not being greater than
9893              1M and cap the offset at 1M and anything beyond 1M will have to
9894              be loaded using an alternative mechanism.  Furthermore if the
9895              symbol is a weak reference to something that isn't known to
9896              resolve to a symbol in this module, then force to memory.  */
9897           if ((SYMBOL_REF_WEAK (x)
9898                && !aarch64_symbol_binds_local_p (x))
9899               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9900             return SYMBOL_FORCE_TO_MEM;
9901           return SYMBOL_TINY_ABSOLUTE;
9902
9903         case AARCH64_CMODEL_SMALL:
9904           /* Same reasoning as the tiny code model, but the offset cap here is
9905              4G.  */
9906           if ((SYMBOL_REF_WEAK (x)
9907                && !aarch64_symbol_binds_local_p (x))
9908               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9909                             HOST_WIDE_INT_C (4294967264)))
9910             return SYMBOL_FORCE_TO_MEM;
9911           return SYMBOL_SMALL_ABSOLUTE;
9912
9913         case AARCH64_CMODEL_TINY_PIC:
9914           if (!aarch64_symbol_binds_local_p (x))
9915             return SYMBOL_TINY_GOT;
9916           return SYMBOL_TINY_ABSOLUTE;
9917
9918         case AARCH64_CMODEL_SMALL_SPIC:
9919         case AARCH64_CMODEL_SMALL_PIC:
9920           if (!aarch64_symbol_binds_local_p (x))
9921             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9922                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9923           return SYMBOL_SMALL_ABSOLUTE;
9924
9925         case AARCH64_CMODEL_LARGE:
9926           /* This is alright even in PIC code as the constant
9927              pool reference is always PC relative and within
9928              the same translation unit.  */
9929           if (CONSTANT_POOL_ADDRESS_P (x))
9930             return SYMBOL_SMALL_ABSOLUTE;
9931           else
9932             return SYMBOL_FORCE_TO_MEM;
9933
9934         default:
9935           gcc_unreachable ();
9936         }
9937     }
9938
9939   /* By default push everything into the constant pool.  */
9940   return SYMBOL_FORCE_TO_MEM;
9941 }
9942
9943 bool
9944 aarch64_constant_address_p (rtx x)
9945 {
9946   return (CONSTANT_P (x) && memory_address_p (DImode, x));
9947 }
9948
9949 bool
9950 aarch64_legitimate_pic_operand_p (rtx x)
9951 {
9952   if (GET_CODE (x) == SYMBOL_REF
9953       || (GET_CODE (x) == CONST
9954           && GET_CODE (XEXP (x, 0)) == PLUS
9955           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9956      return false;
9957
9958   return true;
9959 }
9960
9961 /* Return true if X holds either a quarter-precision or
9962      floating-point +0.0 constant.  */
9963 static bool
9964 aarch64_valid_floating_const (machine_mode mode, rtx x)
9965 {
9966   if (!CONST_DOUBLE_P (x))
9967     return false;
9968
9969   if (aarch64_float_const_zero_rtx_p (x))
9970     return true;
9971
9972   /* We only handle moving 0.0 to a TFmode register.  */
9973   if (!(mode == SFmode || mode == DFmode))
9974     return false;
9975
9976   return aarch64_float_const_representable_p (x);
9977 }
9978
9979 static bool
9980 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9981 {
9982   /* Do not allow vector struct mode constants.  We could support
9983      0 and -1 easily, but they need support in aarch64-simd.md.  */
9984   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9985     return false;
9986
9987   /* This could probably go away because
9988      we now decompose CONST_INTs according to expand_mov_immediate.  */
9989   if ((GET_CODE (x) == CONST_VECTOR
9990        && aarch64_simd_valid_immediate (x, mode, false, NULL))
9991       || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9992         return !targetm.cannot_force_const_mem (mode, x);
9993
9994   if (GET_CODE (x) == HIGH
9995       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9996     return true;
9997
9998   return aarch64_constant_address_p (x);
9999 }
10000
10001 rtx
10002 aarch64_load_tp (rtx target)
10003 {
10004   if (!target
10005       || GET_MODE (target) != Pmode
10006       || !register_operand (target, Pmode))
10007     target = gen_reg_rtx (Pmode);
10008
10009   /* Can return in any reg.  */
10010   emit_insn (gen_aarch64_load_tp_hard (target));
10011   return target;
10012 }
10013
10014 /* On AAPCS systems, this is the "struct __va_list".  */
10015 static GTY(()) tree va_list_type;
10016
10017 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10018    Return the type to use as __builtin_va_list.
10019
10020    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10021
10022    struct __va_list
10023    {
10024      void *__stack;
10025      void *__gr_top;
10026      void *__vr_top;
10027      int   __gr_offs;
10028      int   __vr_offs;
10029    };  */
10030
10031 static tree
10032 aarch64_build_builtin_va_list (void)
10033 {
10034   tree va_list_name;
10035   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10036
10037   /* Create the type.  */
10038   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10039   /* Give it the required name.  */
10040   va_list_name = build_decl (BUILTINS_LOCATION,
10041                              TYPE_DECL,
10042                              get_identifier ("__va_list"),
10043                              va_list_type);
10044   DECL_ARTIFICIAL (va_list_name) = 1;
10045   TYPE_NAME (va_list_type) = va_list_name;
10046   TYPE_STUB_DECL (va_list_type) = va_list_name;
10047
10048   /* Create the fields.  */
10049   f_stack = build_decl (BUILTINS_LOCATION,
10050                         FIELD_DECL, get_identifier ("__stack"),
10051                         ptr_type_node);
10052   f_grtop = build_decl (BUILTINS_LOCATION,
10053                         FIELD_DECL, get_identifier ("__gr_top"),
10054                         ptr_type_node);
10055   f_vrtop = build_decl (BUILTINS_LOCATION,
10056                         FIELD_DECL, get_identifier ("__vr_top"),
10057                         ptr_type_node);
10058   f_groff = build_decl (BUILTINS_LOCATION,
10059                         FIELD_DECL, get_identifier ("__gr_offs"),
10060                         integer_type_node);
10061   f_vroff = build_decl (BUILTINS_LOCATION,
10062                         FIELD_DECL, get_identifier ("__vr_offs"),
10063                         integer_type_node);
10064
10065   /* Tell tree-stdarg pass about our internal offset fields.
10066      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10067      purpose to identify whether the code is updating va_list internal
10068      offset fields through irregular way.  */
10069   va_list_gpr_counter_field = f_groff;
10070   va_list_fpr_counter_field = f_vroff;
10071
10072   DECL_ARTIFICIAL (f_stack) = 1;
10073   DECL_ARTIFICIAL (f_grtop) = 1;
10074   DECL_ARTIFICIAL (f_vrtop) = 1;
10075   DECL_ARTIFICIAL (f_groff) = 1;
10076   DECL_ARTIFICIAL (f_vroff) = 1;
10077
10078   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10079   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10080   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10081   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10082   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10083
10084   TYPE_FIELDS (va_list_type) = f_stack;
10085   DECL_CHAIN (f_stack) = f_grtop;
10086   DECL_CHAIN (f_grtop) = f_vrtop;
10087   DECL_CHAIN (f_vrtop) = f_groff;
10088   DECL_CHAIN (f_groff) = f_vroff;
10089
10090   /* Compute its layout.  */
10091   layout_type (va_list_type);
10092
10093   return va_list_type;
10094 }
10095
10096 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
10097 static void
10098 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10099 {
10100   const CUMULATIVE_ARGS *cum;
10101   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10102   tree stack, grtop, vrtop, groff, vroff;
10103   tree t;
10104   int gr_save_area_size = cfun->va_list_gpr_size;
10105   int vr_save_area_size = cfun->va_list_fpr_size;
10106   int vr_offset;
10107
10108   cum = &crtl->args.info;
10109   if (cfun->va_list_gpr_size)
10110     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10111                              cfun->va_list_gpr_size);
10112   if (cfun->va_list_fpr_size)
10113     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10114                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
10115
10116   if (!TARGET_FLOAT)
10117     {
10118       gcc_assert (cum->aapcs_nvrn == 0);
10119       vr_save_area_size = 0;
10120     }
10121
10122   f_stack = TYPE_FIELDS (va_list_type_node);
10123   f_grtop = DECL_CHAIN (f_stack);
10124   f_vrtop = DECL_CHAIN (f_grtop);
10125   f_groff = DECL_CHAIN (f_vrtop);
10126   f_vroff = DECL_CHAIN (f_groff);
10127
10128   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10129                   NULL_TREE);
10130   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10131                   NULL_TREE);
10132   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10133                   NULL_TREE);
10134   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10135                   NULL_TREE);
10136   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10137                   NULL_TREE);
10138
10139   /* Emit code to initialize STACK, which points to the next varargs stack
10140      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
10141      by named arguments.  STACK is 8-byte aligned.  */
10142   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10143   if (cum->aapcs_stack_size > 0)
10144     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10145   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10146   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10147
10148   /* Emit code to initialize GRTOP, the top of the GR save area.
10149      virtual_incoming_args_rtx should have been 16 byte aligned.  */
10150   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10151   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10152   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10153
10154   /* Emit code to initialize VRTOP, the top of the VR save area.
10155      This address is gr_save_area_bytes below GRTOP, rounded
10156      down to the next 16-byte boundary.  */
10157   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10158   vr_offset = ROUND_UP (gr_save_area_size,
10159                         STACK_BOUNDARY / BITS_PER_UNIT);
10160
10161   if (vr_offset)
10162     t = fold_build_pointer_plus_hwi (t, -vr_offset);
10163   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10164   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10165
10166   /* Emit code to initialize GROFF, the offset from GRTOP of the
10167      next GPR argument.  */
10168   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10169               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10170   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10171
10172   /* Likewise emit code to initialize VROFF, the offset from FTOP
10173      of the next VR argument.  */
10174   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10175               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10176   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10177 }
10178
10179 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
10180
10181 static tree
10182 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10183                               gimple_seq *post_p ATTRIBUTE_UNUSED)
10184 {
10185   tree addr;
10186   bool indirect_p;
10187   bool is_ha;           /* is HFA or HVA.  */
10188   bool dw_align;        /* double-word align.  */
10189   machine_mode ag_mode = VOIDmode;
10190   int nregs;
10191   machine_mode mode;
10192
10193   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10194   tree stack, f_top, f_off, off, arg, roundup, on_stack;
10195   HOST_WIDE_INT size, rsize, adjust, align;
10196   tree t, u, cond1, cond2;
10197
10198   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10199   if (indirect_p)
10200     type = build_pointer_type (type);
10201
10202   mode = TYPE_MODE (type);
10203
10204   f_stack = TYPE_FIELDS (va_list_type_node);
10205   f_grtop = DECL_CHAIN (f_stack);
10206   f_vrtop = DECL_CHAIN (f_grtop);
10207   f_groff = DECL_CHAIN (f_vrtop);
10208   f_vroff = DECL_CHAIN (f_groff);
10209
10210   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10211                   f_stack, NULL_TREE);
10212   size = int_size_in_bytes (type);
10213   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10214
10215   dw_align = false;
10216   adjust = 0;
10217   if (aarch64_vfp_is_call_or_return_candidate (mode,
10218                                                type,
10219                                                &ag_mode,
10220                                                &nregs,
10221                                                &is_ha))
10222     {
10223       /* TYPE passed in fp/simd registers.  */
10224       if (!TARGET_FLOAT)
10225         aarch64_err_no_fpadvsimd (mode, "varargs");
10226
10227       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10228                       unshare_expr (valist), f_vrtop, NULL_TREE);
10229       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10230                       unshare_expr (valist), f_vroff, NULL_TREE);
10231
10232       rsize = nregs * UNITS_PER_VREG;
10233
10234       if (is_ha)
10235         {
10236           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10237             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10238         }
10239       else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10240                && size < UNITS_PER_VREG)
10241         {
10242           adjust = UNITS_PER_VREG - size;
10243         }
10244     }
10245   else
10246     {
10247       /* TYPE passed in general registers.  */
10248       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10249                       unshare_expr (valist), f_grtop, NULL_TREE);
10250       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10251                       unshare_expr (valist), f_groff, NULL_TREE);
10252       rsize = ROUND_UP (size, UNITS_PER_WORD);
10253       nregs = rsize / UNITS_PER_WORD;
10254
10255       if (align > 8)
10256         dw_align = true;
10257
10258       if (BLOCK_REG_PADDING (mode, type, 1) == downward
10259           && size < UNITS_PER_WORD)
10260         {
10261           adjust = UNITS_PER_WORD  - size;
10262         }
10263     }
10264
10265   /* Get a local temporary for the field value.  */
10266   off = get_initialized_tmp_var (f_off, pre_p, NULL);
10267
10268   /* Emit code to branch if off >= 0.  */
10269   t = build2 (GE_EXPR, boolean_type_node, off,
10270               build_int_cst (TREE_TYPE (off), 0));
10271   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10272
10273   if (dw_align)
10274     {
10275       /* Emit: offs = (offs + 15) & -16.  */
10276       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10277                   build_int_cst (TREE_TYPE (off), 15));
10278       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10279                   build_int_cst (TREE_TYPE (off), -16));
10280       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10281     }
10282   else
10283     roundup = NULL;
10284
10285   /* Update ap.__[g|v]r_offs  */
10286   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10287               build_int_cst (TREE_TYPE (off), rsize));
10288   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10289
10290   /* String up.  */
10291   if (roundup)
10292     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10293
10294   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
10295   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10296               build_int_cst (TREE_TYPE (f_off), 0));
10297   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10298
10299   /* String up: make sure the assignment happens before the use.  */
10300   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10301   COND_EXPR_ELSE (cond1) = t;
10302
10303   /* Prepare the trees handling the argument that is passed on the stack;
10304      the top level node will store in ON_STACK.  */
10305   arg = get_initialized_tmp_var (stack, pre_p, NULL);
10306   if (align > 8)
10307     {
10308       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
10309       t = fold_convert (intDI_type_node, arg);
10310       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10311                   build_int_cst (TREE_TYPE (t), 15));
10312       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10313                   build_int_cst (TREE_TYPE (t), -16));
10314       t = fold_convert (TREE_TYPE (arg), t);
10315       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10316     }
10317   else
10318     roundup = NULL;
10319   /* Advance ap.__stack  */
10320   t = fold_convert (intDI_type_node, arg);
10321   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10322               build_int_cst (TREE_TYPE (t), size + 7));
10323   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10324               build_int_cst (TREE_TYPE (t), -8));
10325   t = fold_convert (TREE_TYPE (arg), t);
10326   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10327   /* String up roundup and advance.  */
10328   if (roundup)
10329     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10330   /* String up with arg */
10331   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10332   /* Big-endianness related address adjustment.  */
10333   if (BLOCK_REG_PADDING (mode, type, 1) == downward
10334       && size < UNITS_PER_WORD)
10335   {
10336     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10337                 size_int (UNITS_PER_WORD - size));
10338     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10339   }
10340
10341   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10342   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10343
10344   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
10345   t = off;
10346   if (adjust)
10347     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10348                 build_int_cst (TREE_TYPE (off), adjust));
10349
10350   t = fold_convert (sizetype, t);
10351   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10352
10353   if (is_ha)
10354     {
10355       /* type ha; // treat as "struct {ftype field[n];}"
10356          ... [computing offs]
10357          for (i = 0; i <nregs; ++i, offs += 16)
10358            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10359          return ha;  */
10360       int i;
10361       tree tmp_ha, field_t, field_ptr_t;
10362
10363       /* Declare a local variable.  */
10364       tmp_ha = create_tmp_var_raw (type, "ha");
10365       gimple_add_tmp_var (tmp_ha);
10366
10367       /* Establish the base type.  */
10368       switch (ag_mode)
10369         {
10370         case SFmode:
10371           field_t = float_type_node;
10372           field_ptr_t = float_ptr_type_node;
10373           break;
10374         case DFmode:
10375           field_t = double_type_node;
10376           field_ptr_t = double_ptr_type_node;
10377           break;
10378         case TFmode:
10379           field_t = long_double_type_node;
10380           field_ptr_t = long_double_ptr_type_node;
10381           break;
10382         case HFmode:
10383           field_t = aarch64_fp16_type_node;
10384           field_ptr_t = aarch64_fp16_ptr_type_node;
10385           break;
10386         case V2SImode:
10387         case V4SImode:
10388             {
10389               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10390               field_t = build_vector_type_for_mode (innertype, ag_mode);
10391               field_ptr_t = build_pointer_type (field_t);
10392             }
10393           break;
10394         default:
10395           gcc_assert (0);
10396         }
10397
10398       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
10399       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10400       addr = t;
10401       t = fold_convert (field_ptr_t, addr);
10402       t = build2 (MODIFY_EXPR, field_t,
10403                   build1 (INDIRECT_REF, field_t, tmp_ha),
10404                   build1 (INDIRECT_REF, field_t, t));
10405
10406       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
10407       for (i = 1; i < nregs; ++i)
10408         {
10409           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10410           u = fold_convert (field_ptr_t, addr);
10411           u = build2 (MODIFY_EXPR, field_t,
10412                       build2 (MEM_REF, field_t, tmp_ha,
10413                               build_int_cst (field_ptr_t,
10414                                              (i *
10415                                               int_size_in_bytes (field_t)))),
10416                       build1 (INDIRECT_REF, field_t, u));
10417           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10418         }
10419
10420       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10421       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10422     }
10423
10424   COND_EXPR_ELSE (cond2) = t;
10425   addr = fold_convert (build_pointer_type (type), cond1);
10426   addr = build_va_arg_indirect_ref (addr);
10427
10428   if (indirect_p)
10429     addr = build_va_arg_indirect_ref (addr);
10430
10431   return addr;
10432 }
10433
10434 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
10435
10436 static void
10437 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10438                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10439                                 int no_rtl)
10440 {
10441   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10442   CUMULATIVE_ARGS local_cum;
10443   int gr_saved = cfun->va_list_gpr_size;
10444   int vr_saved = cfun->va_list_fpr_size;
10445
10446   /* The caller has advanced CUM up to, but not beyond, the last named
10447      argument.  Advance a local copy of CUM past the last "real" named
10448      argument, to find out how many registers are left over.  */
10449   local_cum = *cum;
10450   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10451
10452   /* Found out how many registers we need to save.
10453      Honor tree-stdvar analysis results.  */
10454   if (cfun->va_list_gpr_size)
10455     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10456                     cfun->va_list_gpr_size / UNITS_PER_WORD);
10457   if (cfun->va_list_fpr_size)
10458     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10459                     cfun->va_list_fpr_size / UNITS_PER_VREG);
10460
10461   if (!TARGET_FLOAT)
10462     {
10463       gcc_assert (local_cum.aapcs_nvrn == 0);
10464       vr_saved = 0;
10465     }
10466
10467   if (!no_rtl)
10468     {
10469       if (gr_saved > 0)
10470         {
10471           rtx ptr, mem;
10472
10473           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
10474           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10475                                - gr_saved * UNITS_PER_WORD);
10476           mem = gen_frame_mem (BLKmode, ptr);
10477           set_mem_alias_set (mem, get_varargs_alias_set ());
10478
10479           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10480                                mem, gr_saved);
10481         }
10482       if (vr_saved > 0)
10483         {
10484           /* We can't use move_block_from_reg, because it will use
10485              the wrong mode, storing D regs only.  */
10486           machine_mode mode = TImode;
10487           int off, i, vr_start;
10488
10489           /* Set OFF to the offset from virtual_incoming_args_rtx of
10490              the first vector register.  The VR save area lies below
10491              the GR one, and is aligned to 16 bytes.  */
10492           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10493                            STACK_BOUNDARY / BITS_PER_UNIT);
10494           off -= vr_saved * UNITS_PER_VREG;
10495
10496           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10497           for (i = 0; i < vr_saved; ++i)
10498             {
10499               rtx ptr, mem;
10500
10501               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10502               mem = gen_frame_mem (mode, ptr);
10503               set_mem_alias_set (mem, get_varargs_alias_set ());
10504               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10505               off += UNITS_PER_VREG;
10506             }
10507         }
10508     }
10509
10510   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10511      any complication of having crtl->args.pretend_args_size changed.  */
10512   cfun->machine->frame.saved_varargs_size
10513     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10514                  STACK_BOUNDARY / BITS_PER_UNIT)
10515        + vr_saved * UNITS_PER_VREG);
10516 }
10517
10518 static void
10519 aarch64_conditional_register_usage (void)
10520 {
10521   int i;
10522   if (!TARGET_FLOAT)
10523     {
10524       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10525         {
10526           fixed_regs[i] = 1;
10527           call_used_regs[i] = 1;
10528         }
10529     }
10530 }
10531
10532 /* Walk down the type tree of TYPE counting consecutive base elements.
10533    If *MODEP is VOIDmode, then set it to the first valid floating point
10534    type.  If a non-floating point type is found, or if a floating point
10535    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10536    otherwise return the count in the sub-tree.  */
10537 static int
10538 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10539 {
10540   machine_mode mode;
10541   HOST_WIDE_INT size;
10542
10543   switch (TREE_CODE (type))
10544     {
10545     case REAL_TYPE:
10546       mode = TYPE_MODE (type);
10547       if (mode != DFmode && mode != SFmode
10548           && mode != TFmode && mode != HFmode)
10549         return -1;
10550
10551       if (*modep == VOIDmode)
10552         *modep = mode;
10553
10554       if (*modep == mode)
10555         return 1;
10556
10557       break;
10558
10559     case COMPLEX_TYPE:
10560       mode = TYPE_MODE (TREE_TYPE (type));
10561       if (mode != DFmode && mode != SFmode
10562           && mode != TFmode && mode != HFmode)
10563         return -1;
10564
10565       if (*modep == VOIDmode)
10566         *modep = mode;
10567
10568       if (*modep == mode)
10569         return 2;
10570
10571       break;
10572
10573     case VECTOR_TYPE:
10574       /* Use V2SImode and V4SImode as representatives of all 64-bit
10575          and 128-bit vector types.  */
10576       size = int_size_in_bytes (type);
10577       switch (size)
10578         {
10579         case 8:
10580           mode = V2SImode;
10581           break;
10582         case 16:
10583           mode = V4SImode;
10584           break;
10585         default:
10586           return -1;
10587         }
10588
10589       if (*modep == VOIDmode)
10590         *modep = mode;
10591
10592       /* Vector modes are considered to be opaque: two vectors are
10593          equivalent for the purposes of being homogeneous aggregates
10594          if they are the same size.  */
10595       if (*modep == mode)
10596         return 1;
10597
10598       break;
10599
10600     case ARRAY_TYPE:
10601       {
10602         int count;
10603         tree index = TYPE_DOMAIN (type);
10604
10605         /* Can't handle incomplete types nor sizes that are not
10606            fixed.  */
10607         if (!COMPLETE_TYPE_P (type)
10608             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10609           return -1;
10610
10611         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10612         if (count == -1
10613             || !index
10614             || !TYPE_MAX_VALUE (index)
10615             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10616             || !TYPE_MIN_VALUE (index)
10617             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10618             || count < 0)
10619           return -1;
10620
10621         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10622                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10623
10624         /* There must be no padding.  */
10625         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10626           return -1;
10627
10628         return count;
10629       }
10630
10631     case RECORD_TYPE:
10632       {
10633         int count = 0;
10634         int sub_count;
10635         tree field;
10636
10637         /* Can't handle incomplete types nor sizes that are not
10638            fixed.  */
10639         if (!COMPLETE_TYPE_P (type)
10640             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10641           return -1;
10642
10643         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10644           {
10645             if (TREE_CODE (field) != FIELD_DECL)
10646               continue;
10647
10648             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10649             if (sub_count < 0)
10650               return -1;
10651             count += sub_count;
10652           }
10653
10654         /* There must be no padding.  */
10655         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10656           return -1;
10657
10658         return count;
10659       }
10660
10661     case UNION_TYPE:
10662     case QUAL_UNION_TYPE:
10663       {
10664         /* These aren't very interesting except in a degenerate case.  */
10665         int count = 0;
10666         int sub_count;
10667         tree field;
10668
10669         /* Can't handle incomplete types nor sizes that are not
10670            fixed.  */
10671         if (!COMPLETE_TYPE_P (type)
10672             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10673           return -1;
10674
10675         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10676           {
10677             if (TREE_CODE (field) != FIELD_DECL)
10678               continue;
10679
10680             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10681             if (sub_count < 0)
10682               return -1;
10683             count = count > sub_count ? count : sub_count;
10684           }
10685
10686         /* There must be no padding.  */
10687         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10688           return -1;
10689
10690         return count;
10691       }
10692
10693     default:
10694       break;
10695     }
10696
10697   return -1;
10698 }
10699
10700 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10701    type as described in AAPCS64 \S 4.1.2.
10702
10703    See the comment above aarch64_composite_type_p for the notes on MODE.  */
10704
10705 static bool
10706 aarch64_short_vector_p (const_tree type,
10707                         machine_mode mode)
10708 {
10709   HOST_WIDE_INT size = -1;
10710
10711   if (type && TREE_CODE (type) == VECTOR_TYPE)
10712     size = int_size_in_bytes (type);
10713   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10714             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10715     size = GET_MODE_SIZE (mode);
10716
10717   return (size == 8 || size == 16);
10718 }
10719
10720 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10721    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
10722    array types.  The C99 floating-point complex types are also considered
10723    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
10724    types, which are GCC extensions and out of the scope of AAPCS64, are
10725    treated as composite types here as well.
10726
10727    Note that MODE itself is not sufficient in determining whether a type
10728    is such a composite type or not.  This is because
10729    stor-layout.c:compute_record_mode may have already changed the MODE
10730    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
10731    structure with only one field may have its MODE set to the mode of the
10732    field.  Also an integer mode whose size matches the size of the
10733    RECORD_TYPE type may be used to substitute the original mode
10734    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
10735    solely relied on.  */
10736
10737 static bool
10738 aarch64_composite_type_p (const_tree type,
10739                           machine_mode mode)
10740 {
10741   if (aarch64_short_vector_p (type, mode))
10742     return false;
10743
10744   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10745     return true;
10746
10747   if (mode == BLKmode
10748       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10749       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10750     return true;
10751
10752   return false;
10753 }
10754
10755 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10756    shall be passed or returned in simd/fp register(s) (providing these
10757    parameter passing registers are available).
10758
10759    Upon successful return, *COUNT returns the number of needed registers,
10760    *BASE_MODE returns the mode of the individual register and when IS_HAF
10761    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10762    floating-point aggregate or a homogeneous short-vector aggregate.  */
10763
10764 static bool
10765 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10766                                          const_tree type,
10767                                          machine_mode *base_mode,
10768                                          int *count,
10769                                          bool *is_ha)
10770 {
10771   machine_mode new_mode = VOIDmode;
10772   bool composite_p = aarch64_composite_type_p (type, mode);
10773
10774   if (is_ha != NULL) *is_ha = false;
10775
10776   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10777       || aarch64_short_vector_p (type, mode))
10778     {
10779       *count = 1;
10780       new_mode = mode;
10781     }
10782   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10783     {
10784       if (is_ha != NULL) *is_ha = true;
10785       *count = 2;
10786       new_mode = GET_MODE_INNER (mode);
10787     }
10788   else if (type && composite_p)
10789     {
10790       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10791
10792       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10793         {
10794           if (is_ha != NULL) *is_ha = true;
10795           *count = ag_count;
10796         }
10797       else
10798         return false;
10799     }
10800   else
10801     return false;
10802
10803   *base_mode = new_mode;
10804   return true;
10805 }
10806
10807 /* Implement TARGET_STRUCT_VALUE_RTX.  */
10808
10809 static rtx
10810 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10811                           int incoming ATTRIBUTE_UNUSED)
10812 {
10813   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10814 }
10815
10816 /* Implements target hook vector_mode_supported_p.  */
10817 static bool
10818 aarch64_vector_mode_supported_p (machine_mode mode)
10819 {
10820   if (TARGET_SIMD
10821       && (mode == V4SImode  || mode == V8HImode
10822           || mode == V16QImode || mode == V2DImode
10823           || mode == V2SImode  || mode == V4HImode
10824           || mode == V8QImode || mode == V2SFmode
10825           || mode == V4SFmode || mode == V2DFmode
10826           || mode == V4HFmode || mode == V8HFmode
10827           || mode == V1DFmode))
10828     return true;
10829
10830   return false;
10831 }
10832
10833 /* Return appropriate SIMD container
10834    for MODE within a vector of WIDTH bits.  */
10835 static machine_mode
10836 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10837 {
10838   gcc_assert (width == 64 || width == 128);
10839   if (TARGET_SIMD)
10840     {
10841       if (width == 128)
10842         switch (mode)
10843           {
10844           case DFmode:
10845             return V2DFmode;
10846           case SFmode:
10847             return V4SFmode;
10848           case HFmode:
10849             return V8HFmode;
10850           case SImode:
10851             return V4SImode;
10852           case HImode:
10853             return V8HImode;
10854           case QImode:
10855             return V16QImode;
10856           case DImode:
10857             return V2DImode;
10858           default:
10859             break;
10860           }
10861       else
10862         switch (mode)
10863           {
10864           case SFmode:
10865             return V2SFmode;
10866           case HFmode:
10867             return V4HFmode;
10868           case SImode:
10869             return V2SImode;
10870           case HImode:
10871             return V4HImode;
10872           case QImode:
10873             return V8QImode;
10874           default:
10875             break;
10876           }
10877     }
10878   return word_mode;
10879 }
10880
10881 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
10882 static machine_mode
10883 aarch64_preferred_simd_mode (machine_mode mode)
10884 {
10885   return aarch64_simd_container_mode (mode, 128);
10886 }
10887
/* Return the bitmask of possible vector sizes for the vectorizer
   to iterate over: the values 16 and 8 ORed together.  */
static unsigned int
aarch64_autovectorize_vector_sizes (void)
{
  const unsigned int larger_size = 16;
  const unsigned int smaller_size = 8;

  return larger_size | smaller_size;
}
10895
10896 /* Implement TARGET_MANGLE_TYPE.  */
10897
10898 static const char *
10899 aarch64_mangle_type (const_tree type)
10900 {
10901   /* The AArch64 ABI documents say that "__va_list" has to be
10902      managled as if it is in the "std" namespace.  */
10903   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10904     return "St9__va_list";
10905
10906   /* Half-precision float.  */
10907   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10908     return "Dh";
10909
10910   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
10911      builtin types.  */
10912   if (TYPE_NAME (type) != NULL)
10913     return aarch64_mangle_builtin_type (type);
10914
10915   /* Use the default mangling.  */
10916   return NULL;
10917 }
10918
10919 /* Find the first rtx_insn before insn that will generate an assembly
10920    instruction.  */
10921
10922 static rtx_insn *
10923 aarch64_prev_real_insn (rtx_insn *insn)
10924 {
10925   if (!insn)
10926     return NULL;
10927
10928   do
10929     {
10930       insn = prev_real_insn (insn);
10931     }
10932   while (insn && recog_memoized (insn) < 0);
10933
10934   return insn;
10935 }
10936
10937 static bool
10938 is_madd_op (enum attr_type t1)
10939 {
10940   unsigned int i;
10941   /* A number of these may be AArch32 only.  */
10942   enum attr_type mlatypes[] = {
10943     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10944     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10945     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10946   };
10947
10948   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10949     {
10950       if (t1 == mlatypes[i])
10951         return true;
10952     }
10953
10954   return false;
10955 }
10956
10957 /* Check if there is a register dependency between a load and the insn
10958    for which we hold recog_data.  */
10959
10960 static bool
10961 dep_between_memop_and_curr (rtx memop)
10962 {
10963   rtx load_reg;
10964   int opno;
10965
10966   gcc_assert (GET_CODE (memop) == SET);
10967
10968   if (!REG_P (SET_DEST (memop)))
10969     return false;
10970
10971   load_reg = SET_DEST (memop);
10972   for (opno = 1; opno < recog_data.n_operands; opno++)
10973     {
10974       rtx operand = recog_data.operand[opno];
10975       if (REG_P (operand)
10976           && reg_overlap_mentioned_p (load_reg, operand))
10977         return true;
10978
10979     }
10980   return false;
10981 }
10982
10983
10984 /* When working around the Cortex-A53 erratum 835769,
10985    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10986    instruction and has a preceding memory instruction such that a NOP
10987    should be inserted between them.  */
10988
10989 bool
10990 aarch64_madd_needs_nop (rtx_insn* insn)
10991 {
10992   enum attr_type attr_type;
10993   rtx_insn *prev;
10994   rtx body;
10995
10996   if (!TARGET_FIX_ERR_A53_835769)
10997     return false;
10998
10999   if (!INSN_P (insn) || recog_memoized (insn) < 0)
11000     return false;
11001
11002   attr_type = get_attr_type (insn);
11003   if (!is_madd_op (attr_type))
11004     return false;
11005
11006   prev = aarch64_prev_real_insn (insn);
11007   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11008      Restore recog state to INSN to avoid state corruption.  */
11009   extract_constrain_insn_cached (insn);
11010
11011   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11012     return false;
11013
11014   body = single_set (prev);
11015
11016   /* If the previous insn is a memory op and there is no dependency between
11017      it and the DImode madd, emit a NOP between them.  If body is NULL then we
11018      have a complex memory operation, probably a load/store pair.
11019      Be conservative for now and emit a NOP.  */
11020   if (GET_MODE (recog_data.operand[0]) == DImode
11021       && (!body || !dep_between_memop_and_curr (body)))
11022     return true;
11023
11024   return false;
11025
11026 }
11027
11028
11029 /* Implement FINAL_PRESCAN_INSN.  */
11030
11031 void
11032 aarch64_final_prescan_insn (rtx_insn *insn)
11033 {
11034   if (aarch64_madd_needs_nop (insn))
11035     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11036 }
11037
11038
11039 /* Return the equivalent letter for size.  */
11040 static char
11041 sizetochar (int size)
11042 {
11043   switch (size)
11044     {
11045     case 64: return 'd';
11046     case 32: return 's';
11047     case 16: return 'h';
11048     case 8 : return 'b';
11049     default: gcc_unreachable ();
11050     }
11051 }
11052
11053 /* Return true iff x is a uniform vector of floating-point
11054    constants, and the constant can be represented in
11055    quarter-precision form.  Note, as aarch64_float_const_representable
11056    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11057 static bool
11058 aarch64_vect_float_const_representable_p (rtx x)
11059 {
11060   rtx elt;
11061   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11062           && const_vec_duplicate_p (x, &elt)
11063           && aarch64_float_const_representable_p (elt));
11064 }
11065
11066 /* Return true for valid and false for invalid.  */
11067 bool
11068 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11069                               struct simd_immediate_info *info)
11070 {
11071 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
11072   matches = 1;                                          \
11073   for (i = 0; i < idx; i += (STRIDE))                   \
11074     if (!(TEST))                                        \
11075       matches = 0;                                      \
11076   if (matches)                                          \
11077     {                                                   \
11078       immtype = (CLASS);                                \
11079       elsize = (ELSIZE);                                \
11080       eshift = (SHIFT);                                 \
11081       emvn = (NEG);                                     \
11082       break;                                            \
11083     }
11084
11085   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11086   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11087   unsigned char bytes[16];
11088   int immtype = -1, matches;
11089   unsigned int invmask = inverse ? 0xff : 0;
11090   int eshift, emvn;
11091
11092   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11093     {
11094       if (! (aarch64_simd_imm_zero_p (op, mode)
11095              || aarch64_vect_float_const_representable_p (op)))
11096         return false;
11097
11098       if (info)
11099         {
11100           info->value = CONST_VECTOR_ELT (op, 0);
11101           info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11102           info->mvn = false;
11103           info->shift = 0;
11104         }
11105
11106       return true;
11107     }
11108
11109   /* Splat vector constant out into a byte vector.  */
11110   for (i = 0; i < n_elts; i++)
11111     {
11112       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
11113          it must be laid out in the vector register in reverse order.  */
11114       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11115       unsigned HOST_WIDE_INT elpart;
11116
11117       gcc_assert (CONST_INT_P (el));
11118       elpart = INTVAL (el);
11119
11120       for (unsigned int byte = 0; byte < innersize; byte++)
11121         {
11122           bytes[idx++] = (elpart & 0xff) ^ invmask;
11123           elpart >>= BITS_PER_UNIT;
11124         }
11125
11126     }
11127
11128   /* Sanity check.  */
11129   gcc_assert (idx == GET_MODE_SIZE (mode));
11130
11131   do
11132     {
11133       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11134              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11135
11136       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11137              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11138
11139       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11140              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11141
11142       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11143              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11144
11145       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11146
11147       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11148
11149       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11150              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11151
11152       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11153              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11154
11155       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11156              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11157
11158       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11159              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11160
11161       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11162
11163       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11164
11165       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11166              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11167
11168       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11169              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11170
11171       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11172              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11173
11174       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11175              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11176
11177       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11178
11179       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11180              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11181     }
11182   while (0);
11183
11184   if (immtype == -1)
11185     return false;
11186
11187   if (info)
11188     {
11189       info->element_width = elsize;
11190       info->mvn = emvn != 0;
11191       info->shift = eshift;
11192
11193       unsigned HOST_WIDE_INT imm = 0;
11194
11195       if (immtype >= 12 && immtype <= 15)
11196         info->msl = true;
11197
11198       /* Un-invert bytes of recognized vector, if necessary.  */
11199       if (invmask != 0)
11200         for (i = 0; i < idx; i++)
11201           bytes[i] ^= invmask;
11202
11203       if (immtype == 17)
11204         {
11205           /* FIXME: Broken on 32-bit H_W_I hosts.  */
11206           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11207
11208           for (i = 0; i < 8; i++)
11209             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11210               << (i * BITS_PER_UNIT);
11211
11212
11213           info->value = GEN_INT (imm);
11214         }
11215       else
11216         {
11217           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11218             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11219
11220           /* Construct 'abcdefgh' because the assembler cannot handle
11221              generic constants.  */
11222           if (info->mvn)
11223             imm = ~imm;
11224           imm = (imm >> info->shift) & 0xff;
11225           info->value = GEN_INT (imm);
11226         }
11227     }
11228
11229   return true;
11230 #undef CHECK
11231 }
11232
11233 /* Check of immediate shift constants are within range.  */
11234 bool
11235 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11236 {
11237   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11238   if (left)
11239     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11240   else
11241     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11242 }
11243
11244 /* Return true if X is a uniform vector where all elements
11245    are either the floating-point constant 0.0 or the
11246    integer constant 0.  */
11247 bool
11248 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11249 {
11250   return x == CONST0_RTX (mode);
11251 }
11252
11253
11254 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11255    operation of width WIDTH at bit position POS.  */
11256
11257 rtx
11258 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11259 {
11260   gcc_assert (CONST_INT_P (width));
11261   gcc_assert (CONST_INT_P (pos));
11262
11263   unsigned HOST_WIDE_INT mask
11264     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11265   return GEN_INT (mask << UINTVAL (pos));
11266 }
11267
11268 bool
11269 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11270 {
11271   HOST_WIDE_INT imm = INTVAL (x);
11272   int i;
11273
11274   for (i = 0; i < 8; i++)
11275     {
11276       unsigned int byte = imm & 0xff;
11277       if (byte != 0xff && byte != 0)
11278        return false;
11279       imm >>= 8;
11280     }
11281
11282   return true;
11283 }
11284
11285 bool
11286 aarch64_mov_operand_p (rtx x, machine_mode mode)
11287 {
11288   if (GET_CODE (x) == HIGH
11289       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11290     return true;
11291
11292   if (CONST_INT_P (x))
11293     return true;
11294
11295   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11296     return true;
11297
11298   return aarch64_classify_symbolic_expression (x)
11299     == SYMBOL_TINY_ABSOLUTE;
11300 }
11301
11302 /* Return a const_int vector of VAL.  */
11303 rtx
11304 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11305 {
11306   int nunits = GET_MODE_NUNITS (mode);
11307   rtvec v = rtvec_alloc (nunits);
11308   int i;
11309
11310   rtx cache = GEN_INT (val);
11311
11312   for (i=0; i < nunits; i++)
11313     RTVEC_ELT (v, i) = cache;
11314
11315   return gen_rtx_CONST_VECTOR (mode, v);
11316 }
11317
11318 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
11319
11320 bool
11321 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11322 {
11323   machine_mode vmode;
11324
11325   gcc_assert (!VECTOR_MODE_P (mode));
11326   vmode = aarch64_preferred_simd_mode (mode);
11327   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11328   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11329 }
11330
11331 /* Construct and return a PARALLEL RTX vector with elements numbering the
11332    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11333    the vector - from the perspective of the architecture.  This does not
11334    line up with GCC's perspective on lane numbers, so we end up with
11335    different masks depending on our target endian-ness.  The diagram
11336    below may help.  We must draw the distinction when building masks
11337    which select one half of the vector.  An instruction selecting
11338    architectural low-lanes for a big-endian target, must be described using
11339    a mask selecting GCC high-lanes.
11340
11341                  Big-Endian             Little-Endian
11342
11343 GCC             0   1   2   3           3   2   1   0
11344               | x | x | x | x |       | x | x | x | x |
11345 Architecture    3   2   1   0           3   2   1   0
11346
11347 Low Mask:         { 2, 3 }                { 0, 1 }
11348 High Mask:        { 0, 1 }                { 2, 3 }
11349 */
11350
11351 rtx
11352 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11353 {
11354   int nunits = GET_MODE_NUNITS (mode);
11355   rtvec v = rtvec_alloc (nunits / 2);
11356   int high_base = nunits / 2;
11357   int low_base = 0;
11358   int base;
11359   rtx t1;
11360   int i;
11361
11362   if (BYTES_BIG_ENDIAN)
11363     base = high ? low_base : high_base;
11364   else
11365     base = high ? high_base : low_base;
11366
11367   for (i = 0; i < nunits / 2; i++)
11368     RTVEC_ELT (v, i) = GEN_INT (base + i);
11369
11370   t1 = gen_rtx_PARALLEL (mode, v);
11371   return t1;
11372 }
11373
11374 /* Check OP for validity as a PARALLEL RTX vector with elements
11375    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11376    from the perspective of the architecture.  See the diagram above
11377    aarch64_simd_vect_par_cnst_half for more details.  */
11378
11379 bool
11380 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11381                                        bool high)
11382 {
11383   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11384   HOST_WIDE_INT count_op = XVECLEN (op, 0);
11385   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11386   int i = 0;
11387
11388   if (!VECTOR_MODE_P (mode))
11389     return false;
11390
11391   if (count_op != count_ideal)
11392     return false;
11393
11394   for (i = 0; i < count_ideal; i++)
11395     {
11396       rtx elt_op = XVECEXP (op, 0, i);
11397       rtx elt_ideal = XVECEXP (ideal, 0, i);
11398
11399       if (!CONST_INT_P (elt_op)
11400           || INTVAL (elt_ideal) != INTVAL (elt_op))
11401         return false;
11402     }
11403   return true;
11404 }
11405
11406 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
11407    HIGH (exclusive).  */
11408 void
11409 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11410                           const_tree exp)
11411 {
11412   HOST_WIDE_INT lane;
11413   gcc_assert (CONST_INT_P (operand));
11414   lane = INTVAL (operand);
11415
11416   if (lane < low || lane >= high)
11417   {
11418     if (exp)
11419       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11420     else
11421       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11422   }
11423 }
11424
11425 /* Return TRUE if OP is a valid vector addressing mode.  */
11426 bool
11427 aarch64_simd_mem_operand_p (rtx op)
11428 {
11429   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11430                         || REG_P (XEXP (op, 0)));
11431 }
11432
11433 /* Emit a register copy from operand to operand, taking care not to
11434    early-clobber source registers in the process.
11435
11436    COUNT is the number of components into which the copy needs to be
11437    decomposed.  */
11438 void
11439 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11440                                 unsigned int count)
11441 {
11442   unsigned int i;
11443   int rdest = REGNO (operands[0]);
11444   int rsrc = REGNO (operands[1]);
11445
11446   if (!reg_overlap_mentioned_p (operands[0], operands[1])
11447       || rdest < rsrc)
11448     for (i = 0; i < count; i++)
11449       emit_move_insn (gen_rtx_REG (mode, rdest + i),
11450                       gen_rtx_REG (mode, rsrc + i));
11451   else
11452     for (i = 0; i < count; i++)
11453       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11454                       gen_rtx_REG (mode, rsrc + count - i - 1));
11455 }
11456
11457 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11458    one of VSTRUCT modes: OI, CI, or XI.  */
11459 int
11460 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11461 {
11462   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11463 }
11464
11465 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11466    alignment of a vector to 128 bits.  */
11467 static HOST_WIDE_INT
11468 aarch64_simd_vector_alignment (const_tree type)
11469 {
11470   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11471   return MIN (align, 128);
11472 }
11473
11474 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11475 static bool
11476 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11477 {
11478   if (is_packed)
11479     return false;
11480
11481   /* We guarantee alignment for vectors up to 128-bits.  */
11482   if (tree_int_cst_compare (TYPE_SIZE (type),
11483                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11484     return false;
11485
11486   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11487   return true;
11488 }
11489
11490 /* Return true if the vector misalignment factor is supported by the
11491    target.  */
11492 static bool
11493 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11494                                              const_tree type, int misalignment,
11495                                              bool is_packed)
11496 {
11497   if (TARGET_SIMD && STRICT_ALIGNMENT)
11498     {
11499       /* Return if movmisalign pattern is not supported for this mode.  */
11500       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11501         return false;
11502
11503       if (misalignment == -1)
11504         {
11505           /* Misalignment factor is unknown at compile time but we know
11506              it's word aligned.  */
11507           if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11508             {
11509               int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11510
11511               if (element_size != 64)
11512                 return true;
11513             }
11514           return false;
11515         }
11516     }
11517   return default_builtin_support_vector_misalignment (mode, type, misalignment,
11518                                                       is_packed);
11519 }
11520
11521 /* If VALS is a vector constant that can be loaded into a register
11522    using DUP, generate instructions to do so and return an RTX to
11523    assign to the register.  Otherwise return NULL_RTX.  */
11524 static rtx
11525 aarch64_simd_dup_constant (rtx vals)
11526 {
11527   machine_mode mode = GET_MODE (vals);
11528   machine_mode inner_mode = GET_MODE_INNER (mode);
11529   rtx x;
11530
11531   if (!const_vec_duplicate_p (vals, &x))
11532     return NULL_RTX;
11533
11534   /* We can load this constant by using DUP and a constant in a
11535      single ARM register.  This will be cheaper than a vector
11536      load.  */
11537   x = copy_to_mode_reg (inner_mode, x);
11538   return gen_rtx_VEC_DUPLICATE (mode, x);
11539 }
11540
11541
11542 /* Generate code to load VALS, which is a PARALLEL containing only
11543    constants (for vec_init) or CONST_VECTOR, efficiently into a
11544    register.  Returns an RTX to copy into the register, or NULL_RTX
11545    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11546 static rtx
11547 aarch64_simd_make_constant (rtx vals)
11548 {
11549   machine_mode mode = GET_MODE (vals);
11550   rtx const_dup;
11551   rtx const_vec = NULL_RTX;
11552   int n_elts = GET_MODE_NUNITS (mode);
11553   int n_const = 0;
11554   int i;
11555
11556   if (GET_CODE (vals) == CONST_VECTOR)
11557     const_vec = vals;
11558   else if (GET_CODE (vals) == PARALLEL)
11559     {
11560       /* A CONST_VECTOR must contain only CONST_INTs and
11561          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11562          Only store valid constants in a CONST_VECTOR.  */
11563       for (i = 0; i < n_elts; ++i)
11564         {
11565           rtx x = XVECEXP (vals, 0, i);
11566           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11567             n_const++;
11568         }
11569       if (n_const == n_elts)
11570         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11571     }
11572   else
11573     gcc_unreachable ();
11574
11575   if (const_vec != NULL_RTX
11576       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11577     /* Load using MOVI/MVNI.  */
11578     return const_vec;
11579   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11580     /* Loaded using DUP.  */
11581     return const_dup;
11582   else if (const_vec != NULL_RTX)
11583     /* Load from constant pool. We can not take advantage of single-cycle
11584        LD1 because we need a PC-relative addressing mode.  */
11585     return const_vec;
11586   else
11587     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11588        We can not construct an initializer.  */
11589     return NULL_RTX;
11590 }
11591
11592 /* Expand a vector initialisation sequence, such that TARGET is
11593    initialised to contain VALS.  */
11594
11595 void
11596 aarch64_expand_vector_init (rtx target, rtx vals)
11597 {
11598   machine_mode mode = GET_MODE (target);
11599   machine_mode inner_mode = GET_MODE_INNER (mode);
11600   /* The number of vector elements.  */
11601   int n_elts = GET_MODE_NUNITS (mode);
11602   /* The number of vector elements which are not constant.  */
11603   int n_var = 0;
11604   rtx any_const = NULL_RTX;
11605   /* The first element of vals.  */
11606   rtx v0 = XVECEXP (vals, 0, 0);
11607   bool all_same = true;
11608
11609   /* Count the number of variable elements to initialise.  */
11610   for (int i = 0; i < n_elts; ++i)
11611     {
11612       rtx x = XVECEXP (vals, 0, i);
11613       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11614         ++n_var;
11615       else
11616         any_const = x;
11617
11618       all_same &= rtx_equal_p (x, v0);
11619     }
11620
11621   /* No variable elements, hand off to aarch64_simd_make_constant which knows
11622      how best to handle this.  */
11623   if (n_var == 0)
11624     {
11625       rtx constant = aarch64_simd_make_constant (vals);
11626       if (constant != NULL_RTX)
11627         {
11628           emit_move_insn (target, constant);
11629           return;
11630         }
11631     }
11632
11633   /* Splat a single non-constant element if we can.  */
11634   if (all_same)
11635     {
11636       rtx x = copy_to_mode_reg (inner_mode, v0);
11637       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11638       return;
11639     }
11640
11641   /* Initialise a vector which is part-variable.  We want to first try
11642      to build those lanes which are constant in the most efficient way we
11643      can.  */
11644   if (n_var != n_elts)
11645     {
11646       rtx copy = copy_rtx (vals);
11647
11648       /* Load constant part of vector.  We really don't care what goes into the
11649          parts we will overwrite, but we're more likely to be able to load the
11650          constant efficiently if it has fewer, larger, repeating parts
11651          (see aarch64_simd_valid_immediate).  */
11652       for (int i = 0; i < n_elts; i++)
11653         {
11654           rtx x = XVECEXP (vals, 0, i);
11655           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11656             continue;
11657           rtx subst = any_const;
11658           for (int bit = n_elts / 2; bit > 0; bit /= 2)
11659             {
11660               /* Look in the copied vector, as more elements are const.  */
11661               rtx test = XVECEXP (copy, 0, i ^ bit);
11662               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11663                 {
11664                   subst = test;
11665                   break;
11666                 }
11667             }
11668           XVECEXP (copy, 0, i) = subst;
11669         }
11670       aarch64_expand_vector_init (target, copy);
11671     }
11672
11673   /* Insert the variable lanes directly.  */
11674
11675   enum insn_code icode = optab_handler (vec_set_optab, mode);
11676   gcc_assert (icode != CODE_FOR_nothing);
11677
11678   for (int i = 0; i < n_elts; i++)
11679     {
11680       rtx x = XVECEXP (vals, 0, i);
11681       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11682         continue;
11683       x = copy_to_mode_reg (inner_mode, x);
11684       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11685     }
11686 }
11687
11688 static unsigned HOST_WIDE_INT
11689 aarch64_shift_truncation_mask (machine_mode mode)
11690 {
11691   return
11692     (!SHIFT_COUNT_TRUNCATED
11693      || aarch64_vector_mode_supported_p (mode)
11694      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11695 }
11696
11697 /* Select a format to encode pointers in exception handling data.  */
11698 int
11699 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11700 {
11701    int type;
11702    switch (aarch64_cmodel)
11703      {
11704      case AARCH64_CMODEL_TINY:
11705      case AARCH64_CMODEL_TINY_PIC:
11706      case AARCH64_CMODEL_SMALL:
11707      case AARCH64_CMODEL_SMALL_PIC:
11708      case AARCH64_CMODEL_SMALL_SPIC:
11709        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
11710           for everything.  */
11711        type = DW_EH_PE_sdata4;
11712        break;
11713      default:
11714        /* No assumptions here.  8-byte relocs required.  */
11715        type = DW_EH_PE_sdata8;
11716        break;
11717      }
11718    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11719 }
11720
11721 /* The last .arch and .tune assembly strings that we printed.  */
11722 static std::string aarch64_last_printed_arch_string;
11723 static std::string aarch64_last_printed_tune_string;
11724
11725 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
11726    by the function fndecl.  */
11727
11728 void
11729 aarch64_declare_function_name (FILE *stream, const char* name,
11730                                 tree fndecl)
11731 {
11732   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11733
11734   struct cl_target_option *targ_options;
11735   if (target_parts)
11736     targ_options = TREE_TARGET_OPTION (target_parts);
11737   else
11738     targ_options = TREE_TARGET_OPTION (target_option_current_node);
11739   gcc_assert (targ_options);
11740
11741   const struct processor *this_arch
11742     = aarch64_get_arch (targ_options->x_explicit_arch);
11743
11744   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11745   std::string extension
11746     = aarch64_get_extension_string_for_isa_flags (isa_flags,
11747                                                   this_arch->flags);
11748   /* Only update the assembler .arch string if it is distinct from the last
11749      such string we printed.  */
11750   std::string to_print = this_arch->name + extension;
11751   if (to_print != aarch64_last_printed_arch_string)
11752     {
11753       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11754       aarch64_last_printed_arch_string = to_print;
11755     }
11756
11757   /* Print the cpu name we're tuning for in the comments, might be
11758      useful to readers of the generated asm.  Do it only when it changes
11759      from function to function and verbose assembly is requested.  */
11760   const struct processor *this_tune
11761     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11762
11763   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11764     {
11765       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11766                    this_tune->name);
11767       aarch64_last_printed_tune_string = this_tune->name;
11768     }
11769
11770   /* Don't forget the type directive for ELF.  */
11771   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11772   ASM_OUTPUT_LABEL (stream, name);
11773 }
11774
11775 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
11776
11777 static void
11778 aarch64_start_file (void)
11779 {
11780   struct cl_target_option *default_options
11781     = TREE_TARGET_OPTION (target_option_default_node);
11782
11783   const struct processor *default_arch
11784     = aarch64_get_arch (default_options->x_explicit_arch);
11785   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11786   std::string extension
11787     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11788                                                   default_arch->flags);
11789
11790    aarch64_last_printed_arch_string = default_arch->name + extension;
11791    aarch64_last_printed_tune_string = "";
11792    asm_fprintf (asm_out_file, "\t.arch %s\n",
11793                 aarch64_last_printed_arch_string.c_str ());
11794
11795    default_file_start ();
11796 }
11797
11798 /* Emit load exclusive.  */
11799
11800 static void
11801 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11802                              rtx mem, rtx model_rtx)
11803 {
11804   rtx (*gen) (rtx, rtx, rtx);
11805
11806   switch (mode)
11807     {
11808     case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11809     case HImode: gen = gen_aarch64_load_exclusivehi; break;
11810     case SImode: gen = gen_aarch64_load_exclusivesi; break;
11811     case DImode: gen = gen_aarch64_load_exclusivedi; break;
11812     default:
11813       gcc_unreachable ();
11814     }
11815
11816   emit_insn (gen (rval, mem, model_rtx));
11817 }
11818
11819 /* Emit store exclusive.  */
11820
11821 static void
11822 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11823                               rtx rval, rtx mem, rtx model_rtx)
11824 {
11825   rtx (*gen) (rtx, rtx, rtx, rtx);
11826
11827   switch (mode)
11828     {
11829     case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11830     case HImode: gen = gen_aarch64_store_exclusivehi; break;
11831     case SImode: gen = gen_aarch64_store_exclusivesi; break;
11832     case DImode: gen = gen_aarch64_store_exclusivedi; break;
11833     default:
11834       gcc_unreachable ();
11835     }
11836
11837   emit_insn (gen (bval, rval, mem, model_rtx));
11838 }
11839
11840 /* Mark the previous jump instruction as unlikely.  */
11841
11842 static void
11843 aarch64_emit_unlikely_jump (rtx insn)
11844 {
11845   int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11846
11847   rtx_insn *jump = emit_jump_insn (insn);
11848   add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
11849 }
11850
11851 /* Expand a compare and swap pattern.  */
11852
11853 void
11854 aarch64_expand_compare_and_swap (rtx operands[])
11855 {
11856   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11857   machine_mode mode, cmp_mode;
11858   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11859   int idx;
11860   gen_cas_fn gen;
11861   const gen_cas_fn split_cas[] =
11862   {
11863     gen_aarch64_compare_and_swapqi,
11864     gen_aarch64_compare_and_swaphi,
11865     gen_aarch64_compare_and_swapsi,
11866     gen_aarch64_compare_and_swapdi
11867   };
11868   const gen_cas_fn atomic_cas[] =
11869   {
11870     gen_aarch64_compare_and_swapqi_lse,
11871     gen_aarch64_compare_and_swaphi_lse,
11872     gen_aarch64_compare_and_swapsi_lse,
11873     gen_aarch64_compare_and_swapdi_lse
11874   };
11875
11876   bval = operands[0];
11877   rval = operands[1];
11878   mem = operands[2];
11879   oldval = operands[3];
11880   newval = operands[4];
11881   is_weak = operands[5];
11882   mod_s = operands[6];
11883   mod_f = operands[7];
11884   mode = GET_MODE (mem);
11885   cmp_mode = mode;
11886
11887   /* Normally the succ memory model must be stronger than fail, but in the
11888      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11889      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
11890
11891   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11892       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11893     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11894
11895   switch (mode)
11896     {
11897     case QImode:
11898     case HImode:
11899       /* For short modes, we're going to perform the comparison in SImode,
11900          so do the zero-extension now.  */
11901       cmp_mode = SImode;
11902       rval = gen_reg_rtx (SImode);
11903       oldval = convert_modes (SImode, mode, oldval, true);
11904       /* Fall through.  */
11905
11906     case SImode:
11907     case DImode:
11908       /* Force the value into a register if needed.  */
11909       if (!aarch64_plus_operand (oldval, mode))
11910         oldval = force_reg (cmp_mode, oldval);
11911       break;
11912
11913     default:
11914       gcc_unreachable ();
11915     }
11916
11917   switch (mode)
11918     {
11919     case QImode: idx = 0; break;
11920     case HImode: idx = 1; break;
11921     case SImode: idx = 2; break;
11922     case DImode: idx = 3; break;
11923     default:
11924       gcc_unreachable ();
11925     }
11926   if (TARGET_LSE)
11927     gen = atomic_cas[idx];
11928   else
11929     gen = split_cas[idx];
11930
11931   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11932
11933   if (mode == QImode || mode == HImode)
11934     emit_move_insn (operands[1], gen_lowpart (mode, rval));
11935
11936   x = gen_rtx_REG (CCmode, CC_REGNUM);
11937   x = gen_rtx_EQ (SImode, x, const0_rtx);
11938   emit_insn (gen_rtx_SET (bval, x));
11939 }
11940
11941 /* Test whether the target supports using a atomic load-operate instruction.
11942    CODE is the operation and AFTER is TRUE if the data in memory after the
11943    operation should be returned and FALSE if the data before the operation
11944    should be returned.  Returns FALSE if the operation isn't supported by the
11945    architecture.  */
11946
11947 bool
11948 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11949 {
11950   if (!TARGET_LSE)
11951     return false;
11952
11953   switch (code)
11954     {
11955     case SET:
11956     case AND:
11957     case IOR:
11958     case XOR:
11959     case MINUS:
11960     case PLUS:
11961       return true;
11962     default:
11963       return false;
11964     }
11965 }
11966
11967 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
11968    sequence implementing an atomic operation.  */
11969
11970 static void
11971 aarch64_emit_post_barrier (enum memmodel model)
11972 {
11973   const enum memmodel base_model = memmodel_base (model);
11974
11975   if (is_mm_sync (model)
11976       && (base_model == MEMMODEL_ACQUIRE
11977           || base_model == MEMMODEL_ACQ_REL
11978           || base_model == MEMMODEL_SEQ_CST))
11979     {
11980       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11981     }
11982 }
11983
11984 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
11985    for the data in memory.  EXPECTED is the value expected to be in memory.
11986    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
11987    is the memory ordering to use.  */
11988
11989 void
11990 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11991                         rtx expected, rtx desired,
11992                         rtx model)
11993 {
11994   rtx (*gen) (rtx, rtx, rtx, rtx);
11995   machine_mode mode;
11996
11997   mode = GET_MODE (mem);
11998
11999   switch (mode)
12000     {
12001     case QImode: gen = gen_aarch64_atomic_casqi; break;
12002     case HImode: gen = gen_aarch64_atomic_cashi; break;
12003     case SImode: gen = gen_aarch64_atomic_cassi; break;
12004     case DImode: gen = gen_aarch64_atomic_casdi; break;
12005     default:
12006       gcc_unreachable ();
12007     }
12008
12009   /* Move the expected value into the CAS destination register.  */
12010   emit_insn (gen_rtx_SET (rval, expected));
12011
12012   /* Emit the CAS.  */
12013   emit_insn (gen (rval, mem, desired, model));
12014
12015   /* Compare the expected value with the value loaded by the CAS, to establish
12016      whether the swap was made.  */
12017   aarch64_gen_compare_reg (EQ, rval, expected);
12018 }
12019
12020 /* Split a compare and swap pattern.  */
12021
12022 void
12023 aarch64_split_compare_and_swap (rtx operands[])
12024 {
12025   rtx rval, mem, oldval, newval, scratch;
12026   machine_mode mode;
12027   bool is_weak;
12028   rtx_code_label *label1, *label2;
12029   rtx x, cond;
12030   enum memmodel model;
12031   rtx model_rtx;
12032
12033   rval = operands[0];
12034   mem = operands[1];
12035   oldval = operands[2];
12036   newval = operands[3];
12037   is_weak = (operands[4] != const0_rtx);
12038   model_rtx = operands[5];
12039   scratch = operands[7];
12040   mode = GET_MODE (mem);
12041   model = memmodel_from_int (INTVAL (model_rtx));
12042
12043   label1 = NULL;
12044   if (!is_weak)
12045     {
12046       label1 = gen_label_rtx ();
12047       emit_label (label1);
12048     }
12049   label2 = gen_label_rtx ();
12050
12051   /* The initial load can be relaxed for a __sync operation since a final
12052      barrier will be emitted to stop code hoisting.  */
12053   if (is_mm_sync (model))
12054     aarch64_emit_load_exclusive (mode, rval, mem,
12055                                  GEN_INT (MEMMODEL_RELAXED));
12056   else
12057     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12058
12059   cond = aarch64_gen_compare_reg (NE, rval, oldval);
12060   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12061   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12062                             gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12063   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12064
12065   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12066
12067   if (!is_weak)
12068     {
12069       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12070       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12071                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12072       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12073     }
12074   else
12075     {
12076       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12077       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12078       emit_insn (gen_rtx_SET (cond, x));
12079     }
12080
12081   emit_label (label2);
12082
12083   /* Emit any final barrier needed for a __sync operation.  */
12084   if (is_mm_sync (model))
12085     aarch64_emit_post_barrier (model);
12086 }
12087
12088 /* Emit a BIC instruction.  */
12089
12090 static void
12091 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12092 {
12093   rtx shift_rtx = GEN_INT (shift);
12094   rtx (*gen) (rtx, rtx, rtx, rtx);
12095
12096   switch (mode)
12097     {
12098     case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12099     case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12100     default:
12101       gcc_unreachable ();
12102     }
12103
12104   emit_insn (gen (dst, s2, shift_rtx, s1));
12105 }
12106
12107 /* Emit an atomic swap.  */
12108
12109 static void
12110 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12111                           rtx mem, rtx model)
12112 {
12113   rtx (*gen) (rtx, rtx, rtx, rtx);
12114
12115   switch (mode)
12116     {
12117     case QImode: gen = gen_aarch64_atomic_swpqi; break;
12118     case HImode: gen = gen_aarch64_atomic_swphi; break;
12119     case SImode: gen = gen_aarch64_atomic_swpsi; break;
12120     case DImode: gen = gen_aarch64_atomic_swpdi; break;
12121     default:
12122       gcc_unreachable ();
12123     }
12124
12125   emit_insn (gen (dst, mem, value, model));
12126 }
12127
/* Operations supported by aarch64_emit_atomic_load_op.  In the notes
   below A and B stand for the two operands of the operation.  */

enum aarch64_atomic_load_op_code
{
  /* A + B  */
  AARCH64_LDOP_PLUS,
  /* A ^ B  */
  AARCH64_LDOP_XOR,
  /* A | B  */
  AARCH64_LDOP_OR,
  /* A & ~B  */
  AARCH64_LDOP_BIC
};
12137
12138 /* Emit an atomic load-operate.  */
12139
12140 static void
12141 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12142                              machine_mode mode, rtx dst, rtx src,
12143                              rtx mem, rtx model)
12144 {
12145   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12146   const aarch64_atomic_load_op_fn plus[] =
12147   {
12148     gen_aarch64_atomic_loadaddqi,
12149     gen_aarch64_atomic_loadaddhi,
12150     gen_aarch64_atomic_loadaddsi,
12151     gen_aarch64_atomic_loadadddi
12152   };
12153   const aarch64_atomic_load_op_fn eor[] =
12154   {
12155     gen_aarch64_atomic_loadeorqi,
12156     gen_aarch64_atomic_loadeorhi,
12157     gen_aarch64_atomic_loadeorsi,
12158     gen_aarch64_atomic_loadeordi
12159   };
12160   const aarch64_atomic_load_op_fn ior[] =
12161   {
12162     gen_aarch64_atomic_loadsetqi,
12163     gen_aarch64_atomic_loadsethi,
12164     gen_aarch64_atomic_loadsetsi,
12165     gen_aarch64_atomic_loadsetdi
12166   };
12167   const aarch64_atomic_load_op_fn bic[] =
12168   {
12169     gen_aarch64_atomic_loadclrqi,
12170     gen_aarch64_atomic_loadclrhi,
12171     gen_aarch64_atomic_loadclrsi,
12172     gen_aarch64_atomic_loadclrdi
12173   };
12174   aarch64_atomic_load_op_fn gen;
12175   int idx = 0;
12176
12177   switch (mode)
12178     {
12179     case QImode: idx = 0; break;
12180     case HImode: idx = 1; break;
12181     case SImode: idx = 2; break;
12182     case DImode: idx = 3; break;
12183     default:
12184       gcc_unreachable ();
12185     }
12186
12187   switch (code)
12188     {
12189     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12190     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12191     case AARCH64_LDOP_OR: gen = ior[idx]; break;
12192     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12193     default:
12194       gcc_unreachable ();
12195     }
12196
12197   emit_insn (gen (dst, mem, src, model));
12198 }
12199
12200 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
12201    location to store the data read from memory.  OUT_RESULT is the location to
12202    store the result of the operation.  MEM is the memory location to read and
12203    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
12204    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
12205    be NULL.  */
12206
12207 void
12208 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12209                          rtx mem, rtx value, rtx model_rtx)
12210 {
12211   machine_mode mode = GET_MODE (mem);
12212   machine_mode wmode = (mode == DImode ? DImode : SImode);
12213   const bool short_mode = (mode < SImode);
12214   aarch64_atomic_load_op_code ldop_code;
12215   rtx src;
12216   rtx x;
12217
12218   if (out_data)
12219     out_data = gen_lowpart (mode, out_data);
12220
12221   if (out_result)
12222     out_result = gen_lowpart (mode, out_result);
12223
12224   /* Make sure the value is in a register, putting it into a destination
12225      register if it needs to be manipulated.  */
12226   if (!register_operand (value, mode)
12227       || code == AND || code == MINUS)
12228     {
12229       src = out_result ? out_result : out_data;
12230       emit_move_insn (src, gen_lowpart (mode, value));
12231     }
12232   else
12233     src = value;
12234   gcc_assert (register_operand (src, mode));
12235
12236   /* Preprocess the data for the operation as necessary.  If the operation is
12237      a SET then emit a swap instruction and finish.  */
12238   switch (code)
12239     {
12240     case SET:
12241       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12242       return;
12243
12244     case MINUS:
12245       /* Negate the value and treat it as a PLUS.  */
12246       {
12247         rtx neg_src;
12248
12249         /* Resize the value if necessary.  */
12250         if (short_mode)
12251           src = gen_lowpart (wmode, src);
12252
12253         neg_src = gen_rtx_NEG (wmode, src);
12254         emit_insn (gen_rtx_SET (src, neg_src));
12255
12256         if (short_mode)
12257           src = gen_lowpart (mode, src);
12258       }
12259       /* Fall-through.  */
12260     case PLUS:
12261       ldop_code = AARCH64_LDOP_PLUS;
12262       break;
12263
12264     case IOR:
12265       ldop_code = AARCH64_LDOP_OR;
12266       break;
12267
12268     case XOR:
12269       ldop_code = AARCH64_LDOP_XOR;
12270       break;
12271
12272     case AND:
12273       {
12274         rtx not_src;
12275
12276         /* Resize the value if necessary.  */
12277         if (short_mode)
12278           src = gen_lowpart (wmode, src);
12279
12280         not_src = gen_rtx_NOT (wmode, src);
12281         emit_insn (gen_rtx_SET (src, not_src));
12282
12283         if (short_mode)
12284           src = gen_lowpart (mode, src);
12285       }
12286       ldop_code = AARCH64_LDOP_BIC;
12287       break;
12288
12289     default:
12290       /* The operation can't be done with atomic instructions.  */
12291       gcc_unreachable ();
12292     }
12293
12294   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12295
12296   /* If necessary, calculate the data in memory after the update by redoing the
12297      operation from values in registers.  */
12298   if (!out_result)
12299     return;
12300
12301   if (short_mode)
12302     {
12303       src = gen_lowpart (wmode, src);
12304       out_data = gen_lowpart (wmode, out_data);
12305       out_result = gen_lowpart (wmode, out_result);
12306     }
12307
12308   x = NULL_RTX;
12309
12310   switch (code)
12311     {
12312     case MINUS:
12313     case PLUS:
12314       x = gen_rtx_PLUS (wmode, out_data, src);
12315       break;
12316     case IOR:
12317       x = gen_rtx_IOR (wmode, out_data, src);
12318       break;
12319     case XOR:
12320       x = gen_rtx_XOR (wmode, out_data, src);
12321       break;
12322     case AND:
12323       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12324       return;
12325     default:
12326       gcc_unreachable ();
12327     }
12328
12329   emit_set_insn (out_result, x);
12330
12331   return;
12332 }
12333
12334 /* Split an atomic operation.  */
12335
12336 void
12337 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12338                          rtx value, rtx model_rtx, rtx cond)
12339 {
12340   machine_mode mode = GET_MODE (mem);
12341   machine_mode wmode = (mode == DImode ? DImode : SImode);
12342   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12343   const bool is_sync = is_mm_sync (model);
12344   rtx_code_label *label;
12345   rtx x;
12346
12347   /* Split the atomic operation into a sequence.  */
12348   label = gen_label_rtx ();
12349   emit_label (label);
12350
12351   if (new_out)
12352     new_out = gen_lowpart (wmode, new_out);
12353   if (old_out)
12354     old_out = gen_lowpart (wmode, old_out);
12355   else
12356     old_out = new_out;
12357   value = simplify_gen_subreg (wmode, value, mode, 0);
12358
12359   /* The initial load can be relaxed for a __sync operation since a final
12360      barrier will be emitted to stop code hoisting.  */
12361  if (is_sync)
12362     aarch64_emit_load_exclusive (mode, old_out, mem,
12363                                  GEN_INT (MEMMODEL_RELAXED));
12364   else
12365     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12366
12367   switch (code)
12368     {
12369     case SET:
12370       new_out = value;
12371       break;
12372
12373     case NOT:
12374       x = gen_rtx_AND (wmode, old_out, value);
12375       emit_insn (gen_rtx_SET (new_out, x));
12376       x = gen_rtx_NOT (wmode, new_out);
12377       emit_insn (gen_rtx_SET (new_out, x));
12378       break;
12379
12380     case MINUS:
12381       if (CONST_INT_P (value))
12382         {
12383           value = GEN_INT (-INTVAL (value));
12384           code = PLUS;
12385         }
12386       /* Fall through.  */
12387
12388     default:
12389       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12390       emit_insn (gen_rtx_SET (new_out, x));
12391       break;
12392     }
12393
12394   aarch64_emit_store_exclusive (mode, cond, mem,
12395                                 gen_lowpart (mode, new_out), model_rtx);
12396
12397   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12398   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12399                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12400   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12401
12402   /* Emit any final barrier needed for a __sync operation.  */
12403   if (is_sync)
12404     aarch64_emit_post_barrier (model);
12405 }
12406
12407 static void
12408 aarch64_init_libfuncs (void)
12409 {
12410    /* Half-precision float operations.  The compiler handles all operations
12411      with NULL libfuncs by converting to SFmode.  */
12412
12413   /* Conversions.  */
12414   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12415   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12416
12417   /* Arithmetic.  */
12418   set_optab_libfunc (add_optab, HFmode, NULL);
12419   set_optab_libfunc (sdiv_optab, HFmode, NULL);
12420   set_optab_libfunc (smul_optab, HFmode, NULL);
12421   set_optab_libfunc (neg_optab, HFmode, NULL);
12422   set_optab_libfunc (sub_optab, HFmode, NULL);
12423
12424   /* Comparisons.  */
12425   set_optab_libfunc (eq_optab, HFmode, NULL);
12426   set_optab_libfunc (ne_optab, HFmode, NULL);
12427   set_optab_libfunc (lt_optab, HFmode, NULL);
12428   set_optab_libfunc (le_optab, HFmode, NULL);
12429   set_optab_libfunc (ge_optab, HFmode, NULL);
12430   set_optab_libfunc (gt_optab, HFmode, NULL);
12431   set_optab_libfunc (unord_optab, HFmode, NULL);
12432 }
12433
12434 /* Target hook for c_mode_for_suffix.  */
12435 static machine_mode
12436 aarch64_c_mode_for_suffix (char suffix)
12437 {
12438   if (suffix == 'q')
12439     return TFmode;
12440
12441   return VOIDmode;
12442 }
12443
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
12455
12456 /* Return true iff X can be represented by a quarter-precision
12457    floating point immediate operand X.  Note, we cannot represent 0.0.  */
12458 bool
12459 aarch64_float_const_representable_p (rtx x)
12460 {
12461   /* This represents our current view of how many bits
12462      make up the mantissa.  */
12463   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12464   int exponent;
12465   unsigned HOST_WIDE_INT mantissa, mask;
12466   REAL_VALUE_TYPE r, m;
12467   bool fail;
12468
12469   if (!CONST_DOUBLE_P (x))
12470     return false;
12471
12472   /* We don't support HFmode constants yet.  */
12473   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12474     return false;
12475
12476   r = *CONST_DOUBLE_REAL_VALUE (x);
12477
12478   /* We cannot represent infinities, NaNs or +/-zero.  We won't
12479      know if we have +zero until we analyse the mantissa, but we
12480      can reject the other invalid values.  */
12481   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12482       || REAL_VALUE_MINUS_ZERO (r))
12483     return false;
12484
12485   /* Extract exponent.  */
12486   r = real_value_abs (&r);
12487   exponent = REAL_EXP (&r);
12488
12489   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12490      highest (sign) bit, with a fixed binary point at bit point_pos.
12491      m1 holds the low part of the mantissa, m2 the high part.
12492      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12493      bits for the mantissa, this can fail (low bits will be lost).  */
12494   real_ldexp (&m, &r, point_pos - exponent);
12495   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12496
12497   /* If the low part of the mantissa has bits set we cannot represent
12498      the value.  */
12499   if (w.ulow () != 0)
12500     return false;
12501   /* We have rejected the lower HOST_WIDE_INT, so update our
12502      understanding of how many bits lie in the mantissa and
12503      look only at the high HOST_WIDE_INT.  */
12504   mantissa = w.elt (1);
12505   point_pos -= HOST_BITS_PER_WIDE_INT;
12506
12507   /* We can only represent values with a mantissa of the form 1.xxxx.  */
12508   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12509   if ((mantissa & mask) != 0)
12510     return false;
12511
12512   /* Having filtered unrepresentable values, we may now remove all
12513      but the highest 5 bits.  */
12514   mantissa >>= point_pos - 5;
12515
12516   /* We cannot represent the value 0.0, so reject it.  This is handled
12517      elsewhere.  */
12518   if (mantissa == 0)
12519     return false;
12520
12521   /* Then, as bit 4 is always set, we can mask it off, leaving
12522      the mantissa in the range [0, 15].  */
12523   mantissa &= ~(1 << 4);
12524   gcc_assert (mantissa <= 15);
12525
12526   /* GCC internally does not use IEEE754-like encoding (where normalized
12527      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
12528      Our mantissa values are shifted 4 places to the left relative to
12529      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12530      by 5 places to correct for GCC's representation.  */
12531   exponent = 5 - exponent;
12532
12533   return (exponent >= 0 && exponent <= 7);
12534 }
12535
12536 char*
12537 aarch64_output_simd_mov_immediate (rtx const_vector,
12538                                    machine_mode mode,
12539                                    unsigned width)
12540 {
12541   bool is_valid;
12542   static char templ[40];
12543   const char *mnemonic;
12544   const char *shift_op;
12545   unsigned int lane_count = 0;
12546   char element_char;
12547
12548   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12549
12550   /* This will return true to show const_vector is legal for use as either
12551      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
12552      also update INFO to show how the immediate should be generated.  */
12553   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12554   gcc_assert (is_valid);
12555
12556   element_char = sizetochar (info.element_width);
12557   lane_count = width / info.element_width;
12558
12559   mode = GET_MODE_INNER (mode);
12560   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12561     {
12562       gcc_assert (info.shift == 0 && ! info.mvn);
12563       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12564          move immediate path.  */
12565       if (aarch64_float_const_zero_rtx_p (info.value))
12566         info.value = GEN_INT (0);
12567       else
12568         {
12569           const unsigned int buf_size = 20;
12570           char float_buf[buf_size] = {'\0'};
12571           real_to_decimal_for_mode (float_buf,
12572                                     CONST_DOUBLE_REAL_VALUE (info.value),
12573                                     buf_size, buf_size, 1, mode);
12574
12575           if (lane_count == 1)
12576             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12577           else
12578             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12579                       lane_count, element_char, float_buf);
12580           return templ;
12581         }
12582     }
12583
12584   mnemonic = info.mvn ? "mvni" : "movi";
12585   shift_op = info.msl ? "msl" : "lsl";
12586
12587   gcc_assert (CONST_INT_P (info.value));
12588   if (lane_count == 1)
12589     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12590               mnemonic, UINTVAL (info.value));
12591   else if (info.shift)
12592     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12593               ", %s %d", mnemonic, lane_count, element_char,
12594               UINTVAL (info.value), shift_op, info.shift);
12595   else
12596     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12597               mnemonic, lane_count, element_char, UINTVAL (info.value));
12598   return templ;
12599 }
12600
12601 char*
12602 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12603                                           machine_mode mode)
12604 {
12605   machine_mode vmode;
12606
12607   gcc_assert (!VECTOR_MODE_P (mode));
12608   vmode = aarch64_simd_container_mode (mode, 64);
12609   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12610   return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12611 }
12612
12613 /* Split operands into moves from op[1] + op[2] into op[0].  */
12614
12615 void
12616 aarch64_split_combinev16qi (rtx operands[3])
12617 {
12618   unsigned int dest = REGNO (operands[0]);
12619   unsigned int src1 = REGNO (operands[1]);
12620   unsigned int src2 = REGNO (operands[2]);
12621   machine_mode halfmode = GET_MODE (operands[1]);
12622   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12623   rtx destlo, desthi;
12624
12625   gcc_assert (halfmode == V16QImode);
12626
12627   if (src1 == dest && src2 == dest + halfregs)
12628     {
12629       /* No-op move.  Can't split to nothing; emit something.  */
12630       emit_note (NOTE_INSN_DELETED);
12631       return;
12632     }
12633
12634   /* Preserve register attributes for variable tracking.  */
12635   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12636   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12637                                GET_MODE_SIZE (halfmode));
12638
12639   /* Special case of reversed high/low parts.  */
12640   if (reg_overlap_mentioned_p (operands[2], destlo)
12641       && reg_overlap_mentioned_p (operands[1], desthi))
12642     {
12643       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12644       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12645       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12646     }
12647   else if (!reg_overlap_mentioned_p (operands[2], destlo))
12648     {
12649       /* Try to avoid unnecessary moves if part of the result
12650          is in the right place already.  */
12651       if (src1 != dest)
12652         emit_move_insn (destlo, operands[1]);
12653       if (src2 != dest + halfregs)
12654         emit_move_insn (desthi, operands[2]);
12655     }
12656   else
12657     {
12658       if (src2 != dest + halfregs)
12659         emit_move_insn (desthi, operands[2]);
12660       if (src1 != dest)
12661         emit_move_insn (destlo, operands[1]);
12662     }
12663 }
12664
12665 /* vec_perm support.  */
12666
12667 #define MAX_VECT_LEN 16
12668
12669 struct expand_vec_perm_d
12670 {
12671   rtx target, op0, op1;
12672   unsigned char perm[MAX_VECT_LEN];
12673   machine_mode vmode;
12674   unsigned char nelt;
12675   bool one_vector_p;
12676   bool testing_p;
12677 };
12678
12679 /* Generate a variable permutation.  */
12680
12681 static void
12682 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12683 {
12684   machine_mode vmode = GET_MODE (target);
12685   bool one_vector_p = rtx_equal_p (op0, op1);
12686
12687   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12688   gcc_checking_assert (GET_MODE (op0) == vmode);
12689   gcc_checking_assert (GET_MODE (op1) == vmode);
12690   gcc_checking_assert (GET_MODE (sel) == vmode);
12691   gcc_checking_assert (TARGET_SIMD);
12692
12693   if (one_vector_p)
12694     {
12695       if (vmode == V8QImode)
12696         {
12697           /* Expand the argument to a V16QI mode by duplicating it.  */
12698           rtx pair = gen_reg_rtx (V16QImode);
12699           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12700           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12701         }
12702       else
12703         {
12704           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12705         }
12706     }
12707   else
12708     {
12709       rtx pair;
12710
12711       if (vmode == V8QImode)
12712         {
12713           pair = gen_reg_rtx (V16QImode);
12714           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12715           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12716         }
12717       else
12718         {
12719           pair = gen_reg_rtx (OImode);
12720           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12721           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12722         }
12723     }
12724 }
12725
12726 void
12727 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12728 {
12729   machine_mode vmode = GET_MODE (target);
12730   unsigned int nelt = GET_MODE_NUNITS (vmode);
12731   bool one_vector_p = rtx_equal_p (op0, op1);
12732   rtx mask;
12733
12734   /* The TBL instruction does not use a modulo index, so we must take care
12735      of that ourselves.  */
12736   mask = aarch64_simd_gen_const_vector_dup (vmode,
12737       one_vector_p ? nelt - 1 : 2 * nelt - 1);
12738   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12739
12740   /* For big-endian, we also need to reverse the index within the vector
12741      (but not which vector).  */
12742   if (BYTES_BIG_ENDIAN)
12743     {
12744       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
12745       if (!one_vector_p)
12746         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12747       sel = expand_simple_binop (vmode, XOR, sel, mask,
12748                                  NULL, 0, OPTAB_LIB_WIDEN);
12749     }
12750   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12751 }
12752
12753 /* Recognize patterns suitable for the TRN instructions.  */
12754 static bool
12755 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12756 {
12757   unsigned int i, odd, mask, nelt = d->nelt;
12758   rtx out, in0, in1, x;
12759   rtx (*gen) (rtx, rtx, rtx);
12760   machine_mode vmode = d->vmode;
12761
12762   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12763     return false;
12764
12765   /* Note that these are little-endian tests.
12766      We correct for big-endian later.  */
12767   if (d->perm[0] == 0)
12768     odd = 0;
12769   else if (d->perm[0] == 1)
12770     odd = 1;
12771   else
12772     return false;
12773   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12774
12775   for (i = 0; i < nelt; i += 2)
12776     {
12777       if (d->perm[i] != i + odd)
12778         return false;
12779       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12780         return false;
12781     }
12782
12783   /* Success!  */
12784   if (d->testing_p)
12785     return true;
12786
12787   in0 = d->op0;
12788   in1 = d->op1;
12789   if (BYTES_BIG_ENDIAN)
12790     {
12791       x = in0, in0 = in1, in1 = x;
12792       odd = !odd;
12793     }
12794   out = d->target;
12795
12796   if (odd)
12797     {
12798       switch (vmode)
12799         {
12800         case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12801         case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12802         case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12803         case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12804         case V4SImode: gen = gen_aarch64_trn2v4si; break;
12805         case V2SImode: gen = gen_aarch64_trn2v2si; break;
12806         case V2DImode: gen = gen_aarch64_trn2v2di; break;
12807         case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12808         case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12809         case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12810         case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12811         case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12812         default:
12813           return false;
12814         }
12815     }
12816   else
12817     {
12818       switch (vmode)
12819         {
12820         case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12821         case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12822         case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12823         case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12824         case V4SImode: gen = gen_aarch64_trn1v4si; break;
12825         case V2SImode: gen = gen_aarch64_trn1v2si; break;
12826         case V2DImode: gen = gen_aarch64_trn1v2di; break;
12827         case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12828         case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12829         case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12830         case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12831         case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12832         default:
12833           return false;
12834         }
12835     }
12836
12837   emit_insn (gen (out, in0, in1));
12838   return true;
12839 }
12840
12841 /* Recognize patterns suitable for the UZP instructions.  */
12842 static bool
12843 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12844 {
12845   unsigned int i, odd, mask, nelt = d->nelt;
12846   rtx out, in0, in1, x;
12847   rtx (*gen) (rtx, rtx, rtx);
12848   machine_mode vmode = d->vmode;
12849
12850   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12851     return false;
12852
12853   /* Note that these are little-endian tests.
12854      We correct for big-endian later.  */
12855   if (d->perm[0] == 0)
12856     odd = 0;
12857   else if (d->perm[0] == 1)
12858     odd = 1;
12859   else
12860     return false;
12861   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12862
12863   for (i = 0; i < nelt; i++)
12864     {
12865       unsigned elt = (i * 2 + odd) & mask;
12866       if (d->perm[i] != elt)
12867         return false;
12868     }
12869
12870   /* Success!  */
12871   if (d->testing_p)
12872     return true;
12873
12874   in0 = d->op0;
12875   in1 = d->op1;
12876   if (BYTES_BIG_ENDIAN)
12877     {
12878       x = in0, in0 = in1, in1 = x;
12879       odd = !odd;
12880     }
12881   out = d->target;
12882
12883   if (odd)
12884     {
12885       switch (vmode)
12886         {
12887         case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12888         case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12889         case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12890         case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12891         case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12892         case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12893         case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12894         case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12895         case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12896         case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12897         case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12898         case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12899         default:
12900           return false;
12901         }
12902     }
12903   else
12904     {
12905       switch (vmode)
12906         {
12907         case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12908         case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12909         case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12910         case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12911         case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12912         case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12913         case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12914         case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12915         case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12916         case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12917         case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12918         case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12919         default:
12920           return false;
12921         }
12922     }
12923
12924   emit_insn (gen (out, in0, in1));
12925   return true;
12926 }
12927
12928 /* Recognize patterns suitable for the ZIP instructions.  */
12929 static bool
12930 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12931 {
12932   unsigned int i, high, mask, nelt = d->nelt;
12933   rtx out, in0, in1, x;
12934   rtx (*gen) (rtx, rtx, rtx);
12935   machine_mode vmode = d->vmode;
12936
12937   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12938     return false;
12939
12940   /* Note that these are little-endian tests.
12941      We correct for big-endian later.  */
12942   high = nelt / 2;
12943   if (d->perm[0] == high)
12944     /* Do Nothing.  */
12945     ;
12946   else if (d->perm[0] == 0)
12947     high = 0;
12948   else
12949     return false;
12950   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12951
12952   for (i = 0; i < nelt / 2; i++)
12953     {
12954       unsigned elt = (i + high) & mask;
12955       if (d->perm[i * 2] != elt)
12956         return false;
12957       elt = (elt + nelt) & mask;
12958       if (d->perm[i * 2 + 1] != elt)
12959         return false;
12960     }
12961
12962   /* Success!  */
12963   if (d->testing_p)
12964     return true;
12965
12966   in0 = d->op0;
12967   in1 = d->op1;
12968   if (BYTES_BIG_ENDIAN)
12969     {
12970       x = in0, in0 = in1, in1 = x;
12971       high = !high;
12972     }
12973   out = d->target;
12974
12975   if (high)
12976     {
12977       switch (vmode)
12978         {
12979         case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12980         case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12981         case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12982         case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12983         case V4SImode: gen = gen_aarch64_zip2v4si; break;
12984         case V2SImode: gen = gen_aarch64_zip2v2si; break;
12985         case V2DImode: gen = gen_aarch64_zip2v2di; break;
12986         case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12987         case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12988         case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12989         case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12990         case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12991         default:
12992           return false;
12993         }
12994     }
12995   else
12996     {
12997       switch (vmode)
12998         {
12999         case V16QImode: gen = gen_aarch64_zip1v16qi; break;
13000         case V8QImode: gen = gen_aarch64_zip1v8qi; break;
13001         case V8HImode: gen = gen_aarch64_zip1v8hi; break;
13002         case V4HImode: gen = gen_aarch64_zip1v4hi; break;
13003         case V4SImode: gen = gen_aarch64_zip1v4si; break;
13004         case V2SImode: gen = gen_aarch64_zip1v2si; break;
13005         case V2DImode: gen = gen_aarch64_zip1v2di; break;
13006         case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13007         case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13008         case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13009         case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13010         case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13011         default:
13012           return false;
13013         }
13014     }
13015
13016   emit_insn (gen (out, in0, in1));
13017   return true;
13018 }
13019
13020 /* Recognize patterns for the EXT insn.  */
13021
13022 static bool
13023 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13024 {
13025   unsigned int i, nelt = d->nelt;
13026   rtx (*gen) (rtx, rtx, rtx, rtx);
13027   rtx offset;
13028
13029   unsigned int location = d->perm[0]; /* Always < nelt.  */
13030
13031   /* Check if the extracted indices are increasing by one.  */
13032   for (i = 1; i < nelt; i++)
13033     {
13034       unsigned int required = location + i;
13035       if (d->one_vector_p)
13036         {
13037           /* We'll pass the same vector in twice, so allow indices to wrap.  */
13038           required &= (nelt - 1);
13039         }
13040       if (d->perm[i] != required)
13041         return false;
13042     }
13043
13044   switch (d->vmode)
13045     {
13046     case V16QImode: gen = gen_aarch64_extv16qi; break;
13047     case V8QImode: gen = gen_aarch64_extv8qi; break;
13048     case V4HImode: gen = gen_aarch64_extv4hi; break;
13049     case V8HImode: gen = gen_aarch64_extv8hi; break;
13050     case V2SImode: gen = gen_aarch64_extv2si; break;
13051     case V4SImode: gen = gen_aarch64_extv4si; break;
13052     case V4HFmode: gen = gen_aarch64_extv4hf; break;
13053     case V8HFmode: gen = gen_aarch64_extv8hf; break;
13054     case V2SFmode: gen = gen_aarch64_extv2sf; break;
13055     case V4SFmode: gen = gen_aarch64_extv4sf; break;
13056     case V2DImode: gen = gen_aarch64_extv2di; break;
13057     case V2DFmode: gen = gen_aarch64_extv2df; break;
13058     default:
13059       return false;
13060     }
13061
13062   /* Success! */
13063   if (d->testing_p)
13064     return true;
13065
13066   /* The case where (location == 0) is a no-op for both big- and little-endian,
13067      and is removed by the mid-end at optimization levels -O1 and higher.  */
13068
13069   if (BYTES_BIG_ENDIAN && (location != 0))
13070     {
13071       /* After setup, we want the high elements of the first vector (stored
13072          at the LSB end of the register), and the low elements of the second
13073          vector (stored at the MSB end of the register). So swap.  */
13074       std::swap (d->op0, d->op1);
13075       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
13076       location = nelt - location;
13077     }
13078
13079   offset = GEN_INT (location);
13080   emit_insn (gen (d->target, d->op0, d->op1, offset));
13081   return true;
13082 }
13083
13084 /* Recognize patterns for the REV insns.  */
13085
13086 static bool
13087 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13088 {
13089   unsigned int i, j, diff, nelt = d->nelt;
13090   rtx (*gen) (rtx, rtx);
13091
13092   if (!d->one_vector_p)
13093     return false;
13094
13095   diff = d->perm[0];
13096   switch (diff)
13097     {
13098     case 7:
13099       switch (d->vmode)
13100         {
13101         case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13102         case V8QImode: gen = gen_aarch64_rev64v8qi;  break;
13103         default:
13104           return false;
13105         }
13106       break;
13107     case 3:
13108       switch (d->vmode)
13109         {
13110         case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13111         case V8QImode: gen = gen_aarch64_rev32v8qi;  break;
13112         case V8HImode: gen = gen_aarch64_rev64v8hi;  break;
13113         case V4HImode: gen = gen_aarch64_rev64v4hi;  break;
13114         default:
13115           return false;
13116         }
13117       break;
13118     case 1:
13119       switch (d->vmode)
13120         {
13121         case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13122         case V8QImode: gen = gen_aarch64_rev16v8qi;  break;
13123         case V8HImode: gen = gen_aarch64_rev32v8hi;  break;
13124         case V4HImode: gen = gen_aarch64_rev32v4hi;  break;
13125         case V4SImode: gen = gen_aarch64_rev64v4si;  break;
13126         case V2SImode: gen = gen_aarch64_rev64v2si;  break;
13127         case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
13128         case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
13129         case V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
13130         case V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
13131         default:
13132           return false;
13133         }
13134       break;
13135     default:
13136       return false;
13137     }
13138
13139   for (i = 0; i < nelt ; i += diff + 1)
13140     for (j = 0; j <= diff; j += 1)
13141       {
13142         /* This is guaranteed to be true as the value of diff
13143            is 7, 3, 1 and we should have enough elements in the
13144            queue to generate this.  Getting a vector mask with a
13145            value of diff other than these values implies that
13146            something is wrong by the time we get here.  */
13147         gcc_assert (i + j < nelt);
13148         if (d->perm[i + j] != i + diff - j)
13149           return false;
13150       }
13151
13152   /* Success! */
13153   if (d->testing_p)
13154     return true;
13155
13156   emit_insn (gen (d->target, d->op0));
13157   return true;
13158 }
13159
13160 static bool
13161 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13162 {
13163   rtx (*gen) (rtx, rtx, rtx);
13164   rtx out = d->target;
13165   rtx in0;
13166   machine_mode vmode = d->vmode;
13167   unsigned int i, elt, nelt = d->nelt;
13168   rtx lane;
13169
13170   elt = d->perm[0];
13171   for (i = 1; i < nelt; i++)
13172     {
13173       if (elt != d->perm[i])
13174         return false;
13175     }
13176
13177   /* The generic preparation in aarch64_expand_vec_perm_const_1
13178      swaps the operand order and the permute indices if it finds
13179      d->perm[0] to be in the second operand.  Thus, we can always
13180      use d->op0 and need not do any extra arithmetic to get the
13181      correct lane number.  */
13182   in0 = d->op0;
13183   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
13184
13185   switch (vmode)
13186     {
13187     case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13188     case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13189     case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13190     case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13191     case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13192     case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13193     case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13194     case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13195     case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13196     case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13197     case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13198     case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13199     default:
13200       return false;
13201     }
13202
13203   emit_insn (gen (out, in0, lane));
13204   return true;
13205 }
13206
13207 static bool
13208 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13209 {
13210   rtx rperm[MAX_VECT_LEN], sel;
13211   machine_mode vmode = d->vmode;
13212   unsigned int i, nelt = d->nelt;
13213
13214   if (d->testing_p)
13215     return true;
13216
13217   /* Generic code will try constant permutation twice.  Once with the
13218      original mode and again with the elements lowered to QImode.
13219      So wait and don't do the selector expansion ourselves.  */
13220   if (vmode != V8QImode && vmode != V16QImode)
13221     return false;
13222
13223   for (i = 0; i < nelt; ++i)
13224     {
13225       int nunits = GET_MODE_NUNITS (vmode);
13226
13227       /* If big-endian and two vectors we end up with a weird mixed-endian
13228          mode on NEON.  Reverse the index within each word but not the word
13229          itself.  */
13230       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13231                                            : d->perm[i]);
13232     }
13233   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13234   sel = force_reg (vmode, sel);
13235
13236   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13237   return true;
13238 }
13239
13240 static bool
13241 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13242 {
13243   /* The pattern matching functions above are written to look for a small
13244      number to begin the sequence (0, 1, N/2).  If we begin with an index
13245      from the second operand, we can swap the operands.  */
13246   if (d->perm[0] >= d->nelt)
13247     {
13248       unsigned i, nelt = d->nelt;
13249
13250       gcc_assert (nelt == (nelt & -nelt));
13251       for (i = 0; i < nelt; ++i)
13252         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
13253
13254       std::swap (d->op0, d->op1);
13255     }
13256
13257   if (TARGET_SIMD)
13258     {
13259       if (aarch64_evpc_rev (d))
13260         return true;
13261       else if (aarch64_evpc_ext (d))
13262         return true;
13263       else if (aarch64_evpc_dup (d))
13264         return true;
13265       else if (aarch64_evpc_zip (d))
13266         return true;
13267       else if (aarch64_evpc_uzp (d))
13268         return true;
13269       else if (aarch64_evpc_trn (d))
13270         return true;
13271       return aarch64_evpc_tbl (d);
13272     }
13273   return false;
13274 }
13275
13276 /* Expand a vec_perm_const pattern.  */
13277
13278 bool
13279 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13280 {
13281   struct expand_vec_perm_d d;
13282   int i, nelt, which;
13283
13284   d.target = target;
13285   d.op0 = op0;
13286   d.op1 = op1;
13287
13288   d.vmode = GET_MODE (target);
13289   gcc_assert (VECTOR_MODE_P (d.vmode));
13290   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13291   d.testing_p = false;
13292
13293   for (i = which = 0; i < nelt; ++i)
13294     {
13295       rtx e = XVECEXP (sel, 0, i);
13296       int ei = INTVAL (e) & (2 * nelt - 1);
13297       which |= (ei < nelt ? 1 : 2);
13298       d.perm[i] = ei;
13299     }
13300
13301   switch (which)
13302     {
13303     default:
13304       gcc_unreachable ();
13305
13306     case 3:
13307       d.one_vector_p = false;
13308       if (!rtx_equal_p (op0, op1))
13309         break;
13310
13311       /* The elements of PERM do not suggest that only the first operand
13312          is used, but both operands are identical.  Allow easier matching
13313          of the permutation by folding the permutation into the single
13314          input vector.  */
13315       /* Fall Through.  */
13316     case 2:
13317       for (i = 0; i < nelt; ++i)
13318         d.perm[i] &= nelt - 1;
13319       d.op0 = op1;
13320       d.one_vector_p = true;
13321       break;
13322
13323     case 1:
13324       d.op1 = op0;
13325       d.one_vector_p = true;
13326       break;
13327     }
13328
13329   return aarch64_expand_vec_perm_const_1 (&d);
13330 }
13331
13332 static bool
13333 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13334                                      const unsigned char *sel)
13335 {
13336   struct expand_vec_perm_d d;
13337   unsigned int i, nelt, which;
13338   bool ret;
13339
13340   d.vmode = vmode;
13341   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13342   d.testing_p = true;
13343   memcpy (d.perm, sel, nelt);
13344
13345   /* Calculate whether all elements are in one vector.  */
13346   for (i = which = 0; i < nelt; ++i)
13347     {
13348       unsigned char e = d.perm[i];
13349       gcc_assert (e < 2 * nelt);
13350       which |= (e < nelt ? 1 : 2);
13351     }
13352
13353   /* If all elements are from the second vector, reindex as if from the
13354      first vector.  */
13355   if (which == 2)
13356     for (i = 0; i < nelt; ++i)
13357       d.perm[i] -= nelt;
13358
13359   /* Check whether the mask can be applied to a single vector.  */
13360   d.one_vector_p = (which != 3);
13361
13362   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13363   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13364   if (!d.one_vector_p)
13365     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13366
13367   start_sequence ();
13368   ret = aarch64_expand_vec_perm_const_1 (&d);
13369   end_sequence ();
13370
13371   return ret;
13372 }
13373
13374 rtx
13375 aarch64_reverse_mask (enum machine_mode mode)
13376 {
13377   /* We have to reverse each vector because we dont have
13378      a permuted load that can reverse-load according to ABI rules.  */
13379   rtx mask;
13380   rtvec v = rtvec_alloc (16);
13381   int i, j;
13382   int nunits = GET_MODE_NUNITS (mode);
13383   int usize = GET_MODE_UNIT_SIZE (mode);
13384
13385   gcc_assert (BYTES_BIG_ENDIAN);
13386   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13387
13388   for (i = 0; i < nunits; i++)
13389     for (j = 0; j < usize; j++)
13390       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13391   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13392   return force_reg (V16QImode, mask);
13393 }
13394
13395 /* Implement MODES_TIEABLE_P.  In principle we should always return true.
13396    However due to issues with register allocation it is preferable to avoid
13397    tieing integer scalar and FP scalar modes.  Executing integer operations
13398    in general registers is better than treating them as scalar vector
13399    operations.  This reduces latency and avoids redundant int<->FP moves.
13400    So tie modes if they are either the same class, or vector modes with
13401    other vector modes, vector structs or any scalar mode.
13402 */
13403
13404 bool
13405 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13406 {
13407   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13408     return true;
13409
13410   /* We specifically want to allow elements of "structure" modes to
13411      be tieable to the structure.  This more general condition allows
13412      other rarer situations too.  */
13413   if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13414     return true;
13415
13416   /* Also allow any scalar modes with vectors.  */
13417   if (aarch64_vector_mode_supported_p (mode1)
13418       || aarch64_vector_mode_supported_p (mode2))
13419     return true;
13420
13421   return false;
13422 }
13423
13424 /* Return a new RTX holding the result of moving POINTER forward by
13425    AMOUNT bytes.  */
13426
13427 static rtx
13428 aarch64_move_pointer (rtx pointer, int amount)
13429 {
13430   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13431
13432   return adjust_automodify_address (pointer, GET_MODE (pointer),
13433                                     next, amount);
13434 }
13435
13436 /* Return a new RTX holding the result of moving POINTER forward by the
13437    size of the mode it points to.  */
13438
13439 static rtx
13440 aarch64_progress_pointer (rtx pointer)
13441 {
13442   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13443
13444   return aarch64_move_pointer (pointer, amount);
13445 }
13446
13447 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13448    MODE bytes.  */
13449
13450 static void
13451 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13452                                               machine_mode mode)
13453 {
13454   rtx reg = gen_reg_rtx (mode);
13455
13456   /* "Cast" the pointers to the correct mode.  */
13457   *src = adjust_address (*src, mode, 0);
13458   *dst = adjust_address (*dst, mode, 0);
13459   /* Emit the memcpy.  */
13460   emit_move_insn (reg, *src);
13461   emit_move_insn (*dst, reg);
13462   /* Move the pointers forward.  */
13463   *src = aarch64_progress_pointer (*src);
13464   *dst = aarch64_progress_pointer (*dst);
13465 }
13466
13467 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
13468    we succeed, otherwise return false.  */
13469
13470 bool
13471 aarch64_expand_movmem (rtx *operands)
13472 {
13473   unsigned int n;
13474   rtx dst = operands[0];
13475   rtx src = operands[1];
13476   rtx base;
13477   bool speed_p = !optimize_function_for_size_p (cfun);
13478
13479   /* When optimizing for size, give a better estimate of the length of a
13480      memcpy call, but use the default otherwise.  */
13481   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13482
13483   /* We can't do anything smart if the amount to copy is not constant.  */
13484   if (!CONST_INT_P (operands[2]))
13485     return false;
13486
13487   n = UINTVAL (operands[2]);
13488
13489   /* Try to keep the number of instructions low.  For cases below 16 bytes we
13490      need to make at most two moves.  For cases above 16 bytes it will be one
13491      move for each 16 byte chunk, then at most two additional moves.  */
13492   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13493     return false;
13494
13495   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13496   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13497
13498   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13499   src = adjust_automodify_address (src, VOIDmode, base, 0);
13500
13501   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13502      1-byte chunk.  */
13503   if (n < 4)
13504     {
13505       if (n >= 2)
13506         {
13507           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13508           n -= 2;
13509         }
13510
13511       if (n == 1)
13512         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13513
13514       return true;
13515     }
13516
13517   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
13518      4-byte chunk, partially overlapping with the previously copied chunk.  */
13519   if (n < 8)
13520     {
13521       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13522       n -= 4;
13523       if (n > 0)
13524         {
13525           int move = n - 4;
13526
13527           src = aarch64_move_pointer (src, move);
13528           dst = aarch64_move_pointer (dst, move);
13529           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13530         }
13531       return true;
13532     }
13533
13534   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
13535      them, then (if applicable) an 8-byte chunk.  */
13536   while (n >= 8)
13537     {
13538       if (n / 16)
13539         {
13540           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13541           n -= 16;
13542         }
13543       else
13544         {
13545           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13546           n -= 8;
13547         }
13548     }
13549
13550   /* Finish the final bytes of the copy.  We can always do this in one
13551      instruction.  We either copy the exact amount we need, or partially
13552      overlap with the previous chunk we copied and copy 8-bytes.  */
13553   if (n == 0)
13554     return true;
13555   else if (n == 1)
13556     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13557   else if (n == 2)
13558     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13559   else if (n == 4)
13560     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13561   else
13562     {
13563       if (n == 3)
13564         {
13565           src = aarch64_move_pointer (src, -1);
13566           dst = aarch64_move_pointer (dst, -1);
13567           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13568         }
13569       else
13570         {
13571           int move = n - 8;
13572
13573           src = aarch64_move_pointer (src, move);
13574           dst = aarch64_move_pointer (dst, move);
13575           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13576         }
13577     }
13578
13579   return true;
13580 }
13581
13582 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13583    SImode stores.  Handle the case when the constant has identical
13584    bottom and top halves.  This is beneficial when the two stores can be
13585    merged into an STP and we avoid synthesising potentially expensive
13586    immediates twice.  Return true if such a split is possible.  */
13587
13588 bool
13589 aarch64_split_dimode_const_store (rtx dst, rtx src)
13590 {
13591   rtx lo = gen_lowpart (SImode, src);
13592   rtx hi = gen_highpart_mode (SImode, DImode, src);
13593
13594   bool size_p = optimize_function_for_size_p (cfun);
13595
13596   if (!rtx_equal_p (lo, hi))
13597     return false;
13598
13599   unsigned int orig_cost
13600     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13601   unsigned int lo_cost
13602     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13603
13604   /* We want to transform:
13605      MOV        x1, 49370
13606      MOVK       x1, 0x140, lsl 16
13607      MOVK       x1, 0xc0da, lsl 32
13608      MOVK       x1, 0x140, lsl 48
13609      STR        x1, [x0]
13610    into:
13611      MOV        w1, 49370
13612      MOVK       w1, 0x140, lsl 16
13613      STP        w1, w1, [x0]
13614    So we want to perform this only when we save two instructions
13615    or more.  When optimizing for size, however, accept any code size
13616    savings we can.  */
13617   if (size_p && orig_cost <= lo_cost)
13618     return false;
13619
13620   if (!size_p
13621       && (orig_cost <= lo_cost + 1))
13622     return false;
13623
13624   rtx mem_lo = adjust_address (dst, SImode, 0);
13625   if (!aarch64_mem_pair_operand (mem_lo, SImode))
13626     return false;
13627
13628   rtx tmp_reg = gen_reg_rtx (SImode);
13629   aarch64_expand_mov_immediate (tmp_reg, lo);
13630   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13631   /* Don't emit an explicit store pair as this may not be always profitable.
13632      Let the sched-fusion logic decide whether to merge them.  */
13633   emit_move_insn (mem_lo, tmp_reg);
13634   emit_move_insn (mem_hi, tmp_reg);
13635
13636   return true;
13637 }
13638
13639 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
13640
13641 static unsigned HOST_WIDE_INT
13642 aarch64_asan_shadow_offset (void)
13643 {
13644   return (HOST_WIDE_INT_1 << 36);
13645 }
13646
13647 static bool
13648 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13649                                         unsigned int align,
13650                                         enum by_pieces_operation op,
13651                                         bool speed_p)
13652 {
13653   /* STORE_BY_PIECES can be used when copying a constant string, but
13654      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13655      For now we always fail this and let the move_by_pieces code copy
13656      the string from read-only memory.  */
13657   if (op == STORE_BY_PIECES)
13658     return false;
13659
13660   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13661 }
13662
13663 static rtx
13664 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13665                         int code, tree treeop0, tree treeop1)
13666 {
13667   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13668   rtx op0, op1;
13669   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13670   insn_code icode;
13671   struct expand_operand ops[4];
13672
13673   start_sequence ();
13674   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13675
13676   op_mode = GET_MODE (op0);
13677   if (op_mode == VOIDmode)
13678     op_mode = GET_MODE (op1);
13679
13680   switch (op_mode)
13681     {
13682     case QImode:
13683     case HImode:
13684     case SImode:
13685       cmp_mode = SImode;
13686       icode = CODE_FOR_cmpsi;
13687       break;
13688
13689     case DImode:
13690       cmp_mode = DImode;
13691       icode = CODE_FOR_cmpdi;
13692       break;
13693
13694     case SFmode:
13695       cmp_mode = SFmode;
13696       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13697       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13698       break;
13699
13700     case DFmode:
13701       cmp_mode = DFmode;
13702       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13703       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13704       break;
13705
13706     default:
13707       end_sequence ();
13708       return NULL_RTX;
13709     }
13710
13711   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13712   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13713   if (!op0 || !op1)
13714     {
13715       end_sequence ();
13716       return NULL_RTX;
13717     }
13718   *prep_seq = get_insns ();
13719   end_sequence ();
13720
13721   create_fixed_operand (&ops[0], op0);
13722   create_fixed_operand (&ops[1], op1);
13723
13724   start_sequence ();
13725   if (!maybe_expand_insn (icode, 2, ops))
13726     {
13727       end_sequence ();
13728       return NULL_RTX;
13729     }
13730   *gen_seq = get_insns ();
13731   end_sequence ();
13732
13733   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13734                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13735 }
13736
13737 static rtx
13738 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13739                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
13740 {
13741   rtx op0, op1, target;
13742   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13743   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13744   insn_code icode;
13745   struct expand_operand ops[6];
13746   int aarch64_cond;
13747
13748   push_to_sequence (*prep_seq);
13749   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13750
13751   op_mode = GET_MODE (op0);
13752   if (op_mode == VOIDmode)
13753     op_mode = GET_MODE (op1);
13754
13755   switch (op_mode)
13756     {
13757     case QImode:
13758     case HImode:
13759     case SImode:
13760       cmp_mode = SImode;
13761       icode = CODE_FOR_ccmpsi;
13762       break;
13763
13764     case DImode:
13765       cmp_mode = DImode;
13766       icode = CODE_FOR_ccmpdi;
13767       break;
13768
13769     case SFmode:
13770       cmp_mode = SFmode;
13771       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13772       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13773       break;
13774
13775     case DFmode:
13776       cmp_mode = DFmode;
13777       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13778       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13779       break;
13780
13781     default:
13782       end_sequence ();
13783       return NULL_RTX;
13784     }
13785
13786   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13787   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13788   if (!op0 || !op1)
13789     {
13790       end_sequence ();
13791       return NULL_RTX;
13792     }
13793   *prep_seq = get_insns ();
13794   end_sequence ();
13795
13796   target = gen_rtx_REG (cc_mode, CC_REGNUM);
13797   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13798
13799   if (bit_code != AND)
13800     {
13801       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13802                                                 GET_MODE (XEXP (prev, 0))),
13803                              VOIDmode, XEXP (prev, 0), const0_rtx);
13804       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13805     }
13806
13807   create_fixed_operand (&ops[0], XEXP (prev, 0));
13808   create_fixed_operand (&ops[1], target);
13809   create_fixed_operand (&ops[2], op0);
13810   create_fixed_operand (&ops[3], op1);
13811   create_fixed_operand (&ops[4], prev);
13812   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13813
13814   push_to_sequence (*gen_seq);
13815   if (!maybe_expand_insn (icode, 6, ops))
13816     {
13817       end_sequence ();
13818       return NULL_RTX;
13819     }
13820
13821   *gen_seq = get_insns ();
13822   end_sequence ();
13823
13824   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13825 }
13826
/* Point the TARGET_GEN_CCMP_FIRST and TARGET_GEN_CCMP_NEXT hook macros at
   the conditional-compare expanders aarch64_gen_ccmp_first and
   aarch64_gen_ccmp_next defined in this file.  */
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first

#undef TARGET_GEN_CCMP_NEXT
#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13832
13833 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
13834    instruction fusion of some sort.  */
13835
13836 static bool
13837 aarch64_macro_fusion_p (void)
13838 {
13839   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13840 }
13841
13842
13843 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
13844    should be kept together during scheduling.  */
13845
13846 static bool
13847 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13848 {
13849   rtx set_dest;
13850   rtx prev_set = single_set (prev);
13851   rtx curr_set = single_set (curr);
13852   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
13853   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13854
13855   if (!aarch64_macro_fusion_p ())
13856     return false;
13857
13858   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13859     {
13860       /* We are trying to match:
13861          prev (mov)  == (set (reg r0) (const_int imm16))
13862          curr (movk) == (set (zero_extract (reg r0)
13863                                            (const_int 16)
13864                                            (const_int 16))
13865                              (const_int imm16_1))  */
13866
13867       set_dest = SET_DEST (curr_set);
13868
13869       if (GET_CODE (set_dest) == ZERO_EXTRACT
13870           && CONST_INT_P (SET_SRC (curr_set))
13871           && CONST_INT_P (SET_SRC (prev_set))
13872           && CONST_INT_P (XEXP (set_dest, 2))
13873           && INTVAL (XEXP (set_dest, 2)) == 16
13874           && REG_P (XEXP (set_dest, 0))
13875           && REG_P (SET_DEST (prev_set))
13876           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13877         {
13878           return true;
13879         }
13880     }
13881
13882   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13883     {
13884
13885       /*  We're trying to match:
13886           prev (adrp) == (set (reg r1)
13887                               (high (symbol_ref ("SYM"))))
13888           curr (add) == (set (reg r0)
13889                              (lo_sum (reg r1)
13890                                      (symbol_ref ("SYM"))))
13891           Note that r0 need not necessarily be the same as r1, especially
13892           during pre-regalloc scheduling.  */
13893
13894       if (satisfies_constraint_Ush (SET_SRC (prev_set))
13895           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13896         {
13897           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13898               && REG_P (XEXP (SET_SRC (curr_set), 0))
13899               && REGNO (XEXP (SET_SRC (curr_set), 0))
13900                  == REGNO (SET_DEST (prev_set))
13901               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13902                               XEXP (SET_SRC (curr_set), 1)))
13903             return true;
13904         }
13905     }
13906
13907   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13908     {
13909
13910       /* We're trying to match:
13911          prev (movk) == (set (zero_extract (reg r0)
13912                                            (const_int 16)
13913                                            (const_int 32))
13914                              (const_int imm16_1))
13915          curr (movk) == (set (zero_extract (reg r0)
13916                                            (const_int 16)
13917                                            (const_int 48))
13918                              (const_int imm16_2))  */
13919
13920       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13921           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13922           && REG_P (XEXP (SET_DEST (prev_set), 0))
13923           && REG_P (XEXP (SET_DEST (curr_set), 0))
13924           && REGNO (XEXP (SET_DEST (prev_set), 0))
13925              == REGNO (XEXP (SET_DEST (curr_set), 0))
13926           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13927           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13928           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13929           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13930           && CONST_INT_P (SET_SRC (prev_set))
13931           && CONST_INT_P (SET_SRC (curr_set)))
13932         return true;
13933
13934     }
13935   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13936     {
13937       /* We're trying to match:
13938           prev (adrp) == (set (reg r0)
13939                               (high (symbol_ref ("SYM"))))
13940           curr (ldr) == (set (reg r1)
13941                              (mem (lo_sum (reg r0)
13942                                              (symbol_ref ("SYM")))))
13943                  or
13944           curr (ldr) == (set (reg r1)
13945                              (zero_extend (mem
13946                                            (lo_sum (reg r0)
13947                                                    (symbol_ref ("SYM"))))))  */
13948       if (satisfies_constraint_Ush (SET_SRC (prev_set))
13949           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13950         {
13951           rtx curr_src = SET_SRC (curr_set);
13952
13953           if (GET_CODE (curr_src) == ZERO_EXTEND)
13954             curr_src = XEXP (curr_src, 0);
13955
13956           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13957               && REG_P (XEXP (XEXP (curr_src, 0), 0))
13958               && REGNO (XEXP (XEXP (curr_src, 0), 0))
13959                  == REGNO (SET_DEST (prev_set))
13960               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13961                               XEXP (SET_SRC (prev_set), 0)))
13962               return true;
13963         }
13964     }
13965
13966   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13967        && aarch_crypto_can_dual_issue (prev, curr))
13968     return true;
13969
13970   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13971       && any_condjump_p (curr))
13972     {
13973       enum attr_type prev_type = get_attr_type (prev);
13974
13975       /* FIXME: this misses some which is considered simple arthematic
13976          instructions for ThunderX.  Simple shifts are missed here.  */
13977       if (prev_type == TYPE_ALUS_SREG
13978           || prev_type == TYPE_ALUS_IMM
13979           || prev_type == TYPE_LOGICS_REG
13980           || prev_type == TYPE_LOGICS_IMM)
13981         return true;
13982     }
13983
13984   return false;
13985 }
13986
13987 /* Return true iff the instruction fusion described by OP is enabled.  */
13988
13989 bool
13990 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13991 {
13992   return (aarch64_tune_params.fusible_ops & op) != 0;
13993 }
13994
13995 /* If MEM is in the form of [base+offset], extract the two parts
13996    of address and set to BASE and OFFSET, otherwise return false
13997    after clearing BASE and OFFSET.  */
13998
13999 bool
14000 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14001 {
14002   rtx addr;
14003
14004   gcc_assert (MEM_P (mem));
14005
14006   addr = XEXP (mem, 0);
14007
14008   if (REG_P (addr))
14009     {
14010       *base = addr;
14011       *offset = const0_rtx;
14012       return true;
14013     }
14014
14015   if (GET_CODE (addr) == PLUS
14016       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14017     {
14018       *base = XEXP (addr, 0);
14019       *offset = XEXP (addr, 1);
14020       return true;
14021     }
14022
14023   *base = NULL_RTX;
14024   *offset = NULL_RTX;
14025
14026   return false;
14027 }
14028
/* Types for scheduling fusion, as returned by fusion_load_store.  The
   numeric value takes part in the fusion priority computed by
   aarch64_sched_fusion_priority.  */
enum sched_fusion_type
{
  /* Not a load or store that can take part in fusion.  */
  SCHED_FUSION_NONE = 0,
  /* Load whose source is a SIGN_EXTEND of an SImode MEM.  */
  SCHED_FUSION_LD_SIGN_EXTEND = 1,
  /* Load whose source is a ZERO_EXTEND of an SImode MEM.  */
  SCHED_FUSION_LD_ZERO_EXTEND = 2,
  /* Plain load from memory into a register.  */
  SCHED_FUSION_LD = 3,
  /* Store of a register or of constant zero to memory.  */
  SCHED_FUSION_ST = 4,
  /* One past the last fusion type.  */
  SCHED_FUSION_NUM = 5
};
14039
14040 /* If INSN is a load or store of address in the form of [base+offset],
14041    extract the two parts and set to BASE and OFFSET.  Return scheduling
14042    fusion type this INSN is.  */
14043
14044 static enum sched_fusion_type
14045 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14046 {
14047   rtx x, dest, src;
14048   enum sched_fusion_type fusion = SCHED_FUSION_LD;
14049
14050   gcc_assert (INSN_P (insn));
14051   x = PATTERN (insn);
14052   if (GET_CODE (x) != SET)
14053     return SCHED_FUSION_NONE;
14054
14055   src = SET_SRC (x);
14056   dest = SET_DEST (x);
14057
14058   machine_mode dest_mode = GET_MODE (dest);
14059
14060   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14061     return SCHED_FUSION_NONE;
14062
14063   if (GET_CODE (src) == SIGN_EXTEND)
14064     {
14065       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14066       src = XEXP (src, 0);
14067       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14068         return SCHED_FUSION_NONE;
14069     }
14070   else if (GET_CODE (src) == ZERO_EXTEND)
14071     {
14072       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14073       src = XEXP (src, 0);
14074       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14075         return SCHED_FUSION_NONE;
14076     }
14077
14078   if (GET_CODE (src) == MEM && REG_P (dest))
14079     extract_base_offset_in_addr (src, base, offset);
14080   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14081     {
14082       fusion = SCHED_FUSION_ST;
14083       extract_base_offset_in_addr (dest, base, offset);
14084     }
14085   else
14086     return SCHED_FUSION_NONE;
14087
14088   if (*base == NULL_RTX || *offset == NULL_RTX)
14089     fusion = SCHED_FUSION_NONE;
14090
14091   return fusion;
14092 }
14093
14094 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14095
14096    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14097    and PRI are only calculated for these instructions.  For other instruction,
14098    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
14099    type instruction fusion can be added by returning different priorities.
14100
14101    It's important that irrelevant instructions get the largest FUSION_PRI.  */
14102
14103 static void
14104 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14105                                int *fusion_pri, int *pri)
14106 {
14107   int tmp, off_val;
14108   rtx base, offset;
14109   enum sched_fusion_type fusion;
14110
14111   gcc_assert (INSN_P (insn));
14112
14113   tmp = max_pri - 1;
14114   fusion = fusion_load_store (insn, &base, &offset);
14115   if (fusion == SCHED_FUSION_NONE)
14116     {
14117       *pri = tmp;
14118       *fusion_pri = tmp;
14119       return;
14120     }
14121
14122   /* Set FUSION_PRI according to fusion type and base register.  */
14123   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14124
14125   /* Calculate PRI.  */
14126   tmp /= 2;
14127
14128   /* INSN with smaller offset goes first.  */
14129   off_val = (int)(INTVAL (offset));
14130   if (off_val >= 0)
14131     tmp -= (off_val & 0xfffff);
14132   else
14133     tmp += ((- off_val) & 0xfffff);
14134
14135   *pri = tmp;
14136   return;
14137 }
14138
14139 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14140    Adjust priority of sha1h instructions so they are scheduled before
14141    other SHA1 instructions.  */
14142
14143 static int
14144 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14145 {
14146   rtx x = PATTERN (insn);
14147
14148   if (GET_CODE (x) == SET)
14149     {
14150       x = SET_SRC (x);
14151
14152       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14153         return priority + 10;
14154     }
14155
14156   return priority;
14157 }
14158
14159 /* Return true if the two consecutive loads/stores described by OPERANDS
14160    can be merged into one ldp/stp.  LOAD is true for loads and false for
14161    stores.  MODE is the mode of the memory operands.  */
14162
14163 bool
14164 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14165                                 enum machine_mode mode)
14166 {
14167   /* For loads OPERANDS is {reg, mem, reg, mem}; for stores it is
14168      {mem, reg, mem, reg}.  The "a" operands belong to the first access
14169      and the "b" operands to the second.  */
14170   rtx mem_a = operands[load ? 1 : 0];
14171   rtx mem_b = operands[load ? 3 : 2];
14172   rtx reg_a = operands[load ? 0 : 1];
14173   rtx reg_b = operands[load ? 2 : 3];
14174
14175   if (load)
14176     {
14177       gcc_assert (REG_P (reg_a) && REG_P (reg_b));
14178
14179       /* Reject two loads into the same destination register.  */
14180       if (REGNO (reg_a) == REGNO (reg_b))
14181         return false;
14182     }
14183
14184   /* Volatile accesses are never paired.  */
14185   if (MEM_VOLATILE_P (mem_a) || MEM_VOLATILE_P (mem_b))
14186     return false;
14187
14188   /* With SImode, when the tuning flags mark unaligned ldp as slow and we
14189      are not optimizing for size, require the first mem to be at least
14190      8-byte aligned.  */
14191   const bool slow_unaligned_ldpw_p
14192     = (aarch64_tune_params.extra_tuning_flags
14193        & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) != 0;
14194   if (mode == SImode
14195       && slow_unaligned_ldpw_p
14196       && !optimize_size
14197       && MEM_ALIGN (mem_a) < 8 * BITS_PER_UNIT)
14198     return false;
14199
14200   /* Both addresses must have the form [base + offset].  */
14201   rtx base_a, base_b, off_a, off_b;
14202   extract_base_offset_in_addr (mem_a, &base_a, &off_a);
14203   if (base_a == NULL_RTX || off_a == NULL_RTX)
14204     return false;
14205   extract_base_offset_in_addr (mem_b, &base_b, &off_b);
14206   if (base_b == NULL_RTX || off_b == NULL_RTX)
14207     return false;
14208
14209   /* Both addresses must use the same base.  */
14210   if (!rtx_equal_p (base_a, base_b))
14211     return false;
14212
14213   /* The two offsets must be exactly one access size apart, in either
14214      order.  */
14215   const HOST_WIDE_INT offval_a = INTVAL (off_a);
14216   const HOST_WIDE_INT offval_b = INTVAL (off_b);
14217   const HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
14218   if (!(offval_a == offval_b + msize || offval_b == offval_a + msize))
14219     return false;
14220
14221   /* Check if the addresses are clobbered by load.  */
14222   if (load)
14223     {
14224       if (reg_mentioned_p (reg_a, mem_a))
14225         return false;
14226
14227       /* In increasing order the last load is allowed to clobber the
14228          address, so the second destination only needs checking when
14229          the first offset is the higher one.  */
14230       if (offval_a > offval_b && reg_mentioned_p (reg_b, mem_b))
14231         return false;
14232     }
14233
14234   /* A REG satisfying FP_REGNUM_P is classed as FP_REGS; every other
14235      operand is classed as GENERAL_REGS.  */
14236   enum reg_class rclass_a = GENERAL_REGS;
14237   if (REG_P (reg_a) && FP_REGNUM_P (REGNO (reg_a)))
14238     rclass_a = FP_REGS;
14239
14240   enum reg_class rclass_b = GENERAL_REGS;
14241   if (REG_P (reg_b) && FP_REGNUM_P (REGNO (reg_b)))
14242     rclass_b = FP_REGS;
14243
14244   /* Pairing is only possible when both operands are in the same
14245      class.  */
14246   return rclass_a == rclass_b;
14247 }
14248
14249 /* Return true if the four consecutive loads/stores in OPERANDS can be
14250    merged into two ldp/stp once their offsets have been adjusted.  LOAD
14251    is true for loads.  MODE is the mode of the memory operands.
14252
14253    Given the consecutive stores below:
14254
14255      str  w1, [xb, 0x100]
14256      str  w1, [xb, 0x104]
14257      str  w1, [xb, 0x108]
14258      str  w1, [xb, 0x10c]
14259
14260    the offsets are out of the range supported by stp, but the stores can
14261    still be paired after adjusting the offset, like:
14262
14263      add  scratch, xb, 0x100
14264      stp  w1, w1, [scratch]
14265      stp  w1, w1, [scratch, 0x8]
14266
14267    The peephole patterns detecting this opportunity must guarantee that
14268    the scratch register is available.  */
14269
14270 bool
14271 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14272                                        enum machine_mode mode)
14273 {
14274   /* For loads OPERANDS holds four (reg, mem) pairs; for stores it holds
14275      four (mem, reg) pairs.  */
14276   const int reg_idx = load ? 0 : 1;
14277   const int mem_idx = load ? 1 : 0;
14278   rtx reg[4], mem[4];
14279   for (int i = 0; i < 4; i++)
14280     {
14281       reg[i] = operands[2 * i + reg_idx];
14282       mem[i] = operands[2 * i + mem_idx];
14283     }
14284
14285   if (load)
14286     {
14287       gcc_assert (REG_P (reg[0]) && REG_P (reg[1])
14288                   && REG_P (reg[2]) && REG_P (reg[3]));
14289
14290       /* Neither of the two resulting pairs may load into the same
14291          register twice.  */
14292       if (REGNO (reg[0]) == REGNO (reg[1])
14293           || REGNO (reg[2]) == REGNO (reg[3]))
14294         return false;
14295     }
14296
14297   /* Skip if the first memory operand is by itself valid for ldp/stp.  */
14298   if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
14299     return false;
14300
14301   /* The mems cannot be volatile.  */
14302   for (int i = 0; i < 4; i++)
14303     if (MEM_VOLATILE_P (mem[i]))
14304       return false;
14305
14306   /* Every address must have the form [base + offset].  */
14307   rtx base[4], offset[4];
14308   for (int i = 0; i < 4; i++)
14309     {
14310       extract_base_offset_in_addr (mem[i], &base[i], &offset[i]);
14311       if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
14312         return false;
14313     }
14314
14315   /* All four addresses must share the same base.  */
14316   for (int i = 0; i < 3; i++)
14317     if (!rtx_equal_p (base[i], base[i + 1]))
14318       return false;
14319
14320   HOST_WIDE_INT offval[4];
14321   for (int i = 0; i < 4; i++)
14322     offval[i] = INTVAL (offset[i]);
14323   const HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
14324
14325   /* The offsets must be consecutive: each one MSIZE away from the next,
14326      either counting down from the first access or up to the last.  */
14327   const bool descending_p = (offval[0] == offval[1] + msize
14328                              && offval[0] == offval[2] + msize * 2
14329                              && offval[0] == offval[3] + msize * 3);
14330   if (!descending_p
14331       && !(offval[3] == offval[2] + msize
14332            && offval[3] == offval[1] + msize * 2
14333            && offval[3] == offval[0] + msize * 3))
14334     return false;
14335
14336   /* Check if the addresses are clobbered by load.  */
14337   if (load)
14338     {
14339       for (int i = 0; i < 3; i++)
14340         if (reg_mentioned_p (reg[i], mem[i]))
14341           return false;
14342
14343       /* In increasing order the last load is allowed to clobber the
14344          address, so the fourth destination only needs checking when the
14345          first offset is above the second.  */
14346       if (offval[0] > offval[1] && reg_mentioned_p (reg[3], mem[3]))
14347         return false;
14348     }
14349
14350   /* With SImode, when the tuning flags mark unaligned ldp as slow and we
14351      are not optimizing for size, require the first mem to be at least
14352      8-byte aligned.  */
14353   const bool slow_unaligned_ldpw_p
14354     = (aarch64_tune_params.extra_tuning_flags
14355        & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) != 0;
14356   if (mode == SImode
14357       && slow_unaligned_ldpw_p
14358       && !optimize_size
14359       && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
14360     return false;
14361
14362   /* A REG satisfying FP_REGNUM_P is classed as FP_REGS; every other
14363      operand is classed as GENERAL_REGS.  */
14364   enum reg_class rclass[4];
14365   for (int i = 0; i < 4; i++)
14366     {
14367       if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
14368         rclass[i] = FP_REGS;
14369       else
14370         rclass[i] = GENERAL_REGS;
14371     }
14372
14373   /* Check if the registers are of same class.  */
14374   for (int i = 0; i < 3; i++)
14375     if (rclass[i] != rclass[i + 1])
14376       return false;
14377
14378   return true;
14379 }
14396
14397 /* Given OPERANDS of four consecutive loads/stores, pair them into two
14398    ldp/stp instructions that address memory through the scratch register
14399    OPERANDS[8], after emitting an instruction that sets the scratch
14400    register to the common base plus an adjustment.  This depends on the
14401    memory operands being in increasing address order.  LOAD is true for
14402    loads.  MODE is the mode of the memory operands.  CODE is the rtl
14403    operator applied to all memory operands: SIGN_EXTEND, ZERO_EXTEND or
14404    UNKNOWN (always UNKNOWN for stores).  Return true on success; return
14405    false, leaving OPERANDS unchanged, if the offset cannot be adjusted.  */
14406
14407 bool
14408 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14409                              enum machine_mode mode, RTX_CODE code)
14410 {
14411   /* The mems are operands 1, 3, 5 and 7 of loads and operands 0, 2, 4
14412      and 6 of stores.  */
14413   const int mem_idx = load ? 1 : 0;
14414   rtx mem[4];
14415   for (int i = 0; i < 4; i++)
14416     mem[i] = operands[2 * i + mem_idx];
14417
14418   if (!load)
14419     gcc_assert (code == UNKNOWN);
14420
14421   rtx base, offset;
14422   extract_base_offset_in_addr (mem[0], &base, &offset);
14423   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14424
14425   /* Split the first offset into ADJ_OFF, a multiple of STP_OFF_LIMIT that
14426      is added to the base up front, and the remainder NEW_OFF that stays
14427      in the ldp/stp addresses.  Work on the magnitude and restore the sign
14428      afterwards.  */
14429   const HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
14430   const HOST_WIDE_INT stp_off_limit = msize * 0x40;
14431   const HOST_WIDE_INT off_val = INTVAL (offset);
14432   const HOST_WIDE_INT abs_off = (off_val < 0) ? -off_val : off_val;
14433   HOST_WIDE_INT new_off = abs_off % stp_off_limit;
14434   HOST_WIDE_INT adj_off = abs_off - new_off;
14435
14436   /* Move one more STP_OFF_LIMIT into ADJ_OFF if the second pair, which
14437      starts at NEW_OFF + MSIZE * 2, would otherwise reach the limit.  */
14438   if (new_off + msize * 2 >= stp_off_limit)
14439     {
14440       adj_off += stp_off_limit;
14441       new_off -= stp_off_limit;
14442     }
14443
14444   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
14445   if (adj_off >= 0x1000)
14446     return false;
14447
14448   /* Restore the sign of the original offset.  */
14449   if (off_val < 0)
14450     {
14451       adj_off = -adj_off;
14452       new_off = -new_off;
14453     }
14454
14455   /* Rebase the first mem onto the scratch register and give up if the
14456      result is not a valid ldp/stp address.  */
14457   rtx scratch = operands[8];
14458   mem[0] = change_address (mem[0], VOIDmode,
14459                            plus_constant (DImode, scratch, new_off));
14460   if (!aarch64_mem_pair_operand (mem[0], mode))
14461     return false;
14462
14463   /* The other three mems follow at MSIZE intervals.  */
14464   for (int i = 1; i < 4; i++)
14465     mem[i] = change_address (mem[i], VOIDmode,
14466                              plus_constant (DImode, scratch,
14467                                             new_off + msize * i));
14468
14469   /* Wrap every mem in the requested extension, if any.  */
14470   if (code == ZERO_EXTEND)
14471     {
14472       for (int i = 0; i < 4; i++)
14473         mem[i] = gen_rtx_ZERO_EXTEND (DImode, mem[i]);
14474     }
14475   else if (code == SIGN_EXTEND)
14476     {
14477       for (int i = 0; i < 4; i++)
14478         mem[i] = gen_rtx_SIGN_EXTEND (DImode, mem[i]);
14479     }
14480
14481   /* Publish the new mems in OPERANDS.  */
14482   for (int i = 0; i < 4; i++)
14483     operands[2 * i + mem_idx] = mem[i];
14484
14485   /* Emit adjusting instruction.  */
14486   emit_insn (gen_rtx_SET (scratch, plus_constant (DImode, base, adj_off)));
14487
14488   /* Emit the two ldp/stp instructions, each a PARALLEL of two SETs built
14489      from four consecutive operands.  */
14490   for (int i = 0; i < 8; i += 4)
14491     {
14492       rtx first = gen_rtx_SET (operands[i], operands[i + 1]);
14493       rtx second = gen_rtx_SET (operands[i + 2], operands[i + 3]);
14494       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, first, second)));
14495     }
14496
14497   return true;
14498 }
14519
14520 /* Return true if a pseudo should hold the GOT address for PIC code.  */
14521
14522 bool
14523 aarch64_use_pseudo_pic_reg (void)
14524 {
14525   const bool small_spic_p = aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14526   return small_spic_p;
14527 }
14528
14529 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  The unspecs UNSPEC_GOTSMALLPIC,
14530    UNSPEC_GOTSMALLPIC28K and UNSPEC_GOTTINYPIC are reported as not
14531    trapping (0); every other unspec X is passed on, together with FLAGS,
14532    to default_unspec_may_trap_p.  */
14533
14534 static int
14535 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14536 {
14537   const int unspec = XINT (x, 1);
14538
14539   if (unspec == UNSPEC_GOTSMALLPIC
14540       || unspec == UNSPEC_GOTSMALLPIC28K
14541       || unspec == UNSPEC_GOTTINYPIC)
14542     return 0;
14543
14544   return default_unspec_may_trap_p (x, flags);
14545 }
14546
14547
14548 /* If X is a CONST_DOUBLE whose value is non-negative, finite, an integer
14549    and a power of 2, return the log2 of that value.  Otherwise return -1.  */
14550
14551 int
14552 aarch64_fpconst_pow_of_2 (rtx x)
14553 {
14554   if (!CONST_DOUBLE_P (x))
14555     return -1;
14556
14557   const REAL_VALUE_TYPE *value = CONST_DOUBLE_REAL_VALUE (x);
14558
14559   /* Negative values, NaNs, infinities and non-integers are rejected.  */
14560   const bool candidate_p = (!REAL_VALUE_NEGATIVE (*value)
14561                             && !REAL_VALUE_ISNAN (*value)
14562                             && !REAL_VALUE_ISINF (*value)
14563                             && real_isinteger (value, DFmode));
14564   if (!candidate_p)
14565     return -1;
14566
14567   return exact_log2 (real_to_integer (value));
14568 }
14569
14570 /* If X is a floating-point CONST_VECTOR whose elements all give the same
14571    positive aarch64_fpconst_pow_of_2 result, return that result.  Otherwise
14572    (including when the common result is 0) return -1.  */
14573
14574 int
14575 aarch64_vec_fpconst_pow_of_2 (rtx x)
14576 {
14577   if (GET_CODE (x) != CONST_VECTOR
14578       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14579     return -1;
14580
14581   const int log2_elt0 = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14582   if (log2_elt0 <= 0)
14583     return -1;
14584
14585   /* Every remaining element must agree with the first one.  */
14586   for (int elt = 1; elt < CONST_VECTOR_NUNITS (x); elt++)
14587     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, elt)) != log2_elt0)
14588       return -1;
14589
14590   return log2_elt0;
14591 }
14592
14593 /* Implement TARGET_PROMOTED_TYPE.  Return float_type_node for a scalar
14594    floating-point type T whose main variant is aarch64_fp16_type_node,
14595    and NULL_TREE for any other type.
14596
14597    __fp16 always promotes through this hook.  _Float16 may promote if
14598    TARGET_FLT_EVAL_METHOD is 16, but that is done through the generic
14599    excess precision logic rather than here.  */
14600
14601 static tree
14602 aarch64_promoted_type (const_tree t)
14603 {
14604   const bool fp16_p = (SCALAR_FLOAT_TYPE_P (t)
14605                        && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node);
14606
14607   return fp16_p ? float_type_node : NULL_TREE;
14608 }
14609
14610 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  rsqrt_optab is reported
14611    as supported only when OPT_TYPE is OPTIMIZE_FOR_SPEED and use_rsqrt_p
14612    accepts MODE1; every other OP is reported as supported.  */
14613
14614 static bool
14615 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14616                            optimization_type opt_type)
14617 {
14618   /* Only rsqrt_optab needs a closer look.  */
14619   if (op != rsqrt_optab)
14620     return true;
14621
14622   const bool for_speed_p = (opt_type == OPTIMIZE_FOR_SPEED);
14623   return for_speed_p && use_rsqrt_p (mode1);
14624 }
14625
14626 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P.  HFmode is always
14627    supported; any other MODE is left to the generic implementation.  */
14628
14629 static bool
14630 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14631 {
14632   if (mode == HFmode)
14633     return true;
14634   return default_libgcc_floating_mode_supported_p (mode);
14635 }
14636
14637 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P.  MODE is supported if it is
14638    HFmode or if the generic implementation accepts it.  */
14639
14640 static bool
14641 aarch64_scalar_mode_supported_p (machine_mode mode)
14642 {
14643   /* HFmode short-circuits the generic check.  */
14644   const bool hf_p = (mode == HFmode);
14645   return hf_p || default_scalar_mode_supported_p (mode);
14646 }
14647
14648 /* Implement TARGET_C_EXCESS_PRECISION: choose the FLT_EVAL_METHOD value
14649    for excess precision TYPE.  ISO/IEC TS 18661-3 defines two values that
14650    we'd like to make use of:
14651
14652     0: evaluate all operations and constants, whose semantic type has at
14653        most the range and precision of type float, to the range and
14654        precision of float; evaluate all other operations and constants to
14655        the range and precision of the semantic type;
14656
14657     N, where _FloatN is a supported interchange floating type
14658        evaluate all operations and constants, whose semantic type has at
14659        most the range and precision of _FloatN type, to the range and
14660        precision of the _FloatN type; evaluate all other operations and
14661        constants to the range and precision of the semantic type;
14662
14663    With the ARMv8.2-A extensions _Float16 is supported in native
14664    precision, so we use 16.  Otherwise the type is still supported but
14665    expressions are evaluated in float precision, so we use 0.  */
14666
14667 static enum flt_eval_method
14668 aarch64_excess_precision (enum excess_precision_type type)
14669 {
14670   /* Implicit excess precision is always reported as _Float16.  */
14671   if (type == EXCESS_PRECISION_TYPE_IMPLICIT)
14672     return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14673
14674   if (type == EXCESS_PRECISION_TYPE_FAST
14675       || type == EXCESS_PRECISION_TYPE_STANDARD)
14676     {
14677       /* We can calculate in either 16-bit or 32-bit range and precision;
14678          pick 16-bit only when the ARMv8.2-A 16-bit floating-point
14679          instructions are available.  */
14680       if (TARGET_FP_F16INST)
14681         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14682       return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
14683     }
14684
14685   gcc_unreachable ();
14686   return FLT_EVAL_METHOD_UNPREDICTABLE;
14687 }
14688
14689 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true unless INSN is a
14690    long-running division or square-root instruction.  */
14691
14692 static bool
14693 aarch64_sched_can_speculate_insn (rtx_insn *insn)
14694 {
14695   switch (get_attr_type (insn))
14696     {
14697     /* Divisions first, then square roots.  */
14698     case TYPE_SDIV:
14699     case TYPE_UDIV:
14700     case TYPE_FDIVS:
14701     case TYPE_FDIVD:
14702     case TYPE_NEON_FP_DIV_S:
14703     case TYPE_NEON_FP_DIV_D:
14704     case TYPE_NEON_FP_DIV_S_Q:
14705     case TYPE_NEON_FP_DIV_D_Q:
14706     case TYPE_FSQRTS:
14707     case TYPE_FSQRTD:
14708     case TYPE_NEON_FP_SQRT_S:
14709     case TYPE_NEON_FP_SQRT_D:
14710     case TYPE_NEON_FP_SQRT_S_Q:
14711     case TYPE_NEON_FP_SQRT_D_Q:
14712       return false;
14713     default:
14714       return true;
14715     }
14716 }
14717
14718 /* Target-specific selftests.  */
14719
14720 #if CHECKING_P
14721
14722 namespace selftest {
14723
14724 /* Selftest for the RTL loader.
14725    Load the dump "aarch64/times-two.rtl" and check the function name,
14726    the insns with uids 1 and 15, and crtl->return_rtx.  This is
14727    essentially a test that class function_reader can handle a real
14728    print_rtx_function dump, but it also verifies that
14729    lookup_reg_by_dump_name correctly handles hard regs.  The hard reg
14730    names in the dump make the test target-specific, hence this file.  */
14731
14732 static void
14733 aarch64_test_loading_full_dump ()
14734 {
14735   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14736
14737   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14738
14739   rtx_insn *insn_1 = get_insn_by_uid (1);
14740   ASSERT_EQ (NOTE, GET_CODE (insn_1));
14741
14742   rtx_insn *insn_15 = get_insn_by_uid (15);
14743   ASSERT_EQ (INSN, GET_CODE (insn_15));
14744   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14745
14746   /* Verify crtl->return_rtx: a REG with number 0 in SImode.  */
14747   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14748   ASSERT_EQ (0, REGNO (crtl->return_rtx));
14749   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14750 }
14751
14752 /* Run all target-specific selftests; currently only the dump loader.  */
14753
14754 static void
14755 aarch64_run_selftests (void)
14756 {
14757   aarch64_test_loading_full_dump ();
14758 }
14759
14760 } // namespace selftest
14761
14762 #endif /* #if CHECKING_P */
14763
14764 #undef TARGET_ADDRESS_COST
14765 #define TARGET_ADDRESS_COST aarch64_address_cost
14766
14767 /* This hook determines whether unnamed bitfields affect the alignment
14768    of the containing structure.  The hook returns true if the structure
14769    should inherit the alignment requirements of an unnamed bitfield's
14770    type.  */
14771 #undef TARGET_ALIGN_ANON_BITFIELD
14772 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14773
14774 #undef TARGET_ASM_ALIGNED_DI_OP
14775 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14776
14777 #undef TARGET_ASM_ALIGNED_HI_OP
14778 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14779
14780 #undef TARGET_ASM_ALIGNED_SI_OP
14781 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14782
14783 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14784 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14785   hook_bool_const_tree_hwi_hwi_const_tree_true
14786
14787 #undef TARGET_ASM_FILE_START
14788 #define TARGET_ASM_FILE_START aarch64_start_file
14789
14790 #undef TARGET_ASM_OUTPUT_MI_THUNK
14791 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14792
14793 #undef TARGET_ASM_SELECT_RTX_SECTION
14794 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14795
14796 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14797 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14798
14799 #undef TARGET_BUILD_BUILTIN_VA_LIST
14800 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14801
14802 #undef TARGET_CALLEE_COPIES
14803 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14804
14805 #undef TARGET_CAN_ELIMINATE
14806 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14807
14808 #undef TARGET_CAN_INLINE_P
14809 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14810
14811 #undef TARGET_CANNOT_FORCE_CONST_MEM
14812 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14813
14814 #undef TARGET_CASE_VALUES_THRESHOLD
14815 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14816
14817 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14818 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14819
14820 /* Only the least significant bit is used for initialization guard
14821    variables.  */
14822 #undef TARGET_CXX_GUARD_MASK_BIT
14823 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14824
14825 #undef TARGET_C_MODE_FOR_SUFFIX
14826 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14827
14828 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14829 #undef  TARGET_DEFAULT_TARGET_FLAGS
14830 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14831 #endif
14832
14833 #undef TARGET_CLASS_MAX_NREGS
14834 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14835
14836 #undef TARGET_BUILTIN_DECL
14837 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14838
14839 #undef TARGET_BUILTIN_RECIPROCAL
14840 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14841
14842 #undef TARGET_C_EXCESS_PRECISION
14843 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14844
14845 #undef  TARGET_EXPAND_BUILTIN
14846 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14847
14848 #undef TARGET_EXPAND_BUILTIN_VA_START
14849 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14850
14851 #undef TARGET_FOLD_BUILTIN
14852 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14853
14854 #undef TARGET_FUNCTION_ARG
14855 #define TARGET_FUNCTION_ARG aarch64_function_arg
14856
14857 #undef TARGET_FUNCTION_ARG_ADVANCE
14858 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14859
14860 #undef TARGET_FUNCTION_ARG_BOUNDARY
14861 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14862
14863 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14864 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14865
14866 #undef TARGET_FUNCTION_VALUE
14867 #define TARGET_FUNCTION_VALUE aarch64_function_value
14868
14869 #undef TARGET_FUNCTION_VALUE_REGNO_P
14870 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14871
14872 #undef TARGET_FRAME_POINTER_REQUIRED
14873 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14874
14875 #undef TARGET_GIMPLE_FOLD_BUILTIN
14876 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14877
14878 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14879 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14880
14881 #undef  TARGET_INIT_BUILTINS
14882 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
14883
14884 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14885 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14886   aarch64_ira_change_pseudo_allocno_class
14887
14888 #undef TARGET_LEGITIMATE_ADDRESS_P
14889 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14890
14891 #undef TARGET_LEGITIMATE_CONSTANT_P
14892 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14893
14894 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14895 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14896   aarch64_legitimize_address_displacement
14897
14898 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14899 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14900
14901 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14902 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14903 aarch64_libgcc_floating_mode_supported_p
14904
14905 #undef TARGET_MANGLE_TYPE
14906 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14907
14908 #undef TARGET_MEMORY_MOVE_COST
14909 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14910
14911 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14912 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14913
14914 #undef TARGET_MUST_PASS_IN_STACK
14915 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14916
14917 /* This target hook should return true if accesses to volatile bitfields
14918    should use the narrowest mode possible.  It should return false if these
14919    accesses should use the bitfield container type.  */
14920 #undef TARGET_NARROW_VOLATILE_BITFIELD
14921 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14922
14923 #undef  TARGET_OPTION_OVERRIDE
14924 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14925
14926 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14927 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14928   aarch64_override_options_after_change
14929
14930 #undef TARGET_OPTION_SAVE
14931 #define TARGET_OPTION_SAVE aarch64_option_save
14932
14933 #undef TARGET_OPTION_RESTORE
14934 #define TARGET_OPTION_RESTORE aarch64_option_restore
14935
14936 #undef TARGET_OPTION_PRINT
14937 #define TARGET_OPTION_PRINT aarch64_option_print
14938
14939 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14940 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14941
14942 #undef TARGET_SET_CURRENT_FUNCTION
14943 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14944
14945 #undef TARGET_PASS_BY_REFERENCE
14946 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14947
14948 #undef TARGET_PREFERRED_RELOAD_CLASS
14949 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14950
14951 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14952 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14953
14954 #undef TARGET_PROMOTED_TYPE
14955 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14956
14957 #undef TARGET_SECONDARY_RELOAD
14958 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14959
14960 #undef TARGET_SHIFT_TRUNCATION_MASK
14961 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14962
14963 #undef TARGET_SETUP_INCOMING_VARARGS
14964 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14965
14966 #undef TARGET_STRUCT_VALUE_RTX
14967 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
14968
14969 #undef TARGET_REGISTER_MOVE_COST
14970 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14971
14972 #undef TARGET_RETURN_IN_MEMORY
14973 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14974
14975 #undef TARGET_RETURN_IN_MSB
14976 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14977
14978 #undef TARGET_RTX_COSTS
14979 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14980
14981 #undef TARGET_SCALAR_MODE_SUPPORTED_P
14982 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
14983
14984 #undef TARGET_SCHED_ISSUE_RATE
14985 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14986
14987 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14988 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14989   aarch64_sched_first_cycle_multipass_dfa_lookahead
14990
14991 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14992 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14993   aarch64_first_cycle_multipass_dfa_lookahead_guard
14994
14995 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
14996 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
14997   aarch64_get_separate_components
14998
14999 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15000 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15001   aarch64_components_for_bb
15002
15003 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15004 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15005   aarch64_disqualify_components
15006
15007 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15008 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15009   aarch64_emit_prologue_components
15010
15011 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15012 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15013   aarch64_emit_epilogue_components
15014
15015 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15016 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15017   aarch64_set_handled_components
15018
15019 #undef TARGET_TRAMPOLINE_INIT
15020 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15021
15022 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15023 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15024
15025 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15026 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15027
15028 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15029 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15030   aarch64_builtin_support_vector_misalignment
15031
15032 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15033 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15034
15035 #undef TARGET_VECTORIZE_ADD_STMT_COST
15036 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15037
15038 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15039 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15040   aarch64_builtin_vectorization_cost
15041
15042 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15043 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15044
15045 #undef TARGET_VECTORIZE_BUILTINS
15046 #define TARGET_VECTORIZE_BUILTINS
15047
15048 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15049 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15050   aarch64_builtin_vectorized_function
15051
15052 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15053 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15054   aarch64_autovectorize_vector_sizes
15055
15056 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15057 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15058   aarch64_atomic_assign_expand_fenv
15059
15060 /* Section anchor support.  */
15061
15062 #undef TARGET_MIN_ANCHOR_OFFSET
15063 #define TARGET_MIN_ANCHOR_OFFSET -256
15064
15065 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15066    byte offset; we can do much more for larger data types, but have no way
15067    to determine the size of the access.  We assume accesses are aligned.  */
15068 #undef TARGET_MAX_ANCHOR_OFFSET
15069 #define TARGET_MAX_ANCHOR_OFFSET 4095
15070
15071 #undef TARGET_VECTOR_ALIGNMENT
15072 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15073
15074 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15075 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15076   aarch64_simd_vector_alignment_reachable
15077
15078 /* vec_perm support.  */
15079
15080 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15081 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15082   aarch64_vectorize_vec_perm_const_ok
15083
15084 #undef TARGET_INIT_LIBFUNCS
15085 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15086
15087 #undef TARGET_FIXED_CONDITION_CODE_REGS
15088 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15089
15090 #undef TARGET_FLAGS_REGNUM
15091 #define TARGET_FLAGS_REGNUM CC_REGNUM
15092
15093 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15094 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15095
15096 #undef TARGET_ASAN_SHADOW_OFFSET
15097 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15098
15099 #undef TARGET_LEGITIMIZE_ADDRESS
15100 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15101
15102 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15103 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15104   aarch64_use_by_pieces_infrastructure_p
15105
15106 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15107 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15108 /* NOTE(review): un-prefixed helper, presumably target-independent; confirm.  */
15109 #undef TARGET_CAN_USE_DOLOOP_P
15110 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15111
15112 #undef TARGET_SCHED_ADJUST_PRIORITY
15113 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15114
15115 #undef TARGET_SCHED_MACRO_FUSION_P
15116 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15117 /* NOTE(review): aarch_ (not aarch64_) prefix; presumably shared with arm.  */
15118 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15119 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15120
15121 #undef TARGET_SCHED_FUSION_PRIORITY
15122 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15123
15124 #undef TARGET_UNSPEC_MAY_TRAP_P
15125 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15126
15127 #undef TARGET_USE_PSEUDO_PIC_REG
15128 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15129
15130 #undef TARGET_PRINT_OPERAND
15131 #define TARGET_PRINT_OPERAND aarch64_print_operand
15132
15133 #undef TARGET_PRINT_OPERAND_ADDRESS
15134 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15135
15136 #undef TARGET_OPTAB_SUPPORTED_P
15137 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15138
15139 #undef TARGET_OMIT_STRUCT_RETURN_REG
15140 #define TARGET_OMIT_STRUCT_RETURN_REG true
15141 /* The value 4 defined below is 1 << 2, i.e. the mask for bit 2.  */
15142 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
15143 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15144 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15145 /* Bind the selftest hook only when CHECKING_P is nonzero.  */
15146 #if CHECKING_P
15147 #undef TARGET_RUN_TARGET_SELFTESTS
15148 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15149 #endif /* #if CHECKING_P */
15150 /* Target hook vector; TARGET_INITIALIZER should use the TARGET_* macros above.  */
15151 struct gcc_target targetm = TARGET_INITIALIZER;
15152 /* NOTE(review): presumably generated GC root tables (gengtype); confirm.  */
15153 #include "gt-aarch64.h"