[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67 #include "selftest.h"
68 #include "selftest-rtl.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
76 /* Classifies an address.
78 ADDRESS_REG_IMM
79 A simple base register plus immediate offset.
81 ADDRESS_REG_WB
82 A base register indexed by immediate offset with writeback.
84 ADDRESS_REG_REG
85 A base register indexed by (optionally scaled) register.
87 ADDRESS_REG_UXTW
88 A base register indexed by (optionally scaled) zero-extended register.
90 ADDRESS_REG_SXTW
91 A base register indexed by (optionally scaled) sign-extended register.
93 ADDRESS_LO_SUM
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
96 ADDRESS_SYMBOLIC
97 A constant symbolic address, in the pc-relative literal pool. */
99 enum aarch64_address_type {
100 ADDRESS_REG_IMM,
101 ADDRESS_REG_WB,
102 ADDRESS_REG_REG,
103 ADDRESS_REG_UXTW,
104 ADDRESS_REG_SXTW,
105 ADDRESS_LO_SUM,
106 ADDRESS_SYMBOLIC
109 struct aarch64_address_info {
110 enum aarch64_address_type type;
111 rtx base;
112 rtx offset;
113 int shift;
114 enum aarch64_symbol_type symbol_type;
117 struct simd_immediate_info
119 rtx value;
120 int shift;
121 int element_width;
122 bool mvn;
123 bool msl;
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel;
129 #ifdef HAVE_AS_TLS
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
132 #endif
134 static bool aarch64_composite_type_p (const_tree, machine_mode);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
136 const_tree,
137 machine_mode *, int *,
138 bool *);
139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
147 const_tree type,
148 int misalignment,
149 bool is_packed);
151 /* Major revision number of the ARM Architecture implemented by the target. */
152 unsigned aarch64_architecture_version;
154 /* The processor for which instructions should be scheduled. */
155 enum aarch64_processor aarch64_tune = cortexa53;
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
160 /* Global flag for PC relative loads. */
161 bool aarch64_pcrelative_literal_loads;
163 /* Support for command line parsing of boolean flags in the tuning
164 structures. */
165 struct aarch64_flag_desc
167 const char* name;
168 unsigned int flag;
171 #define AARCH64_FUSION_PAIR(name, internal_name) \
172 { name, AARCH64_FUSE_##internal_name },
173 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
175 { "none", AARCH64_FUSE_NOTHING },
176 #include "aarch64-fusion-pairs.def"
177 { "all", AARCH64_FUSE_ALL },
178 { NULL, AARCH64_FUSE_NOTHING }
181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
182 { name, AARCH64_EXTRA_TUNE_##internal_name },
183 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
185 { "none", AARCH64_EXTRA_TUNE_NONE },
186 #include "aarch64-tuning-flags.def"
187 { "all", AARCH64_EXTRA_TUNE_ALL },
188 { NULL, AARCH64_EXTRA_TUNE_NONE }
191 /* Tuning parameters. */
193 static const struct cpu_addrcost_table generic_addrcost_table =
196 0, /* hi */
197 0, /* si */
198 0, /* di */
199 0, /* ti */
201 0, /* pre_modify */
202 0, /* post_modify */
203 0, /* register_offset */
204 0, /* register_sextend */
205 0, /* register_zextend */
206 0 /* imm_offset */
209 static const struct cpu_addrcost_table cortexa57_addrcost_table =
212 1, /* hi */
213 0, /* si */
214 0, /* di */
215 1, /* ti */
217 0, /* pre_modify */
218 0, /* post_modify */
219 0, /* register_offset */
220 0, /* register_sextend */
221 0, /* register_zextend */
222 0, /* imm_offset */
225 static const struct cpu_addrcost_table exynosm1_addrcost_table =
228 0, /* hi */
229 0, /* si */
230 0, /* di */
231 2, /* ti */
233 0, /* pre_modify */
234 0, /* post_modify */
235 1, /* register_offset */
236 1, /* register_sextend */
237 2, /* register_zextend */
238 0, /* imm_offset */
241 static const struct cpu_addrcost_table xgene1_addrcost_table =
244 1, /* hi */
245 0, /* si */
246 0, /* di */
247 1, /* ti */
249 1, /* pre_modify */
250 0, /* post_modify */
251 0, /* register_offset */
252 1, /* register_sextend */
253 1, /* register_zextend */
254 0, /* imm_offset */
257 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
276 1, /* hi */
277 1, /* si */
278 1, /* di */
279 2, /* ti */
281 0, /* pre_modify */
282 0, /* post_modify */
283 2, /* register_offset */
284 3, /* register_sextend */
285 3, /* register_zextend */
286 0, /* imm_offset */
289 static const struct cpu_regmove_cost generic_regmove_cost =
291 1, /* GP2GP */
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost. */
294 5, /* GP2FP */
295 5, /* FP2GP */
296 2 /* FP2FP */
299 static const struct cpu_regmove_cost cortexa57_regmove_cost =
301 1, /* GP2GP */
302 /* Avoid the use of slow int<->fp moves for spilling by setting
303 their cost higher than memmov_cost. */
304 5, /* GP2FP */
305 5, /* FP2GP */
306 2 /* FP2FP */
309 static const struct cpu_regmove_cost cortexa53_regmove_cost =
311 1, /* GP2GP */
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
314 5, /* GP2FP */
315 5, /* FP2GP */
316 2 /* FP2FP */
319 static const struct cpu_regmove_cost exynosm1_regmove_cost =
321 1, /* GP2GP */
322 /* Avoid the use of slow int<->fp moves for spilling by setting
323 their cost higher than memmov_cost (actually 4 and 9). */
324 9, /* GP2FP */
325 9, /* FP2GP */
326 1 /* FP2FP */
329 static const struct cpu_regmove_cost thunderx_regmove_cost =
331 2, /* GP2GP */
332 2, /* GP2FP */
333 6, /* FP2GP */
334 4 /* FP2FP */
337 static const struct cpu_regmove_cost xgene1_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 8, /* GP2FP */
343 8, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
349 2, /* GP2GP */
350 /* Avoid the use of int<->fp moves for spilling. */
351 6, /* GP2FP */
352 6, /* FP2GP */
353 4 /* FP2FP */
356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
358 1, /* GP2GP */
359 /* Avoid the use of int<->fp moves for spilling. */
360 8, /* GP2FP */
361 8, /* FP2GP */
362 4 /* FP2FP */
365 /* Generic costs for vector insn classes. */
366 static const struct cpu_vector_cost generic_vector_cost =
368 1, /* scalar_int_stmt_cost */
369 1, /* scalar_fp_stmt_cost */
370 1, /* scalar_load_cost */
371 1, /* scalar_store_cost */
372 1, /* vec_int_stmt_cost */
373 1, /* vec_fp_stmt_cost */
374 2, /* vec_permute_cost */
375 1, /* vec_to_scalar_cost */
376 1, /* scalar_to_vec_cost */
377 1, /* vec_align_load_cost */
378 1, /* vec_unalign_load_cost */
379 1, /* vec_unalign_store_cost */
380 1, /* vec_store_cost */
381 3, /* cond_taken_branch_cost */
382 1 /* cond_not_taken_branch_cost */
385 /* ThunderX costs for vector insn classes. */
386 static const struct cpu_vector_cost thunderx_vector_cost =
388 1, /* scalar_int_stmt_cost */
389 1, /* scalar_fp_stmt_cost */
390 3, /* scalar_load_cost */
391 1, /* scalar_store_cost */
392 4, /* vec_int_stmt_cost */
393 4, /* vec_fp_stmt_cost */
394 4, /* vec_permute_cost */
395 2, /* vec_to_scalar_cost */
396 2, /* scalar_to_vec_cost */
397 3, /* vec_align_load_cost */
398 10, /* vec_unalign_load_cost */
399 10, /* vec_unalign_store_cost */
400 1, /* vec_store_cost */
401 3, /* cond_taken_branch_cost */
402 3 /* cond_not_taken_branch_cost */
405 /* Cortex-A57 costs for vector insn classes. */
406 static const struct cpu_vector_cost cortexa57_vector_cost =
408 1, /* scalar_int_stmt_cost */
409 1, /* scalar_fp_stmt_cost */
410 4, /* scalar_load_cost */
411 1, /* scalar_store_cost */
412 2, /* vec_int_stmt_cost */
413 2, /* vec_fp_stmt_cost */
414 3, /* vec_permute_cost */
415 8, /* vec_to_scalar_cost */
416 8, /* scalar_to_vec_cost */
417 4, /* vec_align_load_cost */
418 4, /* vec_unalign_load_cost */
419 1, /* vec_unalign_store_cost */
420 1, /* vec_store_cost */
421 1, /* cond_taken_branch_cost */
422 1 /* cond_not_taken_branch_cost */
425 static const struct cpu_vector_cost exynosm1_vector_cost =
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 5, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 3, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 3, /* vec_permute_cost */
434 3, /* vec_to_scalar_cost */
435 3, /* scalar_to_vec_cost */
436 5, /* vec_align_load_cost */
437 5, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 1, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
444 /* X-Gene 1 costs for vector insn classes. */
445 static const struct cpu_vector_cost xgene1_vector_cost =
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 5, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 2, /* vec_int_stmt_cost */
452 2, /* vec_fp_stmt_cost */
453 2, /* vec_permute_cost */
454 4, /* vec_to_scalar_cost */
455 4, /* scalar_to_vec_cost */
456 10, /* vec_align_load_cost */
457 10, /* vec_unalign_load_cost */
458 2, /* vec_unalign_store_cost */
459 2, /* vec_store_cost */
460 2, /* cond_taken_branch_cost */
461 1 /* cond_not_taken_branch_cost */
464 /* Costs for vector insn classes for Vulcan. */
465 static const struct cpu_vector_cost thunderx2t99_vector_cost =
467 1, /* scalar_int_stmt_cost */
468 6, /* scalar_fp_stmt_cost */
469 4, /* scalar_load_cost */
470 1, /* scalar_store_cost */
471 5, /* vec_int_stmt_cost */
472 6, /* vec_fp_stmt_cost */
473 3, /* vec_permute_cost */
474 6, /* vec_to_scalar_cost */
475 5, /* scalar_to_vec_cost */
476 8, /* vec_align_load_cost */
477 8, /* vec_unalign_load_cost */
478 4, /* vec_unalign_store_cost */
479 4, /* vec_store_cost */
480 2, /* cond_taken_branch_cost */
481 1 /* cond_not_taken_branch_cost */
484 /* Generic costs for branch instructions. */
485 static const struct cpu_branch_cost generic_branch_cost =
487 1, /* Predictable. */
488 3 /* Unpredictable. */
491 /* Branch costs for Cortex-A57. */
492 static const struct cpu_branch_cost cortexa57_branch_cost =
494 1, /* Predictable. */
495 3 /* Unpredictable. */
498 /* Branch costs for Vulcan. */
499 static const struct cpu_branch_cost thunderx2t99_branch_cost =
501 1, /* Predictable. */
502 3 /* Unpredictable. */
505 /* Generic approximation modes. */
506 static const cpu_approx_modes generic_approx_modes =
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_NONE /* recip_sqrt */
513 /* Approximation modes for Exynos M1. */
514 static const cpu_approx_modes exynosm1_approx_modes =
516 AARCH64_APPROX_NONE, /* division */
517 AARCH64_APPROX_ALL, /* sqrt */
518 AARCH64_APPROX_ALL /* recip_sqrt */
521 /* Approximation modes for X-Gene 1. */
522 static const cpu_approx_modes xgene1_approx_modes =
524 AARCH64_APPROX_NONE, /* division */
525 AARCH64_APPROX_NONE, /* sqrt */
526 AARCH64_APPROX_ALL /* recip_sqrt */
529 static const struct tune_params generic_tunings =
531 &cortexa57_extra_costs,
532 &generic_addrcost_table,
533 &generic_regmove_cost,
534 &generic_vector_cost,
535 &generic_branch_cost,
536 &generic_approx_modes,
537 4, /* memmov_cost */
538 2, /* issue_rate */
539 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
540 8, /* function_align. */
541 8, /* jump_align. */
542 4, /* loop_align. */
543 2, /* int_reassoc_width. */
544 4, /* fp_reassoc_width. */
545 1, /* vec_reassoc_width. */
546 2, /* min_div_recip_mul_sf. */
547 2, /* min_div_recip_mul_df. */
548 0, /* max_case_values. */
549 0, /* cache_line_size. */
550 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
551 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
554 static const struct tune_params cortexa35_tunings =
556 &cortexa53_extra_costs,
557 &generic_addrcost_table,
558 &cortexa53_regmove_cost,
559 &generic_vector_cost,
560 &cortexa57_branch_cost,
561 &generic_approx_modes,
562 4, /* memmov_cost */
563 1, /* issue_rate */
564 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
565 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
566 16, /* function_align. */
567 8, /* jump_align. */
568 8, /* loop_align. */
569 2, /* int_reassoc_width. */
570 4, /* fp_reassoc_width. */
571 1, /* vec_reassoc_width. */
572 2, /* min_div_recip_mul_sf. */
573 2, /* min_div_recip_mul_df. */
574 0, /* max_case_values. */
575 0, /* cache_line_size. */
576 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
577 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
580 static const struct tune_params cortexa53_tunings =
582 &cortexa53_extra_costs,
583 &generic_addrcost_table,
584 &cortexa53_regmove_cost,
585 &generic_vector_cost,
586 &cortexa57_branch_cost,
587 &generic_approx_modes,
588 4, /* memmov_cost */
589 2, /* issue_rate */
590 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
591 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
592 16, /* function_align. */
593 8, /* jump_align. */
594 8, /* loop_align. */
595 2, /* int_reassoc_width. */
596 4, /* fp_reassoc_width. */
597 1, /* vec_reassoc_width. */
598 2, /* min_div_recip_mul_sf. */
599 2, /* min_div_recip_mul_df. */
600 0, /* max_case_values. */
601 0, /* cache_line_size. */
602 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
603 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
606 static const struct tune_params cortexa57_tunings =
608 &cortexa57_extra_costs,
609 &cortexa57_addrcost_table,
610 &cortexa57_regmove_cost,
611 &cortexa57_vector_cost,
612 &cortexa57_branch_cost,
613 &generic_approx_modes,
614 4, /* memmov_cost */
615 3, /* issue_rate */
616 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
617 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
618 16, /* function_align. */
619 8, /* jump_align. */
620 8, /* loop_align. */
621 2, /* int_reassoc_width. */
622 4, /* fp_reassoc_width. */
623 1, /* vec_reassoc_width. */
624 2, /* min_div_recip_mul_sf. */
625 2, /* min_div_recip_mul_df. */
626 0, /* max_case_values. */
627 0, /* cache_line_size. */
628 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
629 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
632 static const struct tune_params cortexa72_tunings =
634 &cortexa57_extra_costs,
635 &cortexa57_addrcost_table,
636 &cortexa57_regmove_cost,
637 &cortexa57_vector_cost,
638 &cortexa57_branch_cost,
639 &generic_approx_modes,
640 4, /* memmov_cost */
641 3, /* issue_rate */
642 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
643 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
644 16, /* function_align. */
645 8, /* jump_align. */
646 8, /* loop_align. */
647 2, /* int_reassoc_width. */
648 4, /* fp_reassoc_width. */
649 1, /* vec_reassoc_width. */
650 2, /* min_div_recip_mul_sf. */
651 2, /* min_div_recip_mul_df. */
652 0, /* max_case_values. */
653 0, /* cache_line_size. */
654 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
655 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
658 static const struct tune_params cortexa73_tunings =
660 &cortexa57_extra_costs,
661 &cortexa57_addrcost_table,
662 &cortexa57_regmove_cost,
663 &cortexa57_vector_cost,
664 &cortexa57_branch_cost,
665 &generic_approx_modes,
666 4, /* memmov_cost. */
667 2, /* issue_rate. */
668 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
669 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
670 16, /* function_align. */
671 8, /* jump_align. */
672 8, /* loop_align. */
673 2, /* int_reassoc_width. */
674 4, /* fp_reassoc_width. */
675 1, /* vec_reassoc_width. */
676 2, /* min_div_recip_mul_sf. */
677 2, /* min_div_recip_mul_df. */
678 0, /* max_case_values. */
679 0, /* cache_line_size. */
680 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
681 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
684 static const struct tune_params exynosm1_tunings =
686 &exynosm1_extra_costs,
687 &exynosm1_addrcost_table,
688 &exynosm1_regmove_cost,
689 &exynosm1_vector_cost,
690 &generic_branch_cost,
691 &exynosm1_approx_modes,
692 4, /* memmov_cost */
693 3, /* issue_rate */
694 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
695 4, /* function_align. */
696 4, /* jump_align. */
697 4, /* loop_align. */
698 2, /* int_reassoc_width. */
699 4, /* fp_reassoc_width. */
700 1, /* vec_reassoc_width. */
701 2, /* min_div_recip_mul_sf. */
702 2, /* min_div_recip_mul_df. */
703 48, /* max_case_values. */
704 64, /* cache_line_size. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
709 static const struct tune_params thunderx_tunings =
711 &thunderx_extra_costs,
712 &generic_addrcost_table,
713 &thunderx_regmove_cost,
714 &thunderx_vector_cost,
715 &generic_branch_cost,
716 &generic_approx_modes,
717 6, /* memmov_cost */
718 2, /* issue_rate */
719 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
720 8, /* function_align. */
721 8, /* jump_align. */
722 8, /* loop_align. */
723 2, /* int_reassoc_width. */
724 4, /* fp_reassoc_width. */
725 1, /* vec_reassoc_width. */
726 2, /* min_div_recip_mul_sf. */
727 2, /* min_div_recip_mul_df. */
728 0, /* max_case_values. */
729 0, /* cache_line_size. */
730 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
731 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
734 static const struct tune_params xgene1_tunings =
736 &xgene1_extra_costs,
737 &xgene1_addrcost_table,
738 &xgene1_regmove_cost,
739 &xgene1_vector_cost,
740 &generic_branch_cost,
741 &xgene1_approx_modes,
742 6, /* memmov_cost */
743 4, /* issue_rate */
744 AARCH64_FUSE_NOTHING, /* fusible_ops */
745 16, /* function_align. */
746 8, /* jump_align. */
747 16, /* loop_align. */
748 2, /* int_reassoc_width. */
749 4, /* fp_reassoc_width. */
750 1, /* vec_reassoc_width. */
751 2, /* min_div_recip_mul_sf. */
752 2, /* min_div_recip_mul_df. */
753 0, /* max_case_values. */
754 0, /* cache_line_size. */
755 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
756 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
759 static const struct tune_params qdf24xx_tunings =
761 &qdf24xx_extra_costs,
762 &qdf24xx_addrcost_table,
763 &qdf24xx_regmove_cost,
764 &generic_vector_cost,
765 &generic_branch_cost,
766 &generic_approx_modes,
767 4, /* memmov_cost */
768 4, /* issue_rate */
769 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
770 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
771 16, /* function_align. */
772 8, /* jump_align. */
773 16, /* loop_align. */
774 2, /* int_reassoc_width. */
775 4, /* fp_reassoc_width. */
776 1, /* vec_reassoc_width. */
777 2, /* min_div_recip_mul_sf. */
778 2, /* min_div_recip_mul_df. */
779 0, /* max_case_values. */
780 64, /* cache_line_size. */
781 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
782 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
785 static const struct tune_params thunderx2t99_tunings =
787 &thunderx2t99_extra_costs,
788 &thunderx2t99_addrcost_table,
789 &thunderx2t99_regmove_cost,
790 &thunderx2t99_vector_cost,
791 &thunderx2t99_branch_cost,
792 &generic_approx_modes,
793 4, /* memmov_cost. */
794 4, /* issue_rate. */
795 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops */
796 16, /* function_align. */
797 8, /* jump_align. */
798 16, /* loop_align. */
799 3, /* int_reassoc_width. */
800 2, /* fp_reassoc_width. */
801 2, /* vec_reassoc_width. */
802 2, /* min_div_recip_mul_sf. */
803 2, /* min_div_recip_mul_df. */
804 0, /* max_case_values. */
805 64, /* cache_line_size. */
806 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
810 /* Support for fine-grained override of the tuning structures. */
811 struct aarch64_tuning_override_function
813 const char* name;
814 void (*parse_override)(const char*, struct tune_params*);
817 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
818 static void aarch64_parse_tune_string (const char*, struct tune_params*);
820 static const struct aarch64_tuning_override_function
821 aarch64_tuning_override_functions[] =
823 { "fuse", aarch64_parse_fuse_string },
824 { "tune", aarch64_parse_tune_string },
825 { NULL, NULL }
828 /* A processor implementing AArch64. */
829 struct processor
831 const char *const name;
832 enum aarch64_processor ident;
833 enum aarch64_processor sched_core;
834 enum aarch64_arch arch;
835 unsigned architecture_version;
836 const unsigned long flags;
837 const struct tune_params *const tune;
840 /* Architectures implementing AArch64. */
841 static const struct processor all_architectures[] =
843 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
844 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
845 #include "aarch64-arches.def"
846 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
849 /* Processor cores implementing AArch64. */
850 static const struct processor all_cores[] =
852 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
853 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
854 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
855 FLAGS, &COSTS##_tunings},
856 #include "aarch64-cores.def"
857 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
858 AARCH64_FL_FOR_ARCH8, &generic_tunings},
859 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
863 /* Target specification. These are populated by the -march, -mtune, -mcpu
864 handling code or by target attributes. */
865 static const struct processor *selected_arch;
866 static const struct processor *selected_cpu;
867 static const struct processor *selected_tune;
869 /* The current tuning set. */
870 struct tune_params aarch64_tune_params = generic_tunings;
872 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
874 /* An ISA extension in the co-processor and main instruction set space. */
875 struct aarch64_option_extension
877 const char *const name;
878 const unsigned long flags_on;
879 const unsigned long flags_off;
882 typedef enum aarch64_cond_code
884 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
885 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
886 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
888 aarch64_cc;
890 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
892 /* The condition codes of the processor, and the inverse function. */
893 static const char * const aarch64_condition_codes[] =
895 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
896 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
899 /* Generate code to enable conditional branches in functions over 1 MiB. */
900 const char *
901 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
902 const char * branch_format)
904 rtx_code_label * tmp_label = gen_label_rtx ();
905 char label_buf[256];
906 char buffer[128];
907 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
908 CODE_LABEL_NUMBER (tmp_label));
909 const char *label_ptr = targetm.strip_name_encoding (label_buf);
910 rtx dest_label = operands[pos_label];
911 operands[pos_label] = tmp_label;
913 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
914 output_asm_insn (buffer, operands);
916 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
917 operands[pos_label] = dest_label;
918 output_asm_insn (buffer, operands);
919 return "";
922 void
923 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
925 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
926 if (TARGET_GENERAL_REGS_ONLY)
927 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
928 else
929 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
932 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
933 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
934 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
935 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
936 cost (in this case the best class is the lowest cost one). Using ALL_REGS
937 irrespective of its cost results in bad allocations with many redundant
938 int<->FP moves which are expensive on various cores.
939 To avoid this we don't allow ALL_REGS as the allocno class, but force a
940 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
941 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
942 Otherwise set the allocno class depending on the mode.
943 The result of this is that it is no longer inefficient to have a higher
944 memory move cost than the register move cost.
947 static reg_class_t
948 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
949 reg_class_t best_class)
951 enum machine_mode mode;
953 if (allocno_class != ALL_REGS)
954 return allocno_class;
956 if (best_class != ALL_REGS)
957 return best_class;
959 mode = PSEUDO_REGNO_MODE (regno);
960 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
963 static unsigned int
964 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
966 if (GET_MODE_UNIT_SIZE (mode) == 4)
967 return aarch64_tune_params.min_div_recip_mul_sf;
968 return aarch64_tune_params.min_div_recip_mul_df;
971 static int
972 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
973 enum machine_mode mode)
975 if (VECTOR_MODE_P (mode))
976 return aarch64_tune_params.vec_reassoc_width;
977 if (INTEGRAL_MODE_P (mode))
978 return aarch64_tune_params.int_reassoc_width;
979 if (FLOAT_MODE_P (mode))
980 return aarch64_tune_params.fp_reassoc_width;
981 return 1;
984 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
985 unsigned
986 aarch64_dbx_register_number (unsigned regno)
988 if (GP_REGNUM_P (regno))
989 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
990 else if (regno == SP_REGNUM)
991 return AARCH64_DWARF_SP;
992 else if (FP_REGNUM_P (regno))
993 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
995 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
996 equivalent DWARF register. */
997 return DWARF_FRAME_REGISTERS;
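/* Editorial note, assuming the usual AAPCS64 DWARF numbering used by
   AARCH64_DWARF_R0 (0), AARCH64_DWARF_SP (31) and AARCH64_DWARF_V0 (64):
   x0..x30 map to 0..30, sp to 31 and v0..v31 to 64..95, while any other
   register (e.g. the condition flags) yields DWARF_FRAME_REGISTERS,
   i.e. "no DWARF equivalent".  */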
1000 /* Return TRUE if MODE is any of the large INT modes. */
1001 static bool
1002 aarch64_vect_struct_mode_p (machine_mode mode)
1004 return mode == OImode || mode == CImode || mode == XImode;
1007 /* Return TRUE if MODE is any of the vector modes. */
1008 static bool
1009 aarch64_vector_mode_p (machine_mode mode)
1011 return aarch64_vector_mode_supported_p (mode)
1012 || aarch64_vect_struct_mode_p (mode);
1015 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1016 static bool
1017 aarch64_array_mode_supported_p (machine_mode mode,
1018 unsigned HOST_WIDE_INT nelems)
1020 if (TARGET_SIMD
1021 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1022 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1023 && (nelems >= 2 && nelems <= 4))
1024 return true;
1026 return false;
1029 /* Implement HARD_REGNO_NREGS. */
1032 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1034 switch (aarch64_regno_regclass (regno))
1036 case FP_REGS:
1037 case FP_LO_REGS:
1038 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1039 default:
1040 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1042 gcc_unreachable ();
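/* Editorial worked example, assuming UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16 as on AArch64: a TImode (16-byte) value in the
   general registers needs (16 + 8 - 1) / 8 == 2 registers, while an
   OImode (32-byte) value in the FP/SIMD registers needs
   (32 + 16 - 1) / 16 == 2 vector registers.  */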
1045 /* Implement HARD_REGNO_MODE_OK. */
1048 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1050 if (GET_MODE_CLASS (mode) == MODE_CC)
1051 return regno == CC_REGNUM;
1053 if (regno == SP_REGNUM)
1054 /* The purpose of comparing with ptr_mode is to support the
1055 global register variable associated with the stack pointer
1056 register via the syntax of asm ("wsp") in ILP32. */
1057 return mode == Pmode || mode == ptr_mode;
1059 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1060 return mode == Pmode;
1062 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1063 return 1;
1065 if (FP_REGNUM_P (regno))
1067 if (aarch64_vect_struct_mode_p (mode))
1068 return
1069 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1070 else
1071 return 1;
1074 return 0;
1077 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1078 machine_mode
1079 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1080 machine_mode mode)
1082 /* Handle modes that fit within single registers. */
1083 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1085 if (GET_MODE_SIZE (mode) >= 4)
1086 return mode;
1087 else
1088 return SImode;
1090 /* Fall back to generic for multi-reg and very large modes. */
1091 else
1092 return choose_hard_reg_mode (regno, nregs, false);
1095 /* Return true if calls to DECL should be treated as
1096 long-calls (i.e. called via a register). */
1097 static bool
1098 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1100 return false;
1103 /* Return true if calls to symbol-ref SYM should be treated as
1104 long-calls (i.e. called via a register). */
1105 bool
1106 aarch64_is_long_call_p (rtx sym)
1108 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1111 /* Return true if calls to symbol-ref SYM should not go through
1112 plt stubs. */
1114 bool
1115 aarch64_is_noplt_call_p (rtx sym)
1117 const_tree decl = SYMBOL_REF_DECL (sym);
1119 if (flag_pic
1120 && decl
1121 && (!flag_plt
1122 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1123 && !targetm.binds_local_p (decl))
1124 return true;
1126 return false;
1129 /* Return true if the offsets to a zero/sign-extract operation
1130 represent an expression that matches an extend operation. The
1131 operands represent the parameters from
1133 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1134 bool
1135 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1136 rtx extract_imm)
1138 HOST_WIDE_INT mult_val, extract_val;
1140 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1141 return false;
1143 mult_val = INTVAL (mult_imm);
1144 extract_val = INTVAL (extract_imm);
1146 if (extract_val > 8
1147 && extract_val < GET_MODE_BITSIZE (mode)
1148 && exact_log2 (extract_val & ~7) > 0
1149 && (extract_val & 7) <= 4
1150 && mult_val == (1 << (extract_val & 7)))
1151 return true;
1153 return false;
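/* Editorial worked example: with MODE == DImode, EXTRACT_IMM == 34 and
   MULT_IMM == 4, the low three bits of 34 give a shift of 2 and the
   remaining bits give an extend width of 32, and MULT_IMM == 1 << 2, so
   this matches a 32-bit extend scaled by 4 (the extended-register form
   "sxtw #2" or "uxtw #2", depending on the extract's sign).  */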
1156 /* Emit an insn that's a simple single-set. Both the operands must be
1157 known to be valid. */
1158 inline static rtx_insn *
1159 emit_set_insn (rtx x, rtx y)
1161 return emit_insn (gen_rtx_SET (x, y));
1164 /* X and Y are two things to compare using CODE. Emit the compare insn and
1165 return the rtx for register 0 in the proper mode. */
1167 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1169 machine_mode mode = SELECT_CC_MODE (code, x, y);
1170 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1172 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1173 return cc_reg;
1176 /* Build the SYMBOL_REF for __tls_get_addr. */
1178 static GTY(()) rtx tls_get_addr_libfunc;
1181 aarch64_tls_get_addr (void)
1183 if (!tls_get_addr_libfunc)
1184 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1185 return tls_get_addr_libfunc;
1188 /* Return the TLS model to use for ADDR. */
1190 static enum tls_model
1191 tls_symbolic_operand_type (rtx addr)
1193 enum tls_model tls_kind = TLS_MODEL_NONE;
1194 rtx sym, addend;
1196 if (GET_CODE (addr) == CONST)
1198 split_const (addr, &sym, &addend);
1199 if (GET_CODE (sym) == SYMBOL_REF)
1200 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1202 else if (GET_CODE (addr) == SYMBOL_REF)
1203 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1205 return tls_kind;
1208 /* We'll allow lo_sum's in our legitimate addresses
1209 so that combine can take care of combining addresses where
1210 necessary, but for generation purposes, we'll generate the address
1211 as:
1212 RTL Absolute
1213 tmp = hi (symbol_ref); adrp x1, foo
1214 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1217 PIC TLS
1218 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1219 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1220 bl __tls_get_addr
1223 Load TLS symbol, depending on TLS mechanism and TLS access model.
1225 Global Dynamic - Traditional TLS:
1226 adrp tmp, :tlsgd:imm
1227 add dest, tmp, #:tlsgd_lo12:imm
1228 bl __tls_get_addr
1230 Global Dynamic - TLS Descriptors:
1231 adrp dest, :tlsdesc:imm
1232 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1233 add dest, dest, #:tlsdesc_lo12:imm
1234 blr tmp
1235 mrs tp, tpidr_el0
1236 add dest, dest, tp
1238 Initial Exec:
1239 mrs tp, tpidr_el0
1240 adrp tmp, :gottprel:imm
1241 ldr dest, [tmp, #:gottprel_lo12:imm]
1242 add dest, dest, tp
1244 Local Exec:
1245 mrs tp, tpidr_el0
1246 add t0, tp, #:tprel_hi12:imm, lsl #12
1247 add t0, t0, #:tprel_lo12_nc:imm
1250 static void
1251 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1252 enum aarch64_symbol_type type)
1254 switch (type)
1256 case SYMBOL_SMALL_ABSOLUTE:
1258 /* In ILP32, the mode of dest can be either SImode or DImode. */
1259 rtx tmp_reg = dest;
1260 machine_mode mode = GET_MODE (dest);
1262 gcc_assert (mode == Pmode || mode == ptr_mode);
1264 if (can_create_pseudo_p ())
1265 tmp_reg = gen_reg_rtx (mode);
1267 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1268 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1269 return;
1272 case SYMBOL_TINY_ABSOLUTE:
1273 emit_insn (gen_rtx_SET (dest, imm));
1274 return;
1276 case SYMBOL_SMALL_GOT_28K:
1278 machine_mode mode = GET_MODE (dest);
1279 rtx gp_rtx = pic_offset_table_rtx;
1280 rtx insn;
1281 rtx mem;
1283 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1284 here before rtl expansion. Tree IVOPTs will generate an rtl pattern to
1285 decide rtx costs, in which case pic_offset_table_rtx is not
1286 initialized. In that case there is no need to generate the first adrp
1287 instruction, as the final cost for global variable access is
1288 one instruction. */
1289 if (gp_rtx != NULL)
1291 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1292 are using the page base as the GOT base, the first page may be wasted;
1293 in the worst case there is only 28K of space for the GOT).
1295 The generated instruction sequence for accessing a global variable
1298 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1300 Only one instruction is needed. But we must initialize
1301 pic_offset_table_rtx properly. We generate an initialization insn for
1302 every global access, and allow CSE to remove all redundant ones.
1304 The final instruction sequence will look like the following
1305 for multiple global variable accesses.
1307 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1309 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1310 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1311 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1312 ... */
1314 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1315 crtl->uses_pic_offset_table = 1;
1316 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1318 if (mode != GET_MODE (gp_rtx))
1319 gp_rtx = gen_lowpart (mode, gp_rtx);
1323 if (mode == ptr_mode)
1325 if (mode == DImode)
1326 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1327 else
1328 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1330 mem = XVECEXP (SET_SRC (insn), 0, 0);
1332 else
1334 gcc_assert (mode == Pmode);
1336 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1337 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1340 /* The operand is expected to be a MEM. Whenever the related insn
1341 pattern changes, the above code which calculates MEM should be
1342 updated. */
1343 gcc_assert (GET_CODE (mem) == MEM);
1344 MEM_READONLY_P (mem) = 1;
1345 MEM_NOTRAP_P (mem) = 1;
1346 emit_insn (insn);
1347 return;
1350 case SYMBOL_SMALL_GOT_4G:
1352 /* In ILP32, the mode of dest can be either SImode or DImode,
1353 while the got entry is always of SImode size. The mode of
1354 dest depends on how dest is used: if dest is assigned to a
1355 pointer (e.g. in memory), it has SImode; it may have
1356 DImode if dest is dereferenced to access the memory.
1357 This is why we have to handle three different ldr_got_small
1358 patterns here (two patterns for ILP32). */
1360 rtx insn;
1361 rtx mem;
1362 rtx tmp_reg = dest;
1363 machine_mode mode = GET_MODE (dest);
1365 if (can_create_pseudo_p ())
1366 tmp_reg = gen_reg_rtx (mode);
1368 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1369 if (mode == ptr_mode)
1371 if (mode == DImode)
1372 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1373 else
1374 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1376 mem = XVECEXP (SET_SRC (insn), 0, 0);
1378 else
1380 gcc_assert (mode == Pmode);
1382 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1383 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1386 gcc_assert (GET_CODE (mem) == MEM);
1387 MEM_READONLY_P (mem) = 1;
1388 MEM_NOTRAP_P (mem) = 1;
1389 emit_insn (insn);
1390 return;
1393 case SYMBOL_SMALL_TLSGD:
1395 rtx_insn *insns;
1396 machine_mode mode = GET_MODE (dest);
1397 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1399 start_sequence ();
1400 if (TARGET_ILP32)
1401 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1402 else
1403 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1404 insns = get_insns ();
1405 end_sequence ();
1407 RTL_CONST_CALL_P (insns) = 1;
1408 emit_libcall_block (insns, dest, result, imm);
1409 return;
1412 case SYMBOL_SMALL_TLSDESC:
1414 machine_mode mode = GET_MODE (dest);
1415 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1416 rtx tp;
1418 gcc_assert (mode == Pmode || mode == ptr_mode);
1420 /* In ILP32, the got entry is always of SImode size. Unlike
1421 small GOT, the dest is fixed at reg 0. */
1422 if (TARGET_ILP32)
1423 emit_insn (gen_tlsdesc_small_si (imm));
1424 else
1425 emit_insn (gen_tlsdesc_small_di (imm));
1426 tp = aarch64_load_tp (NULL);
1428 if (mode != Pmode)
1429 tp = gen_lowpart (mode, tp);
1431 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1432 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1433 return;
1436 case SYMBOL_SMALL_TLSIE:
1438 /* In ILP32, the mode of dest can be either SImode or DImode,
1439 while the got entry is always of SImode size. The mode of
1440 dest depends on how dest is used: if dest is assigned to a
1441 pointer (e.g. in memory), it has SImode; it may have
1442 DImode if dest is dereferenced to access the memory.
1443 This is why we have to handle three different tlsie_small
1444 patterns here (two patterns for ILP32). */
1445 machine_mode mode = GET_MODE (dest);
1446 rtx tmp_reg = gen_reg_rtx (mode);
1447 rtx tp = aarch64_load_tp (NULL);
1449 if (mode == ptr_mode)
1451 if (mode == DImode)
1452 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1453 else
1455 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1456 tp = gen_lowpart (mode, tp);
1459 else
1461 gcc_assert (mode == Pmode);
1462 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1465 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1466 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1467 return;
1470 case SYMBOL_TLSLE12:
1471 case SYMBOL_TLSLE24:
1472 case SYMBOL_TLSLE32:
1473 case SYMBOL_TLSLE48:
1475 machine_mode mode = GET_MODE (dest);
1476 rtx tp = aarch64_load_tp (NULL);
1478 if (mode != Pmode)
1479 tp = gen_lowpart (mode, tp);
1481 switch (type)
1483 case SYMBOL_TLSLE12:
1484 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1485 (dest, tp, imm));
1486 break;
1487 case SYMBOL_TLSLE24:
1488 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1489 (dest, tp, imm));
1490 break;
1491 case SYMBOL_TLSLE32:
1492 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1493 (dest, imm));
1494 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1495 (dest, dest, tp));
1496 break;
1497 case SYMBOL_TLSLE48:
1498 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1499 (dest, imm));
1500 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1501 (dest, dest, tp));
1502 break;
1503 default:
1504 gcc_unreachable ();
1507 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1508 return;
1511 case SYMBOL_TINY_GOT:
1512 emit_insn (gen_ldr_got_tiny (dest, imm));
1513 return;
1515 case SYMBOL_TINY_TLSIE:
1517 machine_mode mode = GET_MODE (dest);
1518 rtx tp = aarch64_load_tp (NULL);
1520 if (mode == ptr_mode)
1522 if (mode == DImode)
1523 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1524 else
1526 tp = gen_lowpart (mode, tp);
1527 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1530 else
1532 gcc_assert (mode == Pmode);
1533 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1536 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1537 return;
1540 default:
1541 gcc_unreachable ();
1545 /* Emit a move from SRC to DEST. Assume that the move expanders can
1546 handle all moves if !can_create_pseudo_p (). The distinction is
1547 important because, unlike emit_move_insn, the move expanders know
1548 how to force Pmode objects into the constant pool even when the
1549 constant pool address is not itself legitimate. */
1550 static rtx
1551 aarch64_emit_move (rtx dest, rtx src)
1553 return (can_create_pseudo_p ()
1554 ? emit_move_insn (dest, src)
1555 : emit_move_insn_1 (dest, src));
1558 /* Split a 128-bit move operation into two 64-bit move operations,
1559 taking care to handle partial overlap of register to register
1560 copies. Special cases are needed when moving between GP regs and
1561 FP regs. SRC can be a register, constant or memory; DST a register
1562 or memory. If either operand is memory it must not have any side
1563 effects. */
1564 void
1565 aarch64_split_128bit_move (rtx dst, rtx src)
1567 rtx dst_lo, dst_hi;
1568 rtx src_lo, src_hi;
1570 machine_mode mode = GET_MODE (dst);
1572 gcc_assert (mode == TImode || mode == TFmode);
1573 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1574 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1576 if (REG_P (dst) && REG_P (src))
1578 int src_regno = REGNO (src);
1579 int dst_regno = REGNO (dst);
1581 /* Handle FP <-> GP regs. */
1582 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1584 src_lo = gen_lowpart (word_mode, src);
1585 src_hi = gen_highpart (word_mode, src);
1587 if (mode == TImode)
1589 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1590 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1592 else
1594 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1595 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1597 return;
1599 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1601 dst_lo = gen_lowpart (word_mode, dst);
1602 dst_hi = gen_highpart (word_mode, dst);
1604 if (mode == TImode)
1606 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1607 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1609 else
1611 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1612 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1614 return;
1618 dst_lo = gen_lowpart (word_mode, dst);
1619 dst_hi = gen_highpart (word_mode, dst);
1620 src_lo = gen_lowpart (word_mode, src);
1621 src_hi = gen_highpart_mode (word_mode, mode, src);
1623 /* At most one pairing may overlap. */
1624 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1626 aarch64_emit_move (dst_hi, src_hi);
1627 aarch64_emit_move (dst_lo, src_lo);
1629 else
1631 aarch64_emit_move (dst_lo, src_lo);
1632 aarch64_emit_move (dst_hi, src_hi);
1636 bool
1637 aarch64_split_128bit_move_p (rtx dst, rtx src)
1639 return (! REG_P (src)
1640 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1643 /* Split a complex SIMD combine. */
1645 void
1646 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1648 machine_mode src_mode = GET_MODE (src1);
1649 machine_mode dst_mode = GET_MODE (dst);
1651 gcc_assert (VECTOR_MODE_P (dst_mode));
1653 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1655 rtx (*gen) (rtx, rtx, rtx);
1657 switch (src_mode)
1659 case V8QImode:
1660 gen = gen_aarch64_simd_combinev8qi;
1661 break;
1662 case V4HImode:
1663 gen = gen_aarch64_simd_combinev4hi;
1664 break;
1665 case V2SImode:
1666 gen = gen_aarch64_simd_combinev2si;
1667 break;
1668 case V4HFmode:
1669 gen = gen_aarch64_simd_combinev4hf;
1670 break;
1671 case V2SFmode:
1672 gen = gen_aarch64_simd_combinev2sf;
1673 break;
1674 case DImode:
1675 gen = gen_aarch64_simd_combinedi;
1676 break;
1677 case DFmode:
1678 gen = gen_aarch64_simd_combinedf;
1679 break;
1680 default:
1681 gcc_unreachable ();
1684 emit_insn (gen (dst, src1, src2));
1685 return;
1689 /* Split a complex SIMD move. */
1691 void
1692 aarch64_split_simd_move (rtx dst, rtx src)
1694 machine_mode src_mode = GET_MODE (src);
1695 machine_mode dst_mode = GET_MODE (dst);
1697 gcc_assert (VECTOR_MODE_P (dst_mode));
1699 if (REG_P (dst) && REG_P (src))
1701 rtx (*gen) (rtx, rtx);
1703 gcc_assert (VECTOR_MODE_P (src_mode));
1705 switch (src_mode)
1707 case V16QImode:
1708 gen = gen_aarch64_split_simd_movv16qi;
1709 break;
1710 case V8HImode:
1711 gen = gen_aarch64_split_simd_movv8hi;
1712 break;
1713 case V4SImode:
1714 gen = gen_aarch64_split_simd_movv4si;
1715 break;
1716 case V2DImode:
1717 gen = gen_aarch64_split_simd_movv2di;
1718 break;
1719 case V8HFmode:
1720 gen = gen_aarch64_split_simd_movv8hf;
1721 break;
1722 case V4SFmode:
1723 gen = gen_aarch64_split_simd_movv4sf;
1724 break;
1725 case V2DFmode:
1726 gen = gen_aarch64_split_simd_movv2df;
1727 break;
1728 default:
1729 gcc_unreachable ();
1732 emit_insn (gen (dst, src));
1733 return;
1737 bool
1738 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1739 machine_mode ymode, rtx y)
1741 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1742 gcc_assert (r != NULL);
1743 return rtx_equal_p (x, r);
1747 static rtx
1748 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1750 if (can_create_pseudo_p ())
1751 return force_reg (mode, value);
1752 else
1754 x = aarch64_emit_move (x, value);
1755 return x;
1760 static rtx
1761 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1763 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1765 rtx high;
1766 /* Load the full offset into a register. This
1767 might be improvable in the future. */
1768 high = GEN_INT (offset);
1769 offset = 0;
1770 high = aarch64_force_temporary (mode, temp, high);
1771 reg = aarch64_force_temporary (mode, temp,
1772 gen_rtx_PLUS (mode, high, reg));
1774 return plus_constant (mode, reg, offset);
1777 static int
1778 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1779 machine_mode mode)
1781 int i;
1782 unsigned HOST_WIDE_INT val, val2, mask;
1783 int one_match, zero_match;
1784 int num_insns;
1786 val = INTVAL (imm);
1788 if (aarch64_move_imm (val, mode))
1790 if (generate)
1791 emit_insn (gen_rtx_SET (dest, imm));
1792 return 1;
1795 if ((val >> 32) == 0 || mode == SImode)
1797 if (generate)
1799 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1800 if (mode == SImode)
1801 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1802 GEN_INT ((val >> 16) & 0xffff)));
1803 else
1804 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1805 GEN_INT ((val >> 16) & 0xffff)));
1807 return 2;
1810 /* Remaining cases are all for DImode. */
1812 mask = 0xffff;
1813 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1814 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1815 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1816 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1818 if (zero_match != 2 && one_match != 2)
1820 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1821 For a 64-bit bitmask try whether changing 16 bits to all ones or
1822 zeroes creates a valid bitmask. To check any repeated bitmask,
1823 try using 16 bits from the other 32-bit half of val. */
1825 for (i = 0; i < 64; i += 16, mask <<= 16)
1827 val2 = val & ~mask;
1828 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1829 break;
1830 val2 = val | mask;
1831 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1832 break;
1833 val2 = val2 & ~mask;
1834 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1835 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1836 break;
1838 if (i != 64)
1840 if (generate)
1842 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1843 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1844 GEN_INT ((val >> i) & 0xffff)));
1846 return 2;
1850 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1851 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1852 otherwise skip zero bits. */
1854 num_insns = 1;
1855 mask = 0xffff;
1856 val2 = one_match > zero_match ? ~val : val;
1857 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1859 if (generate)
1860 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1861 ? (val | ~(mask << i))
1862 : (val & (mask << i)))));
1863 for (i += 16; i < 64; i += 16)
1865 if ((val2 & (mask << i)) == 0)
1866 continue;
1867 if (generate)
1868 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1869 GEN_INT ((val >> i) & 0xffff)));
1870 num_insns ++;
1873 return num_insns;
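/* Editorial worked example (destination register hypothetical):
   VAL == 0x1234000000005678 has two all-zero 16-bit halves, so
   zero_match == 2 and the code falls through to the final loop above,
   emitting roughly

	mov	x0, #0x5678
	movk	x0, #0x1234, lsl #48

   for a total of num_insns == 2.  */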
1877 void
1878 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1880 machine_mode mode = GET_MODE (dest);
1882 gcc_assert (mode == SImode || mode == DImode);
1884 /* Check on what type of symbol it is. */
1885 if (GET_CODE (imm) == SYMBOL_REF
1886 || GET_CODE (imm) == LABEL_REF
1887 || GET_CODE (imm) == CONST)
1889 rtx mem, base, offset;
1890 enum aarch64_symbol_type sty;
1892 /* If we have (const (plus symbol offset)), separate out the offset
1893 before we start classifying the symbol. */
1894 split_const (imm, &base, &offset);
1896 sty = aarch64_classify_symbol (base, offset);
1897 switch (sty)
1899 case SYMBOL_FORCE_TO_MEM:
1900 if (offset != const0_rtx
1901 && targetm.cannot_force_const_mem (mode, imm))
1903 gcc_assert (can_create_pseudo_p ());
1904 base = aarch64_force_temporary (mode, dest, base);
1905 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1906 aarch64_emit_move (dest, base);
1907 return;
1910 mem = force_const_mem (ptr_mode, imm);
1911 gcc_assert (mem);
1913 /* If we aren't generating PC relative literals, then
1914 we need to expand the literal pool access carefully.
1915 This is something that needs to be done in a number
1916 of places, so it could well live as a separate function. */
1917 if (!aarch64_pcrelative_literal_loads)
1919 gcc_assert (can_create_pseudo_p ());
1920 base = gen_reg_rtx (ptr_mode);
1921 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1922 mem = gen_rtx_MEM (ptr_mode, base);
1925 if (mode != ptr_mode)
1926 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1928 emit_insn (gen_rtx_SET (dest, mem));
1930 return;
1932 case SYMBOL_SMALL_TLSGD:
1933 case SYMBOL_SMALL_TLSDESC:
1934 case SYMBOL_SMALL_TLSIE:
1935 case SYMBOL_SMALL_GOT_28K:
1936 case SYMBOL_SMALL_GOT_4G:
1937 case SYMBOL_TINY_GOT:
1938 case SYMBOL_TINY_TLSIE:
1939 if (offset != const0_rtx)
1941 gcc_assert(can_create_pseudo_p ());
1942 base = aarch64_force_temporary (mode, dest, base);
1943 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1944 aarch64_emit_move (dest, base);
1945 return;
1947 /* FALLTHRU */
1949 case SYMBOL_SMALL_ABSOLUTE:
1950 case SYMBOL_TINY_ABSOLUTE:
1951 case SYMBOL_TLSLE12:
1952 case SYMBOL_TLSLE24:
1953 case SYMBOL_TLSLE32:
1954 case SYMBOL_TLSLE48:
1955 aarch64_load_symref_appropriately (dest, imm, sty);
1956 return;
1958 default:
1959 gcc_unreachable ();
1963 if (!CONST_INT_P (imm))
1965 if (GET_CODE (imm) == HIGH)
1966 emit_insn (gen_rtx_SET (dest, imm));
1967 else
1969 rtx mem = force_const_mem (mode, imm);
1970 gcc_assert (mem);
1971 emit_insn (gen_rtx_SET (dest, mem));
1974 return;
1977 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1980 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1981 temporary value if necessary. FRAME_RELATED_P should be true if
1982 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1983 to the generated instructions. If SCRATCHREG is known to hold
1984 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1985 immediate again.
1987 Since this function may be used to adjust the stack pointer, we must
1988 ensure that it cannot cause transient stack deallocation (for example
1989 by first incrementing SP and then decrementing when adjusting by a
1990 large immediate). */
1992 static void
1993 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1994 HOST_WIDE_INT delta, bool frame_related_p,
1995 bool emit_move_imm)
1997 HOST_WIDE_INT mdelta = abs_hwi (delta);
1998 rtx this_rtx = gen_rtx_REG (mode, regnum);
1999 rtx_insn *insn;
2001 if (!mdelta)
2002 return;
2004 /* Single instruction adjustment. */
2005 if (aarch64_uimm12_shift (mdelta))
2007 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2008 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2009 return;
2012 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2013 Only do this if mdelta is not a 16-bit move immediate, as adjusting
2014 using a move is better. */
2015 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2017 HOST_WIDE_INT low_off = mdelta & 0xfff;
2019 low_off = delta < 0 ? -low_off : low_off;
2020 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2021 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2022 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2023 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2024 return;
2027 /* Emit a move immediate if required and an addition/subtraction. */
2028 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2029 if (emit_move_imm)
2030 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2031 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2032 : gen_add2_insn (this_rtx, scratch_rtx));
2033 if (frame_related_p)
2035 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2036 rtx adj = plus_constant (mode, this_rtx, delta);
2037 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
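/* Editorial worked example: a stack adjustment of -0x12345 is neither a
   (possibly shifted) 12-bit immediate nor a 16-bit move immediate, but it
   is below 24 bits, so the code above splits it into two subtractions,
   roughly

	sub	sp, sp, #0x345
	sub	sp, sp, #0x12000	// 0x12 shifted left by 12, still encodable

   Larger adjustments instead load abs (delta) into SCRATCHREG and emit a
   single register add/sub.  */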
2041 static inline void
2042 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2043 HOST_WIDE_INT delta)
2045 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2048 static inline void
2049 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2051 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2052 true, emit_move_imm);
2055 static inline void
2056 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2058 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2059 frame_related_p, true);
2062 static bool
2063 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2064 tree exp ATTRIBUTE_UNUSED)
2066 /* Currently, always true. */
2067 return true;
2070 /* Implement TARGET_PASS_BY_REFERENCE. */
2072 static bool
2073 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2074 machine_mode mode,
2075 const_tree type,
2076 bool named ATTRIBUTE_UNUSED)
2078 HOST_WIDE_INT size;
2079 machine_mode dummymode;
2080 int nregs;
2082 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2083 size = (mode == BLKmode && type)
2084 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2086 /* Aggregates are passed by reference based on their size. */
2087 if (type && AGGREGATE_TYPE_P (type))
2089 size = int_size_in_bytes (type);
2092 /* Variable sized arguments are always passed by reference. */
2093 if (size < 0)
2094 return true;
2096 /* Can this be a candidate to be passed in fp/simd register(s)? */
2097 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2098 &dummymode, &nregs,
2099 NULL))
2100 return false;
2102 /* Arguments which are variable sized or larger than 2 registers are
2103 passed by reference unless they are a homogeneous floating-point
2104 aggregate. */
2105 return size > 2 * UNITS_PER_WORD;
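/* A worked example (illustrative only, assuming LP64): "struct { long a, b, c; }"
   is 24 bytes and not an HFA, so it is passed by reference, whereas
   "struct { double d[4]; }" is a 32-byte HFA of four doubles and is still
   passed by value in SIMD/FP registers.  */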
2108 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2109 static bool
2110 aarch64_return_in_msb (const_tree valtype)
2112 machine_mode dummy_mode;
2113 int dummy_int;
2115 /* Never happens in little-endian mode. */
2116 if (!BYTES_BIG_ENDIAN)
2117 return false;
2119 /* Only composite types smaller than or equal to 16 bytes can
2120 be potentially returned in registers. */
2121 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2122 || int_size_in_bytes (valtype) <= 0
2123 || int_size_in_bytes (valtype) > 16)
2124 return false;
2126 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2127 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2128 is always passed/returned in the least significant bits of fp/simd
2129 register(s). */
2130 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2131 &dummy_mode, &dummy_int, NULL))
2132 return false;
2134 return true;
2137 /* Implement TARGET_FUNCTION_VALUE.
2138 Define how to find the value returned by a function. */
2140 static rtx
2141 aarch64_function_value (const_tree type, const_tree func,
2142 bool outgoing ATTRIBUTE_UNUSED)
2144 machine_mode mode;
2145 int unsignedp;
2146 int count;
2147 machine_mode ag_mode;
2149 mode = TYPE_MODE (type);
2150 if (INTEGRAL_TYPE_P (type))
2151 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2153 if (aarch64_return_in_msb (type))
2155 HOST_WIDE_INT size = int_size_in_bytes (type);
2157 if (size % UNITS_PER_WORD != 0)
2159 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2160 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2164 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2165 &ag_mode, &count, NULL))
2167 if (!aarch64_composite_type_p (type, mode))
2169 gcc_assert (count == 1 && mode == ag_mode);
2170 return gen_rtx_REG (mode, V0_REGNUM);
2172 else
2174 int i;
2175 rtx par;
2177 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2178 for (i = 0; i < count; i++)
2180 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2181 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2182 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2183 XVECEXP (par, 0, i) = tmp;
2185 return par;
2188 else
2189 return gen_rtx_REG (mode, R0_REGNUM);
2192 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2193 Return true if REGNO is the number of a hard register in which the values
2194 of called function may come back. */
2196 static bool
2197 aarch64_function_value_regno_p (const unsigned int regno)
2199 /* A maximum of 16 bytes can be returned in the general registers. Examples
2200 of 16-byte return values are: 128-bit integers and 16-byte small
2201 structures (excluding homogeneous floating-point aggregates). */
2202 if (regno == R0_REGNUM || regno == R1_REGNUM)
2203 return true;
2205 /* Up to four fp/simd registers can return a function value, e.g. a
2206 homogeneous floating-point aggregate having four members. */
2207 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2208 return TARGET_FLOAT;
2210 return false;
2213 /* Implement TARGET_RETURN_IN_MEMORY.
2215 If the type T of the result of a function is such that
2216 void func (T arg)
2217 would require that arg be passed as a value in a register (or set of
2218 registers) according to the parameter passing rules, then the result
2219 is returned in the same registers as would be used for such an
2220 argument. */
2222 static bool
2223 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2225 HOST_WIDE_INT size;
2226 machine_mode ag_mode;
2227 int count;
2229 if (!AGGREGATE_TYPE_P (type)
2230 && TREE_CODE (type) != COMPLEX_TYPE
2231 && TREE_CODE (type) != VECTOR_TYPE)
2232 /* Simple scalar types are always returned in registers. */
2233 return false;
2235 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2236 type,
2237 &ag_mode,
2238 &count,
2239 NULL))
2240 return false;
2242 /* Types larger than 2 registers are returned in memory. */
2243 size = int_size_in_bytes (type);
2244 return (size < 0 || size > 2 * UNITS_PER_WORD);
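/* For example (illustrative): "struct { long a, b, c; }" is 24 bytes and not
   an HFA, so it is returned in memory via the x8 result pointer, while
   "struct { double x, y, z; }" is an HFA of three doubles and is returned in
   d0-d2 despite also being 24 bytes.  */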
2247 static bool
2248 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2249 const_tree type, int *nregs)
2251 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2252 return aarch64_vfp_is_call_or_return_candidate (mode,
2253 type,
2254 &pcum->aapcs_vfp_rmode,
2255 nregs,
2256 NULL);
2259 struct aarch64_fn_arg_alignment
2261 /* Alignment for FIELD_DECLs in function arguments. */
2262 unsigned int alignment;
2263 /* Alignment for decls other than FIELD_DECLs in function arguments. */
2264 unsigned int warn_alignment;
2267 /* Given MODE and TYPE of a function argument, return a pair of alignments in
2268 bits. The idea is to suppress any stronger alignment requested by
2269 the user and opt for the natural alignment (specified in AAPCS64 section 4.1).
2270 This is a helper function for local use only. */
2272 static struct aarch64_fn_arg_alignment
2273 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2275 struct aarch64_fn_arg_alignment aa;
2276 aa.alignment = 0;
2277 aa.warn_alignment = 0;
2279 if (!type)
2281 aa.alignment = GET_MODE_ALIGNMENT (mode);
2282 return aa;
2285 if (integer_zerop (TYPE_SIZE (type)))
2286 return aa;
2288 gcc_assert (TYPE_MODE (type) == mode);
2290 if (!AGGREGATE_TYPE_P (type))
2292 aa.alignment = TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2293 return aa;
2296 if (TREE_CODE (type) == ARRAY_TYPE)
2298 aa.alignment = TYPE_ALIGN (TREE_TYPE (type));
2299 return aa;
2302 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2304 if (TREE_CODE (field) == FIELD_DECL)
2305 aa.alignment = std::max (aa.alignment, DECL_ALIGN (field));
2306 else
2307 aa.warn_alignment = std::max (aa.warn_alignment, DECL_ALIGN (field));
2310 return aa;
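/* Illustrative examples: for "struct { long long x; }" aa.alignment is
   64 bits and aa.warn_alignment stays 0.  Declaring the field as
   "__attribute__ ((aligned (16))) long long x" raises aa.alignment to
   128 bits, which is what later triggers the even-NGRN and 16-byte stack
   alignment handling.  Alignment that comes only from non-FIELD_DECL entries
   in TYPE_FIELDS ends up in aa.warn_alignment and is only used to drive the
   -Wpsabi note about the GCC 7.1 ABI change.  */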
2313 /* Layout a function argument according to the AAPCS64 rules. The rule
2314 numbers refer to the rule numbers in the AAPCS64. */
2316 static void
2317 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2318 const_tree type,
2319 bool named ATTRIBUTE_UNUSED)
2321 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2322 int ncrn, nvrn, nregs;
2323 bool allocate_ncrn, allocate_nvrn;
2324 HOST_WIDE_INT size;
2326 /* We need to do this once per argument. */
2327 if (pcum->aapcs_arg_processed)
2328 return;
2330 pcum->aapcs_arg_processed = true;
2332 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2333 size
2334 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2335 UNITS_PER_WORD);
2337 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2338 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2339 mode,
2340 type,
2341 &nregs);
2343 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2344 The following code thus handles passing by SIMD/FP registers first. */
2346 nvrn = pcum->aapcs_nvrn;
2348 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
2349 and homogeneous short-vector aggregates (HVA). */
2350 if (allocate_nvrn)
2352 if (!TARGET_FLOAT)
2353 aarch64_err_no_fpadvsimd (mode, "argument");
2355 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2357 pcum->aapcs_nextnvrn = nvrn + nregs;
2358 if (!aarch64_composite_type_p (type, mode))
2360 gcc_assert (nregs == 1);
2361 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2363 else
2365 rtx par;
2366 int i;
2367 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2368 for (i = 0; i < nregs; i++)
2370 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2371 V0_REGNUM + nvrn + i);
2372 tmp = gen_rtx_EXPR_LIST
2373 (VOIDmode, tmp,
2374 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2375 XVECEXP (par, 0, i) = tmp;
2377 pcum->aapcs_reg = par;
2379 return;
2381 else
2383 /* C.3 NSRN is set to 8. */
2384 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2385 goto on_stack;
2389 ncrn = pcum->aapcs_ncrn;
2390 nregs = size / UNITS_PER_WORD;
2392 /* C.6 - C.9, though the sign and zero extension semantics are
2393 handled elsewhere. This is the case where the argument fits
2394 entirely in general registers. */
2395 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2398 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2400 /* C.8: if the argument has an alignment of 16 bytes, then the NGRN is
2401 rounded up to the next even number. */
2402 if (nregs == 2 && ncrn % 2)
2404 struct aarch64_fn_arg_alignment aa
2405 = aarch64_function_arg_alignment (mode, type);
2407 /* The == 16 * BITS_PER_UNIT comparisons (rather than >= 16 * BITS_PER_UNIT)
2408 are used because for alignments greater than 16 * BITS_PER_UNIT
2409 nregs should be > 2, and so the argument should already have been
2410 passed by reference rather than by value. */
2411 if (aa.warn_alignment == 16 * BITS_PER_UNIT
2412 && aa.alignment < aa.warn_alignment
2413 && warn_psabi
2414 && currently_expanding_gimple_stmt)
2415 inform (input_location,
2416 "parameter passing for argument of type %qT "
2417 "changed in GCC 7.1", type);
2418 else if (aa.alignment == 16 * BITS_PER_UNIT)
2420 ++ncrn;
2421 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2425 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2426 A reg is still generated for it, but the caller should be smart
2427 enough not to use it. */
2428 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2429 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2430 else
2432 rtx par;
2433 int i;
2435 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2436 for (i = 0; i < nregs; i++)
2438 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2439 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2440 GEN_INT (i * UNITS_PER_WORD));
2441 XVECEXP (par, 0, i) = tmp;
2443 pcum->aapcs_reg = par;
2446 pcum->aapcs_nextncrn = ncrn + nregs;
2447 return;
2450 /* C.11 */
2451 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2453 /* The argument is passed on stack; record the needed number of words for
2454 this argument and align the total size if necessary. */
2455 on_stack:
2456 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2457 struct aarch64_fn_arg_alignment aa
2458 = aarch64_function_arg_alignment (mode, type);
2460 if (aa.alignment == 16 * BITS_PER_UNIT)
2461 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2462 16 / UNITS_PER_WORD);
2463 return;
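/* Worked example (an illustrative sketch): for
     void f (int a, double b, struct { long x, y; } c);
   a is allocated to w0, b to d0 and c, a 16-byte non-HFA aggregate with
   8-byte alignment, occupies the pair x1/x2.  Once the general registers
   are exhausted, further such arguments take the on_stack path above, with
   their size rounded up to a multiple of 8 bytes.  */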
2466 /* Implement TARGET_FUNCTION_ARG. */
2468 static rtx
2469 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2470 const_tree type, bool named)
2472 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2473 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2475 if (mode == VOIDmode)
2476 return NULL_RTX;
2478 aarch64_layout_arg (pcum_v, mode, type, named);
2479 return pcum->aapcs_reg;
2482 void
2483 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2484 const_tree fntype ATTRIBUTE_UNUSED,
2485 rtx libname ATTRIBUTE_UNUSED,
2486 const_tree fndecl ATTRIBUTE_UNUSED,
2487 unsigned n_named ATTRIBUTE_UNUSED)
2489 pcum->aapcs_ncrn = 0;
2490 pcum->aapcs_nvrn = 0;
2491 pcum->aapcs_nextncrn = 0;
2492 pcum->aapcs_nextnvrn = 0;
2493 pcum->pcs_variant = ARM_PCS_AAPCS64;
2494 pcum->aapcs_reg = NULL_RTX;
2495 pcum->aapcs_arg_processed = false;
2496 pcum->aapcs_stack_words = 0;
2497 pcum->aapcs_stack_size = 0;
2499 if (!TARGET_FLOAT
2500 && fndecl && TREE_PUBLIC (fndecl)
2501 && fntype && fntype != error_mark_node)
2503 const_tree type = TREE_TYPE (fntype);
2504 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2505 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2506 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2507 &mode, &nregs, NULL))
2508 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2510 return;
2513 static void
2514 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2515 machine_mode mode,
2516 const_tree type,
2517 bool named)
2519 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2520 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2522 aarch64_layout_arg (pcum_v, mode, type, named);
2523 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2524 != (pcum->aapcs_stack_words != 0));
2525 pcum->aapcs_arg_processed = false;
2526 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2527 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2528 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2529 pcum->aapcs_stack_words = 0;
2530 pcum->aapcs_reg = NULL_RTX;
2534 bool
2535 aarch64_function_arg_regno_p (unsigned regno)
2537 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2538 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2541 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2542 PARM_BOUNDARY bits of alignment, but will be given anything up
2543 to STACK_BOUNDARY bits if the type requires it. This makes sure
2544 that both before and after the layout of each argument, the Next
2545 Stacked Argument Address (NSAA) will have a minimum alignment of
2546 8 bytes. */
2548 static unsigned int
2549 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2551 struct aarch64_fn_arg_alignment aa
2552 = aarch64_function_arg_alignment (mode, type);
2553 aa.alignment = MIN (MAX (aa.alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2554 aa.warn_alignment
2555 = MIN (MAX (aa.warn_alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2557 if (warn_psabi && aa.warn_alignment > aa.alignment)
2558 inform (input_location, "parameter passing for argument of type %qT "
2559 "changed in GCC 7.1", type);
2561 return aa.alignment;
2564 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2566 Return true if an argument passed on the stack should be padded upwards,
2567 i.e. if the least-significant byte of the stack slot has useful data.
2569 Small aggregate types are placed in the lowest memory address.
2571 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2573 bool
2574 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2576 /* On little-endian targets, the least significant byte of every stack
2577 argument is passed at the lowest byte address of the stack slot. */
2578 if (!BYTES_BIG_ENDIAN)
2579 return true;
2581 /* Otherwise, integral, floating-point and pointer types are padded downward:
2582 the least significant byte of a stack argument is passed at the highest
2583 byte address of the stack slot. */
2584 if (type
2585 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2586 || POINTER_TYPE_P (type))
2587 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2588 return false;
2590 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2591 return true;
2594 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2596 It specifies the padding for the last (and possibly the only)
2597 element of a block move between registers and memory. Assuming
2598 the block is in memory, padding upward means that the last
2599 element is padded after its most significant byte, while with
2600 downward padding the last element is padded on its least
2601 significant byte side.
2603 Small aggregates and small complex types are always padded
2604 upwards.
2606 We don't need to worry about homogeneous floating-point or
2607 short-vector aggregates; their move is not affected by the
2608 padding direction determined here. Regardless of endianness,
2609 each element of such an aggregate is put in the least
2610 significant bits of a fp/simd register.
2612 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2613 register has useful data, and return the opposite if the most
2614 significant byte does. */
2616 bool
2617 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2618 bool first ATTRIBUTE_UNUSED)
2621 /* Small composite types are always padded upward. */
2622 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2624 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2625 : GET_MODE_SIZE (mode));
2626 if (size < 2 * UNITS_PER_WORD)
2627 return true;
2630 /* Otherwise, use the default padding. */
2631 return !BYTES_BIG_ENDIAN;
2634 static machine_mode
2635 aarch64_libgcc_cmp_return_mode (void)
2637 return SImode;
2640 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2642 /* We use the 12-bit shifted immediate arithmetic instructions so values
2643 must be a multiple of (1 << 12), i.e. 4096. */
2644 #define ARITH_FACTOR 4096
2646 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2647 #error Cannot use simple address calculation for stack probing
2648 #endif
2650 /* The pair of scratch registers used for stack probing. */
2651 #define PROBE_STACK_FIRST_REG 9
2652 #define PROBE_STACK_SECOND_REG 10
2654 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2655 inclusive. These are offsets from the current stack pointer. */
2657 static void
2658 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2660 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2662 /* See the same assertion on PROBE_INTERVAL above. */
2663 gcc_assert ((first % ARITH_FACTOR) == 0);
2665 /* See if we have a constant small number of probes to generate. If so,
2666 that's the easy case. */
2667 if (size <= PROBE_INTERVAL)
2669 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2671 emit_set_insn (reg1,
2672 plus_constant (Pmode,
2673 stack_pointer_rtx, -(first + base)));
2674 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2677 /* The run-time loop is made up of 8 insns in the generic case while the
2678 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2679 else if (size <= 4 * PROBE_INTERVAL)
2681 HOST_WIDE_INT i, rem;
2683 emit_set_insn (reg1,
2684 plus_constant (Pmode,
2685 stack_pointer_rtx,
2686 -(first + PROBE_INTERVAL)));
2687 emit_stack_probe (reg1);
2689 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2690 it exceeds SIZE. If only two probes are needed, this will not
2691 generate any code. Then probe at FIRST + SIZE. */
2692 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2694 emit_set_insn (reg1,
2695 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2696 emit_stack_probe (reg1);
2699 rem = size - (i - PROBE_INTERVAL);
2700 if (rem > 256)
2702 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2704 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2705 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2707 else
2708 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2711 /* Otherwise, do the same as above, but in a loop. Note that we must be
2712 extra careful with variables wrapping around because we might be at
2713 the very top (or the very bottom) of the address space and we have
2714 to be able to handle this case properly; in particular, we use an
2715 equality test for the loop condition. */
2716 else
2718 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2720 /* Step 1: round SIZE to the previous multiple of the interval. */
2722 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2725 /* Step 2: compute initial and final value of the loop counter. */
2727 /* TEST_ADDR = SP + FIRST. */
2728 emit_set_insn (reg1,
2729 plus_constant (Pmode, stack_pointer_rtx, -first));
2731 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2732 emit_set_insn (reg2,
2733 plus_constant (Pmode, stack_pointer_rtx,
2734 -(first + rounded_size)));
2737 /* Step 3: the loop
2741 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2742 probe at TEST_ADDR
2744 while (TEST_ADDR != LAST_ADDR)
2746 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2747 until it is equal to ROUNDED_SIZE. */
2749 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2752 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2753 that SIZE is equal to ROUNDED_SIZE. */
2755 if (size != rounded_size)
2757 HOST_WIDE_INT rem = size - rounded_size;
2759 if (rem > 256)
2761 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2763 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2764 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2766 else
2767 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2771 /* Make sure nothing is scheduled before we are done. */
2772 emit_insn (gen_blockage ());
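/* A rough illustration (assuming the default 4 KiB probe interval and x9 as
   the scratch register): probing FIRST .. FIRST + 8192 emits approximately
       sub  x9, sp, #(FIRST + 4096)
       str  xzr, [x9]
       sub  x9, x9, #4096
       str  xzr, [x9]
   while sizes above four intervals fall back to the probe_stack_range loop
   output below.  */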
2775 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2776 absolute addresses. */
2778 const char *
2779 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2781 static int labelno = 0;
2782 char loop_lab[32];
2783 rtx xops[2];
2785 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2787 /* Loop. */
2788 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2790 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2791 xops[0] = reg1;
2792 xops[1] = GEN_INT (PROBE_INTERVAL);
2793 output_asm_insn ("sub\t%0, %0, %1", xops);
2795 /* Probe at TEST_ADDR. */
2796 output_asm_insn ("str\txzr, [%0]", xops);
2798 /* Test if TEST_ADDR == LAST_ADDR. */
2799 xops[1] = reg2;
2800 output_asm_insn ("cmp\t%0, %1", xops);
2802 /* Branch. */
2803 fputs ("\tb.ne\t", asm_out_file);
2804 assemble_name_raw (asm_out_file, loop_lab);
2805 fputc ('\n', asm_out_file);
2807 return "";
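/* Assuming reg1 is x9, reg2 is x10 and a 4 KiB probe interval, the loop
   printed above assembles to roughly:
     .LPSRL0:
         sub  x9, x9, #4096
         str  xzr, [x9]
         cmp  x9, x10
         b.ne .LPSRL0  */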
2810 static bool
2811 aarch64_frame_pointer_required (void)
2813 /* In aarch64_override_options_after_change
2814 flag_omit_leaf_frame_pointer turns off the frame pointer by
2815 default. Turn it back on now if we've not got a leaf
2816 function. */
2817 if (flag_omit_leaf_frame_pointer
2818 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2819 return true;
2821 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2822 if (crtl->calls_eh_return)
2823 return true;
2825 return false;
2828 /* Mark the registers that need to be saved by the callee and calculate
2829 the size of the callee-saved registers area and frame record (both FP
2830 and LR may be omitted). */
2831 static void
2832 aarch64_layout_frame (void)
2834 HOST_WIDE_INT offset = 0;
2835 int regno, last_fp_reg = INVALID_REGNUM;
2837 if (reload_completed && cfun->machine->frame.laid_out)
2838 return;
2840 #define SLOT_NOT_REQUIRED (-2)
2841 #define SLOT_REQUIRED (-1)
2843 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2844 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2846 /* First mark all the registers that really need to be saved... */
2847 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2848 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2850 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2851 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2853 /* ... that includes the eh data registers (if needed)... */
2854 if (crtl->calls_eh_return)
2855 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2856 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2857 = SLOT_REQUIRED;
2859 /* ... and any callee saved register that dataflow says is live. */
2860 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2861 if (df_regs_ever_live_p (regno)
2862 && (regno == R30_REGNUM
2863 || !call_used_regs[regno]))
2864 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2866 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2867 if (df_regs_ever_live_p (regno)
2868 && !call_used_regs[regno])
2870 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2871 last_fp_reg = regno;
2874 if (frame_pointer_needed)
2876 /* FP and LR are placed in the linkage record. */
2877 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2878 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2879 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2880 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2881 offset += 2 * UNITS_PER_WORD;
2884 /* Now assign stack slots for them. */
2885 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2886 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2888 cfun->machine->frame.reg_offset[regno] = offset;
2889 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2890 cfun->machine->frame.wb_candidate1 = regno;
2891 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2892 cfun->machine->frame.wb_candidate2 = regno;
2893 offset += UNITS_PER_WORD;
2896 HOST_WIDE_INT max_int_offset = offset;
2897 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2898 bool has_align_gap = offset != max_int_offset;
2900 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2901 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2903 /* If there is an alignment gap between integer and fp callee-saves,
2904 allocate the last fp register to it if possible. */
2905 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2907 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2908 break;
2911 cfun->machine->frame.reg_offset[regno] = offset;
2912 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2913 cfun->machine->frame.wb_candidate1 = regno;
2914 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2915 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2916 cfun->machine->frame.wb_candidate2 = regno;
2917 offset += UNITS_PER_WORD;
2920 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2922 cfun->machine->frame.saved_regs_size = offset;
2924 HOST_WIDE_INT varargs_and_saved_regs_size
2925 = offset + cfun->machine->frame.saved_varargs_size;
2927 cfun->machine->frame.hard_fp_offset
2928 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2929 STACK_BOUNDARY / BITS_PER_UNIT);
2931 cfun->machine->frame.frame_size
2932 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2933 + crtl->outgoing_args_size,
2934 STACK_BOUNDARY / BITS_PER_UNIT);
2936 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2938 cfun->machine->frame.initial_adjust = 0;
2939 cfun->machine->frame.final_adjust = 0;
2940 cfun->machine->frame.callee_adjust = 0;
2941 cfun->machine->frame.callee_offset = 0;
2943 HOST_WIDE_INT max_push_offset = 0;
2944 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2945 max_push_offset = 512;
2946 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2947 max_push_offset = 256;
2949 if (cfun->machine->frame.frame_size < max_push_offset
2950 && crtl->outgoing_args_size == 0)
2952 /* Simple, small frame with no outgoing arguments:
2953 stp reg1, reg2, [sp, -frame_size]!
2954 stp reg3, reg4, [sp, 16] */
2955 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2957 else if ((crtl->outgoing_args_size
2958 + cfun->machine->frame.saved_regs_size < 512)
2959 && !(cfun->calls_alloca
2960 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2962 /* Frame with small outgoing arguments:
2963 sub sp, sp, frame_size
2964 stp reg1, reg2, [sp, outgoing_args_size]
2965 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2966 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2967 cfun->machine->frame.callee_offset
2968 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2970 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2972 /* Frame with large outgoing arguments but a small local area:
2973 stp reg1, reg2, [sp, -hard_fp_offset]!
2974 stp reg3, reg4, [sp, 16]
2975 sub sp, sp, outgoing_args_size */
2976 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2977 cfun->machine->frame.final_adjust
2978 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2980 else if (!frame_pointer_needed
2981 && varargs_and_saved_regs_size < max_push_offset)
2983 /* Frame with large local area and outgoing arguments (this pushes the
2984 callee-saves first, followed by the locals and outgoing area):
2985 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2986 stp reg3, reg4, [sp, 16]
2987 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2988 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2989 cfun->machine->frame.final_adjust
2990 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2991 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2992 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2994 else
2996 /* Frame with large local area and outgoing arguments using frame pointer:
2997 sub sp, sp, hard_fp_offset
2998 stp x29, x30, [sp, 0]
2999 add x29, sp, 0
3000 stp reg3, reg4, [sp, 16]
3001 sub sp, sp, outgoing_args_size */
3002 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3003 cfun->machine->frame.final_adjust
3004 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3007 cfun->machine->frame.laid_out = true;
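/* Worked example (an illustrative sketch): a function that needs a frame
   pointer, saves x29/x30/x19/x20, has 32 bytes of locals and no outgoing
   arguments ends up with saved_regs_size = 32 and
   hard_fp_offset = frame_size = 64.  Since 64 < 512 and there are no
   outgoing arguments, the first shape above is chosen and the prologue is
   roughly:
       stp x29, x30, [sp, -64]!
       add x29, sp, 0
       stp x19, x20, [sp, 16]  */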
3010 /* Return true if the register REGNO is saved on entry to
3011 the current function. */
3013 static bool
3014 aarch64_register_saved_on_entry (int regno)
3016 return cfun->machine->frame.reg_offset[regno] >= 0;
3019 /* Return the next register, from REGNO up to LIMIT, that the callee
3020 needs to save. */
3022 static unsigned
3023 aarch64_next_callee_save (unsigned regno, unsigned limit)
3025 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3026 regno ++;
3027 return regno;
3030 /* Push the register number REGNO of mode MODE to the stack with write-back
3031 adjusting the stack by ADJUSTMENT. */
3033 static void
3034 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3035 HOST_WIDE_INT adjustment)
3037 rtx base_rtx = stack_pointer_rtx;
3038 rtx insn, reg, mem;
3040 reg = gen_rtx_REG (mode, regno);
3041 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3042 plus_constant (Pmode, base_rtx, -adjustment));
3043 mem = gen_rtx_MEM (mode, mem);
3045 insn = emit_move_insn (mem, reg);
3046 RTX_FRAME_RELATED_P (insn) = 1;
3049 /* Generate and return an instruction to store the pair of registers
3050 REG and REG2 of mode MODE to location BASE with write-back adjusting
3051 the stack location BASE by ADJUSTMENT. */
3053 static rtx
3054 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3055 HOST_WIDE_INT adjustment)
3057 switch (mode)
3059 case DImode:
3060 return gen_storewb_pairdi_di (base, base, reg, reg2,
3061 GEN_INT (-adjustment),
3062 GEN_INT (UNITS_PER_WORD - adjustment));
3063 case DFmode:
3064 return gen_storewb_pairdf_di (base, base, reg, reg2,
3065 GEN_INT (-adjustment),
3066 GEN_INT (UNITS_PER_WORD - adjustment));
3067 default:
3068 gcc_unreachable ();
3072 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3073 stack pointer by ADJUSTMENT. */
3075 static void
3076 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3078 rtx_insn *insn;
3079 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3081 if (regno2 == INVALID_REGNUM)
3082 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3084 rtx reg1 = gen_rtx_REG (mode, regno1);
3085 rtx reg2 = gen_rtx_REG (mode, regno2);
3087 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3088 reg2, adjustment));
3089 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3090 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3091 RTX_FRAME_RELATED_P (insn) = 1;
3094 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3095 adjusting it by ADJUSTMENT afterwards. */
3097 static rtx
3098 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3099 HOST_WIDE_INT adjustment)
3101 switch (mode)
3103 case DImode:
3104 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3105 GEN_INT (UNITS_PER_WORD));
3106 case DFmode:
3107 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3108 GEN_INT (UNITS_PER_WORD));
3109 default:
3110 gcc_unreachable ();
3114 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3115 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3116 into CFI_OPS. */
3118 static void
3119 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3120 rtx *cfi_ops)
3122 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3123 rtx reg1 = gen_rtx_REG (mode, regno1);
3125 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3127 if (regno2 == INVALID_REGNUM)
3129 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3130 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3131 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3133 else
3135 rtx reg2 = gen_rtx_REG (mode, regno2);
3136 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3137 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3138 reg2, adjustment));
3142 /* Generate and return a store pair instruction of mode MODE to store
3143 register REG1 to MEM1 and register REG2 to MEM2. */
3145 static rtx
3146 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3147 rtx reg2)
3149 switch (mode)
3151 case DImode:
3152 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3154 case DFmode:
3155 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3157 default:
3158 gcc_unreachable ();
3162 /* Generate and return a load pair instruction of mode MODE to load register
3163 REG1 from MEM1 and register REG2 from MEM2. */
3165 static rtx
3166 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3167 rtx mem2)
3169 switch (mode)
3171 case DImode:
3172 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3174 case DFmode:
3175 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3177 default:
3178 gcc_unreachable ();
3182 /* Return TRUE if return address signing should be enabled for the current
3183 function, otherwise return FALSE. */
3185 bool
3186 aarch64_return_address_signing_enabled (void)
3188 /* This function should only be called after the frame is laid out. */
3189 gcc_assert (cfun->machine->frame.laid_out);
3191 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3192 if its LR is pushed onto the stack. */
3193 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3194 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3195 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
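/* For example, with -msign-return-address=non-leaf a leaf function that
   never saves LR is left unsigned, whereas -msign-return-address=all signs
   every function regardless of whether LR is spilled.  */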
3198 /* Emit code to save the callee-saved registers from register number START
3199 to LIMIT to the stack at the location starting at offset START_OFFSET,
3200 skipping any write-back candidates if SKIP_WB is true. */
3202 static void
3203 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3204 unsigned start, unsigned limit, bool skip_wb)
3206 rtx_insn *insn;
3207 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3208 ? gen_frame_mem : gen_rtx_MEM);
3209 unsigned regno;
3210 unsigned regno2;
3212 for (regno = aarch64_next_callee_save (start, limit);
3213 regno <= limit;
3214 regno = aarch64_next_callee_save (regno + 1, limit))
3216 rtx reg, mem;
3217 HOST_WIDE_INT offset;
3219 if (skip_wb
3220 && (regno == cfun->machine->frame.wb_candidate1
3221 || regno == cfun->machine->frame.wb_candidate2))
3222 continue;
3224 if (cfun->machine->reg_is_wrapped_separately[regno])
3225 continue;
3227 reg = gen_rtx_REG (mode, regno);
3228 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3229 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3230 offset));
3232 regno2 = aarch64_next_callee_save (regno + 1, limit);
3234 if (regno2 <= limit
3235 && !cfun->machine->reg_is_wrapped_separately[regno2]
3236 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3237 == cfun->machine->frame.reg_offset[regno2]))
3240 rtx reg2 = gen_rtx_REG (mode, regno2);
3241 rtx mem2;
3243 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3244 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3245 offset));
3246 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3247 reg2));
3249 /* The first part of a frame-related parallel insn is
3250 always assumed to be relevant to the frame
3251 calculations; subsequent parts are only
3252 frame-related if explicitly marked. */
3253 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3254 regno = regno2;
3256 else
3257 insn = emit_move_insn (mem, reg);
3259 RTX_FRAME_RELATED_P (insn) = 1;
3263 /* Emit code to restore the callee registers of mode MODE from register
3264 number START up to and including LIMIT. Restore from the stack offset
3265 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3266 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3268 static void
3269 aarch64_restore_callee_saves (machine_mode mode,
3270 HOST_WIDE_INT start_offset, unsigned start,
3271 unsigned limit, bool skip_wb, rtx *cfi_ops)
3273 rtx base_rtx = stack_pointer_rtx;
3274 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3275 ? gen_frame_mem : gen_rtx_MEM);
3276 unsigned regno;
3277 unsigned regno2;
3278 HOST_WIDE_INT offset;
3280 for (regno = aarch64_next_callee_save (start, limit);
3281 regno <= limit;
3282 regno = aarch64_next_callee_save (regno + 1, limit))
3284 if (cfun->machine->reg_is_wrapped_separately[regno])
3285 continue;
3287 rtx reg, mem;
3289 if (skip_wb
3290 && (regno == cfun->machine->frame.wb_candidate1
3291 || regno == cfun->machine->frame.wb_candidate2))
3292 continue;
3294 reg = gen_rtx_REG (mode, regno);
3295 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3296 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3298 regno2 = aarch64_next_callee_save (regno + 1, limit);
3300 if (regno2 <= limit
3301 && !cfun->machine->reg_is_wrapped_separately[regno2]
3302 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3303 == cfun->machine->frame.reg_offset[regno2]))
3305 rtx reg2 = gen_rtx_REG (mode, regno2);
3306 rtx mem2;
3308 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3309 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3310 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3312 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3313 regno = regno2;
3315 else
3316 emit_move_insn (reg, mem);
3317 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3321 static inline bool
3322 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3323 HOST_WIDE_INT offset)
3325 return offset >= -256 && offset < 256;
3328 static inline bool
3329 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3331 return (offset >= 0
3332 && offset < 4096 * GET_MODE_SIZE (mode)
3333 && offset % GET_MODE_SIZE (mode) == 0);
3336 bool
3337 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3339 return (offset >= -64 * GET_MODE_SIZE (mode)
3340 && offset < 64 * GET_MODE_SIZE (mode)
3341 && offset % GET_MODE_SIZE (mode) == 0);
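/* For DImode (8-byte) accesses these ranges work out to:
     7-bit signed, scaled:     -512 .. 504 in steps of 8  (LDP/STP offsets)
     9-bit signed, unscaled:   -256 .. 255                (LDUR/STUR offsets)
     12-bit unsigned, scaled:  0 .. 32760 in steps of 8   (LDR/STR offsets)  */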
3344 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3346 static sbitmap
3347 aarch64_get_separate_components (void)
3349 aarch64_layout_frame ();
3351 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3352 bitmap_clear (components);
3354 /* The registers we need saved to the frame. */
3355 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3356 if (aarch64_register_saved_on_entry (regno))
3358 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3359 if (!frame_pointer_needed)
3360 offset += cfun->machine->frame.frame_size
3361 - cfun->machine->frame.hard_fp_offset;
3362 /* Check that we can access the stack slot of the register with one
3363 direct load with no adjustments needed. */
3364 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3365 bitmap_set_bit (components, regno);
3368 /* Don't mess with the hard frame pointer. */
3369 if (frame_pointer_needed)
3370 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3372 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3373 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3374 /* If aarch64_layout_frame has chosen registers to store/restore with
3375 writeback don't interfere with them to avoid having to output explicit
3376 stack adjustment instructions. */
3377 if (reg2 != INVALID_REGNUM)
3378 bitmap_clear_bit (components, reg2);
3379 if (reg1 != INVALID_REGNUM)
3380 bitmap_clear_bit (components, reg1);
3382 bitmap_clear_bit (components, LR_REGNUM);
3383 bitmap_clear_bit (components, SP_REGNUM);
3385 return components;
3388 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3390 static sbitmap
3391 aarch64_components_for_bb (basic_block bb)
3393 bitmap in = DF_LIVE_IN (bb);
3394 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3395 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3397 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3398 bitmap_clear (components);
3400 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3401 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3402 if ((!call_used_regs[regno])
3403 && (bitmap_bit_p (in, regno)
3404 || bitmap_bit_p (gen, regno)
3405 || bitmap_bit_p (kill, regno)))
3406 bitmap_set_bit (components, regno);
3408 return components;
3411 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3412 Nothing to do for aarch64. */
3414 static void
3415 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3419 /* Return the next set bit in BMP from START onwards. Return the total number
3420 of bits in BMP if no set bit is found at or after START. */
3422 static unsigned int
3423 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3425 unsigned int nbits = SBITMAP_SIZE (bmp);
3426 if (start == nbits)
3427 return start;
3429 gcc_assert (start < nbits);
3430 for (unsigned int i = start; i < nbits; i++)
3431 if (bitmap_bit_p (bmp, i))
3432 return i;
3434 return nbits;
3437 /* Do the work for aarch64_emit_prologue_components and
3438 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3439 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3440 for these components or the epilogue sequence. That is, it determines
3441 whether we should emit stores or loads and what kind of CFA notes to attach
3442 to the insns. Otherwise the logic for the two sequences is very
3443 similar. */
3445 static void
3446 aarch64_process_components (sbitmap components, bool prologue_p)
3448 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3449 ? HARD_FRAME_POINTER_REGNUM
3450 : STACK_POINTER_REGNUM);
3452 unsigned last_regno = SBITMAP_SIZE (components);
3453 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3454 rtx_insn *insn = NULL;
3456 while (regno != last_regno)
3458 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3459 so DFmode for the vector registers is enough. */
3460 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3461 rtx reg = gen_rtx_REG (mode, regno);
3462 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3463 if (!frame_pointer_needed)
3464 offset += cfun->machine->frame.frame_size
3465 - cfun->machine->frame.hard_fp_offset;
3466 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3467 rtx mem = gen_frame_mem (mode, addr);
3469 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3470 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3471 /* No more registers to handle after REGNO.
3472 Emit a single save/restore and exit. */
3473 if (regno2 == last_regno)
3475 insn = emit_insn (set);
3476 RTX_FRAME_RELATED_P (insn) = 1;
3477 if (prologue_p)
3478 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3479 else
3480 add_reg_note (insn, REG_CFA_RESTORE, reg);
3481 break;
3484 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3485 /* The next register is not of the same class or its offset is not
3486 mergeable with the current one into a pair. */
3487 if (!satisfies_constraint_Ump (mem)
3488 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3489 || (offset2 - cfun->machine->frame.reg_offset[regno])
3490 != GET_MODE_SIZE (mode))
3492 insn = emit_insn (set);
3493 RTX_FRAME_RELATED_P (insn) = 1;
3494 if (prologue_p)
3495 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3496 else
3497 add_reg_note (insn, REG_CFA_RESTORE, reg);
3499 regno = regno2;
3500 continue;
3503 /* REGNO2 can be saved/restored in a pair with REGNO. */
3504 rtx reg2 = gen_rtx_REG (mode, regno2);
3505 if (!frame_pointer_needed)
3506 offset2 += cfun->machine->frame.frame_size
3507 - cfun->machine->frame.hard_fp_offset;
3508 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3509 rtx mem2 = gen_frame_mem (mode, addr2);
3510 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3511 : gen_rtx_SET (reg2, mem2);
3513 if (prologue_p)
3514 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3515 else
3516 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3518 RTX_FRAME_RELATED_P (insn) = 1;
3519 if (prologue_p)
3521 add_reg_note (insn, REG_CFA_OFFSET, set);
3522 add_reg_note (insn, REG_CFA_OFFSET, set2);
3524 else
3526 add_reg_note (insn, REG_CFA_RESTORE, reg);
3527 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3530 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3534 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3536 static void
3537 aarch64_emit_prologue_components (sbitmap components)
3539 aarch64_process_components (components, true);
3542 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3544 static void
3545 aarch64_emit_epilogue_components (sbitmap components)
3547 aarch64_process_components (components, false);
3550 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3552 static void
3553 aarch64_set_handled_components (sbitmap components)
3555 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3556 if (bitmap_bit_p (components, regno))
3557 cfun->machine->reg_is_wrapped_separately[regno] = true;
3560 /* AArch64 stack frames generated by this compiler look like:
3562 +-------------------------------+
3564 | incoming stack arguments |
3566 +-------------------------------+
3567 | | <-- incoming stack pointer (aligned)
3568 | callee-allocated save area |
3569 | for register varargs |
3571 +-------------------------------+
3572 | local variables | <-- frame_pointer_rtx
3574 +-------------------------------+
3575 | padding0 | \
3576 +-------------------------------+ |
3577 | callee-saved registers | | frame.saved_regs_size
3578 +-------------------------------+ |
3579 | LR' | |
3580 +-------------------------------+ |
3581 | FP' | / <- hard_frame_pointer_rtx (aligned)
3582 +-------------------------------+
3583 | dynamic allocation |
3584 +-------------------------------+
3585 | padding |
3586 +-------------------------------+
3587 | outgoing stack arguments | <-- arg_pointer
3589 +-------------------------------+
3590 | | <-- stack_pointer_rtx (aligned)
3592 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3593 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3594 unchanged. */
3596 /* Generate the prologue instructions for entry into a function.
3597 Establish the stack frame by decreasing the stack pointer with a
3598 properly calculated size and, if necessary, create a frame record
3599 filled with the values of LR and previous frame pointer. The
3600 current FP is also set up if it is in use. */
3602 void
3603 aarch64_expand_prologue (void)
3605 aarch64_layout_frame ();
3607 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3608 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3609 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3610 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3611 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3612 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3613 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3614 rtx_insn *insn;
3616 /* Sign return address for functions. */
3617 if (aarch64_return_address_signing_enabled ())
3619 insn = emit_insn (gen_pacisp ());
3620 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3621 RTX_FRAME_RELATED_P (insn) = 1;
3624 if (flag_stack_usage_info)
3625 current_function_static_stack_size = frame_size;
3627 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3629 if (crtl->is_leaf && !cfun->calls_alloca)
3631 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3632 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3633 frame_size - STACK_CHECK_PROTECT);
3635 else if (frame_size > 0)
3636 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3639 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3641 if (callee_adjust != 0)
3642 aarch64_push_regs (reg1, reg2, callee_adjust);
3644 if (frame_pointer_needed)
3646 if (callee_adjust == 0)
3647 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3648 R30_REGNUM, false);
3649 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3650 stack_pointer_rtx,
3651 GEN_INT (callee_offset)));
3652 RTX_FRAME_RELATED_P (insn) = 1;
3653 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3656 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3657 callee_adjust != 0 || frame_pointer_needed);
3658 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3659 callee_adjust != 0 || frame_pointer_needed);
3660 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3663 /* Return TRUE if we can use a simple_return insn.
3665 This function checks whether the callee-saved stack area is empty, which
3666 means no restore actions are needed. The pro_and_epilogue pass will use
3667 this to check whether the shrink-wrapping optimization is feasible. */
3669 bool
3670 aarch64_use_return_insn_p (void)
3672 if (!reload_completed)
3673 return false;
3675 if (crtl->profile)
3676 return false;
3678 aarch64_layout_frame ();
3680 return cfun->machine->frame.frame_size == 0;
3683 /* Generate the epilogue instructions for returning from a function.
3684 This is almost exactly the reverse of the prologue sequence, except
3685 that we need to insert barriers to avoid scheduling loads that read
3686 from a deallocated stack, and we optimize the unwind records by
3687 emitting them all together if possible. */
3688 void
3689 aarch64_expand_epilogue (bool for_sibcall)
3691 aarch64_layout_frame ();
3693 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3694 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3695 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3696 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3697 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3698 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3699 rtx cfi_ops = NULL;
3700 rtx_insn *insn;
3702 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3703 bool need_barrier_p = (get_frame_size ()
3704 + cfun->machine->frame.saved_varargs_size) != 0;
3706 /* Emit a barrier to prevent loads from a deallocated stack. */
3707 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3708 || crtl->calls_eh_return)
3710 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3711 need_barrier_p = false;
3714 /* Restore the stack pointer from the frame pointer if it may not
3715 be the same as the stack pointer. */
3716 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3718 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3719 hard_frame_pointer_rtx,
3720 GEN_INT (-callee_offset)));
3721 /* If writeback is used when restoring callee-saves, the CFA
3722 is restored on the instruction doing the writeback. */
3723 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3725 else
3726 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3728 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3729 callee_adjust != 0, &cfi_ops);
3730 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3731 callee_adjust != 0, &cfi_ops);
3733 if (need_barrier_p)
3734 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3736 if (callee_adjust != 0)
3737 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3739 if (callee_adjust != 0 || initial_adjust > 65536)
3741 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3742 insn = get_last_insn ();
3743 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3744 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3745 RTX_FRAME_RELATED_P (insn) = 1;
3746 cfi_ops = NULL;
3749 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3751 if (cfi_ops)
3753 /* Emit delayed restores and reset the CFA to be SP. */
3754 insn = get_last_insn ();
3755 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3756 REG_NOTES (insn) = cfi_ops;
3757 RTX_FRAME_RELATED_P (insn) = 1;
3760 /* We prefer to emit the combined return/authenticate instruction RETAA,
3761 however there are three cases in which we must instead emit an explicit
3762 authentication instruction.
3764 1) Sibcalls don't return in a normal way, so if we're about to call one
3765 we must authenticate.
3767 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3768 generating code for !TARGET_ARMV8_3 we can't use it and must
3769 explicitly authenticate.
3771 3) On an eh_return path we make extra stack adjustments to update the
3772 canonical frame address to be the exception handler's CFA. We want
3773 to authenticate using the CFA of the function which calls eh_return. */
3775 if (aarch64_return_address_signing_enabled ()
3776 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3778 insn = emit_insn (gen_autisp ());
3779 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3780 RTX_FRAME_RELATED_P (insn) = 1;
3783 /* Stack adjustment for exception handler. */
3784 if (crtl->calls_eh_return)
3786 /* We need to unwind the stack by the offset computed by
3787 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3788 to be SP; letting the CFA move during this adjustment
3789 is just as correct as retaining the CFA from the body
3790 of the function. Therefore, do nothing special. */
3791 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3794 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3795 if (!for_sibcall)
3796 emit_jump_insn (ret_rtx);
3799 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3800 normally or return to a previous frame after unwinding.
3802 An EH return uses a single shared return sequence. The epilogue is
3803 exactly like a normal epilogue except that it has an extra input
3804 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3805 that must be applied after the frame has been destroyed. An extra label
3806 is inserted before the epilogue which initializes this register to zero,
3807 and this is the entry point for a normal return.
3809 An actual EH return updates the return address, initializes the stack
3810 adjustment and jumps directly into the epilogue (bypassing the zeroing
3811 of the adjustment). Since the return address is typically saved on the
3812 stack when a function makes a call, the saved LR must be updated outside
3813 the epilogue.
3815 This poses problems as the store is generated well before the epilogue,
3816 so the offset of LR is not known yet. Also optimizations will remove the
3817 store as it appears dead, even after the epilogue is generated (as the
3818 base or offset for loading LR is different in many cases).
3820 To avoid these problems this implementation forces the frame pointer
3821 in eh_return functions so that the location of LR is fixed and known early.
3822 It also marks the store volatile, so no optimization is permitted to
3823 remove the store. */
3825 aarch64_eh_return_handler_rtx (void)
3827 rtx tmp = gen_frame_mem (Pmode,
3828 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3830 /* Mark the store volatile, so no optimization is permitted to remove it. */
3831 MEM_VOLATILE_P (tmp) = true;
3832 return tmp;
3835 /* Output code to add DELTA to the first argument, and then jump
3836 to FUNCTION. Used for C++ multiple inheritance. */
3837 static void
3838 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3839 HOST_WIDE_INT delta,
3840 HOST_WIDE_INT vcall_offset,
3841 tree function)
3843 /* The this pointer is always in x0. Note that this differs from
3844 Arm where the this pointer may be bumped to r1 if r0 is required
3845 to return a pointer to an aggregate. On AArch64 a result value
3846 pointer will be in x8. */
3847 int this_regno = R0_REGNUM;
3848 rtx this_rtx, temp0, temp1, addr, funexp;
3849 rtx_insn *insn;
3851 reload_completed = 1;
3852 emit_note (NOTE_INSN_PROLOGUE_END);
3854 if (vcall_offset == 0)
3855 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3856 else
3858 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3860 this_rtx = gen_rtx_REG (Pmode, this_regno);
3861 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3862 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3864 addr = this_rtx;
3865 if (delta != 0)
3867 if (delta >= -256 && delta < 256)
3868 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3869 plus_constant (Pmode, this_rtx, delta));
3870 else
3871 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3874 if (Pmode == ptr_mode)
3875 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3876 else
3877 aarch64_emit_move (temp0,
3878 gen_rtx_ZERO_EXTEND (Pmode,
3879 gen_rtx_MEM (ptr_mode, addr)));
3881 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3882 addr = plus_constant (Pmode, temp0, vcall_offset);
3883 else
3885 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3886 Pmode);
3887 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3890 if (Pmode == ptr_mode)
3891 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3892 else
3893 aarch64_emit_move (temp1,
3894 gen_rtx_SIGN_EXTEND (Pmode,
3895 gen_rtx_MEM (ptr_mode, addr)));
3897 emit_insn (gen_add2_insn (this_rtx, temp1));
3900 /* Generate a tail call to the target function. */
3901 if (!TREE_USED (function))
3903 assemble_external (function);
3904 TREE_USED (function) = 1;
3906 funexp = XEXP (DECL_RTL (function), 0);
3907 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3908 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3909 SIBLING_CALL_P (insn) = 1;
3911 insn = get_insns ();
3912 shorten_branches (insn);
3913 final_start_function (insn, file, 1);
3914 final (insn, file, 1);
3915 final_end_function ();
3917 /* Stop pretending to be a post-reload pass. */
3918 reload_completed = 0;
3921 static bool
3922 aarch64_tls_referenced_p (rtx x)
3924 if (!TARGET_HAVE_TLS)
3925 return false;
3926 subrtx_iterator::array_type array;
3927 FOR_EACH_SUBRTX (iter, array, x, ALL)
3929 const_rtx x = *iter;
3930 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3931 return true;
3932 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3933 TLS offsets, not real symbol references. */
3934 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3935 iter.skip_subrtxes ();
3937 return false;
3941 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3942 a left shift of 0 or 12 bits. */
3943 bool
3944 aarch64_uimm12_shift (HOST_WIDE_INT val)
3946 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3947 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
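As a worked illustration (not from the source): the two accepted forms above are the ADD/SUB immediate encodings, a 12-bit value optionally shifted left by 12 bits. So 0xabc (low field only) and 0xabc000 (high field only) both pass, while 0x1001 fails because its set bits straddle the two fields and would, for example, need two separate ADDs.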
3952 /* Return true if val is an immediate that can be loaded into a
3953 register by a MOVZ instruction. */
3954 static bool
3955 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3957 if (GET_MODE_SIZE (mode) > 4)
3959 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3960 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3961 return 1;
3963 else
3965 /* Ignore sign extension. */
3966 val &= (HOST_WIDE_INT) 0xffffffff;
3968 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3969 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
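A brief illustration (not from the source): aarch64_movw_imm accepts any value whose set bits all fall inside a single 16-bit field at bit 0, 16, 32 or 48, which is what one MOVZ can materialise. For instance 0x12340000 is accepted (the field at bits 16-31), while 0x12345 is rejected because its bits span two fields. For sub-DImode values the sign-extended copy is first masked to 32 bits, so only the two low field positions are considered.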
3972 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3974 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3976 0x0000000100000001ull,
3977 0x0001000100010001ull,
3978 0x0101010101010101ull,
3979 0x1111111111111111ull,
3980 0x5555555555555555ull,
3984 /* Return true if val is a valid bitmask immediate. */
3986 bool
3987 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3989 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3990 int bits;
3992 /* Check for a single sequence of one bits and return quickly if so.
3993 The special cases of all ones and all zeroes return false. */
3994 val = (unsigned HOST_WIDE_INT) val_in;
3995 tmp = val + (val & -val);
3997 if (tmp == (tmp & -tmp))
3998 return (val + 1) > 1;
4000 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4001 if (mode == SImode)
4002 val = (val << 32) | (val & 0xffffffff);
4004 /* Invert if the immediate doesn't start with a zero bit - this means we
4005 only need to search for sequences of one bits. */
4006 if (val & 1)
4007 val = ~val;
4009 /* Find the first set bit and set tmp to val with the first sequence of one
4010 bits removed. Return success if there is a single sequence of ones. */
4011 first_one = val & -val;
4012 tmp = val & (val + first_one);
4014 if (tmp == 0)
4015 return true;
4017 /* Find the next set bit and compute the difference in bit position. */
4018 next_one = tmp & -tmp;
4019 bits = clz_hwi (first_one) - clz_hwi (next_one);
4020 mask = val ^ tmp;
4022 /* Check the bit position difference is a power of 2, and that the first
4023 sequence of one bits fits within 'bits' bits. */
4024 if ((mask >> bits) != 0 || bits != (bits & -bits))
4025 return false;
4027 /* Check the sequence of one bits is repeated 64/bits times. */
4028 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
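The check above is an O(1) test; as a cross-check, here is a minimal standalone brute-force sketch (illustrative only, not part of GCC) of the same set of DImode values, built directly from the architectural definition of a logical immediate: a run of 1 <= S < E ones, rotated within an element of E in {2, 4, 8, 16, 32, 64} bits and replicated across the register.

#include <stdbool.h>
#include <stdint.h>

/* Return true if VAL is an AArch64 logical (bitmask) immediate, by brute
   force over element size, run length and rotation.  */
static bool
is_logical_imm64 (uint64_t val)
{
  for (unsigned esize = 2; esize <= 64; esize *= 2)
    {
      uint64_t emask = esize == 64 ? ~UINT64_C (0)
                                   : (UINT64_C (1) << esize) - 1;
      for (unsigned ones = 1; ones < esize; ones++)
        for (unsigned rot = 0; rot < esize; rot++)
          {
            /* A run of ONES ones, rotated right by ROT within the element.  */
            uint64_t elt = (UINT64_C (1) << ones) - 1;
            elt = ((elt >> rot) | (elt << ((esize - rot) & 63))) & emask;
            /* Replicate the element across all 64 bits and compare.  */
            uint64_t rep = 0;
            for (unsigned i = 0; i < 64; i += esize)
              rep |= elt << i;
            if (rep == val)
              return true;
          }
    }
  return false;
}

For example, is_logical_imm64 (0x0f0f0f0f0f0f0f0f) is true (a width-8 element of 0x0f), while 0, ~0 and 0x12345678 are all false, matching what aarch64_bitmask_imm above accepts for DImode.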
4031 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4032 Assumed precondition: VAL_IN is not zero. */
4034 unsigned HOST_WIDE_INT
4035 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4037 int lowest_bit_set = ctz_hwi (val_in);
4038 int highest_bit_set = floor_log2 (val_in);
4039 gcc_assert (val_in != 0);
4041 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4042 (HOST_WIDE_INT_1U << lowest_bit_set));
4045 /* Create a constant in which every bit outside the lowest-set-bit to
4046 highest-set-bit range of VAL_IN is set to 1. */
4048 unsigned HOST_WIDE_INT
4049 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4051 return val_in | ~aarch64_and_split_imm1 (val_in);
4054 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4056 bool
4057 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4059 if (aarch64_bitmask_imm (val_in, mode))
4060 return false;
4062 if (aarch64_move_imm (val_in, mode))
4063 return false;
4065 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4067 return aarch64_bitmask_imm (imm2, mode);
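A worked example (illustrative): VAL_IN = 0x00ff00f0 is neither a bitmask immediate (it has two separate runs of ones) nor a MOV immediate. aarch64_and_split_imm1 gives imm1 = 0x0000000000fffff0, the single run covering bits 4 through 23, and aarch64_and_split_imm2 gives imm2 = val | ~imm1 = 0xffffffffffff00ff. Both are valid bitmask immediates, and imm1 & imm2 == 0x00ff00f0, so x & 0x00ff00f0 can be performed as two immediate ANDs, (x & imm1) & imm2, which is what this predicate enables.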
4070 /* Return true if val is an immediate that can be loaded into a
4071 register in a single instruction. */
4072 bool
4073 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4075 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4076 return 1;
4077 return aarch64_bitmask_imm (val, mode);
4080 static bool
4081 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4083 rtx base, offset;
4085 if (GET_CODE (x) == HIGH)
4086 return true;
4088 split_const (x, &base, &offset);
4089 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4091 if (aarch64_classify_symbol (base, offset)
4092 != SYMBOL_FORCE_TO_MEM)
4093 return true;
4094 else
4095 /* Avoid generating a 64-bit relocation in ILP32; leave it
4096 to aarch64_expand_mov_immediate to handle properly. */
4097 return mode != ptr_mode;
4100 return aarch64_tls_referenced_p (x);
4103 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4104 The expansion for a table switch is quite expensive due to the number
4105 of instructions, the table lookup and the hard-to-predict indirect jump.
4106 When optimizing for speed with -O3 enabled, use the per-core tuning if
4107 it is set; otherwise use tables for more than 16 cases as a tradeoff
4108 between size and performance. When optimizing for size, use the default setting. */
4110 static unsigned int
4111 aarch64_case_values_threshold (void)
4113 /* Use the specified limit for the number of cases before using jump
4114 tables at higher optimization levels. */
4115 if (optimize > 2
4116 && selected_cpu->tune->max_case_values != 0)
4117 return selected_cpu->tune->max_case_values;
4118 else
4119 return optimize_size ? default_case_values_threshold () : 17;
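In practical terms (an aside, not from the source): the value returned here is the smallest number of case values for which a jump table is preferred over a chain of conditional branches, so at -O3 on a core without a tuned max_case_values, and when not optimizing for size, a switch needs 17 or more distinct cases before it is expanded through a table.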
4122 /* Return true if register REGNO is a valid index register.
4123 STRICT_P is true if REG_OK_STRICT is in effect. */
4125 bool
4126 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4128 if (!HARD_REGISTER_NUM_P (regno))
4130 if (!strict_p)
4131 return true;
4133 if (!reg_renumber)
4134 return false;
4136 regno = reg_renumber[regno];
4138 return GP_REGNUM_P (regno);
4141 /* Return true if register REGNO is a valid base register for mode MODE.
4142 STRICT_P is true if REG_OK_STRICT is in effect. */
4144 bool
4145 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4147 if (!HARD_REGISTER_NUM_P (regno))
4149 if (!strict_p)
4150 return true;
4152 if (!reg_renumber)
4153 return false;
4155 regno = reg_renumber[regno];
4158 /* The fake registers will be eliminated to either the stack or
4159 hard frame pointer, both of which are usually valid base registers.
4160 Reload deals with the cases where the eliminated form isn't valid. */
4161 return (GP_REGNUM_P (regno)
4162 || regno == SP_REGNUM
4163 || regno == FRAME_POINTER_REGNUM
4164 || regno == ARG_POINTER_REGNUM);
4167 /* Return true if X is a valid base register for mode MODE.
4168 STRICT_P is true if REG_OK_STRICT is in effect. */
4170 static bool
4171 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4173 if (!strict_p && GET_CODE (x) == SUBREG)
4174 x = SUBREG_REG (x);
4176 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4179 /* Return true if address offset is a valid index. If it is, fill in INFO
4180 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4182 static bool
4183 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4184 machine_mode mode, bool strict_p)
4186 enum aarch64_address_type type;
4187 rtx index;
4188 int shift;
4190 /* (reg:P) */
4191 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4192 && GET_MODE (x) == Pmode)
4194 type = ADDRESS_REG_REG;
4195 index = x;
4196 shift = 0;
4198 /* (sign_extend:DI (reg:SI)) */
4199 else if ((GET_CODE (x) == SIGN_EXTEND
4200 || GET_CODE (x) == ZERO_EXTEND)
4201 && GET_MODE (x) == DImode
4202 && GET_MODE (XEXP (x, 0)) == SImode)
4204 type = (GET_CODE (x) == SIGN_EXTEND)
4205 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4206 index = XEXP (x, 0);
4207 shift = 0;
4209 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4210 else if (GET_CODE (x) == MULT
4211 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4212 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4213 && GET_MODE (XEXP (x, 0)) == DImode
4214 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4215 && CONST_INT_P (XEXP (x, 1)))
4217 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4218 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4219 index = XEXP (XEXP (x, 0), 0);
4220 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4222 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4223 else if (GET_CODE (x) == ASHIFT
4224 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4225 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4226 && GET_MODE (XEXP (x, 0)) == DImode
4227 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4228 && CONST_INT_P (XEXP (x, 1)))
4230 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4231 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4232 index = XEXP (XEXP (x, 0), 0);
4233 shift = INTVAL (XEXP (x, 1));
4235 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4236 else if ((GET_CODE (x) == SIGN_EXTRACT
4237 || GET_CODE (x) == ZERO_EXTRACT)
4238 && GET_MODE (x) == DImode
4239 && GET_CODE (XEXP (x, 0)) == MULT
4240 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4241 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4243 type = (GET_CODE (x) == SIGN_EXTRACT)
4244 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4245 index = XEXP (XEXP (x, 0), 0);
4246 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4247 if (INTVAL (XEXP (x, 1)) != 32 + shift
4248 || INTVAL (XEXP (x, 2)) != 0)
4249 shift = -1;
4251 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4252 (const_int 0xffffffff<<shift)) */
4253 else if (GET_CODE (x) == AND
4254 && GET_MODE (x) == DImode
4255 && GET_CODE (XEXP (x, 0)) == MULT
4256 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4257 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4258 && CONST_INT_P (XEXP (x, 1)))
4260 type = ADDRESS_REG_UXTW;
4261 index = XEXP (XEXP (x, 0), 0);
4262 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4263 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4264 shift = -1;
4266 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4267 else if ((GET_CODE (x) == SIGN_EXTRACT
4268 || GET_CODE (x) == ZERO_EXTRACT)
4269 && GET_MODE (x) == DImode
4270 && GET_CODE (XEXP (x, 0)) == ASHIFT
4271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4272 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4274 type = (GET_CODE (x) == SIGN_EXTRACT)
4275 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4276 index = XEXP (XEXP (x, 0), 0);
4277 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4278 if (INTVAL (XEXP (x, 1)) != 32 + shift
4279 || INTVAL (XEXP (x, 2)) != 0)
4280 shift = -1;
4282 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4283 (const_int 0xffffffff<<shift)) */
4284 else if (GET_CODE (x) == AND
4285 && GET_MODE (x) == DImode
4286 && GET_CODE (XEXP (x, 0)) == ASHIFT
4287 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4288 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4289 && CONST_INT_P (XEXP (x, 1)))
4291 type = ADDRESS_REG_UXTW;
4292 index = XEXP (XEXP (x, 0), 0);
4293 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4294 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4295 shift = -1;
4297 /* (mult:P (reg:P) (const_int scale)) */
4298 else if (GET_CODE (x) == MULT
4299 && GET_MODE (x) == Pmode
4300 && GET_MODE (XEXP (x, 0)) == Pmode
4301 && CONST_INT_P (XEXP (x, 1)))
4303 type = ADDRESS_REG_REG;
4304 index = XEXP (x, 0);
4305 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4307 /* (ashift:P (reg:P) (const_int shift)) */
4308 else if (GET_CODE (x) == ASHIFT
4309 && GET_MODE (x) == Pmode
4310 && GET_MODE (XEXP (x, 0)) == Pmode
4311 && CONST_INT_P (XEXP (x, 1)))
4313 type = ADDRESS_REG_REG;
4314 index = XEXP (x, 0);
4315 shift = INTVAL (XEXP (x, 1));
4317 else
4318 return false;
4320 if (GET_CODE (index) == SUBREG)
4321 index = SUBREG_REG (index);
4323 if ((shift == 0 ||
4324 (shift > 0 && shift <= 3
4325 && (1 << shift) == GET_MODE_SIZE (mode)))
4326 && REG_P (index)
4327 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4329 info->type = type;
4330 info->offset = index;
4331 info->shift = shift;
4332 return true;
4335 return false;
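A note on the final check (illustrative): the scale is only accepted when (1 << shift) equals the access size, so e.g. a DImode access allows shift 0 or 3, matching the [base, Xm] and [base, Xm, lsl #3] (or sxtw/uxtw #3) register-offset forms, whereas a shift of 2 for a DImode access is rejected and must be synthesised separately.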
4338 /* Return true if MODE is one of the modes for which we
4339 support LDP/STP operations. */
4341 static bool
4342 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4344 return mode == SImode || mode == DImode
4345 || mode == SFmode || mode == DFmode
4346 || (aarch64_vector_mode_supported_p (mode)
4347 && GET_MODE_SIZE (mode) == 8);
4350 /* Return true if REGNO is a virtual pointer register, or an eliminable
4351 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4352 include stack_pointer or hard_frame_pointer. */
4353 static bool
4354 virt_or_elim_regno_p (unsigned regno)
4356 return ((regno >= FIRST_VIRTUAL_REGISTER
4357 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4358 || regno == FRAME_POINTER_REGNUM
4359 || regno == ARG_POINTER_REGNUM);
4362 /* Return true if X is a valid address for machine mode MODE. If it is,
4363 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4364 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4366 static bool
4367 aarch64_classify_address (struct aarch64_address_info *info,
4368 rtx x, machine_mode mode,
4369 RTX_CODE outer_code, bool strict_p)
4371 enum rtx_code code = GET_CODE (x);
4372 rtx op0, op1;
4374 /* On BE, we use load/store pair for all large int mode load/stores.
4375 TI/TFmode may also use a load/store pair. */
4376 bool load_store_pair_p = (outer_code == PARALLEL
4377 || mode == TImode
4378 || mode == TFmode
4379 || (BYTES_BIG_ENDIAN
4380 && aarch64_vect_struct_mode_p (mode)));
4382 bool allow_reg_index_p =
4383 !load_store_pair_p
4384 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4385 && !aarch64_vect_struct_mode_p (mode);
4387 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4388 REG addressing. */
4389 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4390 && (code != POST_INC && code != REG))
4391 return false;
4393 switch (code)
4395 case REG:
4396 case SUBREG:
4397 info->type = ADDRESS_REG_IMM;
4398 info->base = x;
4399 info->offset = const0_rtx;
4400 return aarch64_base_register_rtx_p (x, strict_p);
4402 case PLUS:
4403 op0 = XEXP (x, 0);
4404 op1 = XEXP (x, 1);
4406 if (! strict_p
4407 && REG_P (op0)
4408 && virt_or_elim_regno_p (REGNO (op0))
4409 && CONST_INT_P (op1))
4411 info->type = ADDRESS_REG_IMM;
4412 info->base = op0;
4413 info->offset = op1;
4415 return true;
4418 if (GET_MODE_SIZE (mode) != 0
4419 && CONST_INT_P (op1)
4420 && aarch64_base_register_rtx_p (op0, strict_p))
4422 HOST_WIDE_INT offset = INTVAL (op1);
4424 info->type = ADDRESS_REG_IMM;
4425 info->base = op0;
4426 info->offset = op1;
4428 /* TImode and TFmode values are allowed in both pairs of X
4429 registers and individual Q registers. The available
4430 address modes are:
4431 X,X: 7-bit signed scaled offset
4432 Q: 9-bit signed offset
4433 We conservatively require an offset representable in either mode.
4434 When performing the check for pairs of X registers i.e. LDP/STP
4435 pass down DImode since that is the natural size of the LDP/STP
4436 instruction memory accesses. */
4437 if (mode == TImode || mode == TFmode)
4438 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4439 && (offset_9bit_signed_unscaled_p (mode, offset)
4440 || offset_12bit_unsigned_scaled_p (mode, offset)));
4442 /* A 7-bit offset check because OImode will emit an ldp/stp
4443 instruction (only big endian will get here).
4444 For ldp/stp instructions, the offset is scaled for the size of a
4445 single element of the pair. */
4446 if (mode == OImode)
4447 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4449 /* Three 9/12-bit offset checks because CImode will emit three
4450 ldr/str instructions (only big endian will get here). */
4451 if (mode == CImode)
4452 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4453 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4454 || offset_12bit_unsigned_scaled_p (V16QImode,
4455 offset + 32)));
4457 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4458 instructions (only big endian will get here). */
4459 if (mode == XImode)
4460 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4461 && aarch64_offset_7bit_signed_scaled_p (TImode,
4462 offset + 32));
4464 if (load_store_pair_p)
4465 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4466 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4467 else
4468 return (offset_9bit_signed_unscaled_p (mode, offset)
4469 || offset_12bit_unsigned_scaled_p (mode, offset));
4472 if (allow_reg_index_p)
4474 /* Look for base + (scaled/extended) index register. */
4475 if (aarch64_base_register_rtx_p (op0, strict_p)
4476 && aarch64_classify_index (info, op1, mode, strict_p))
4478 info->base = op0;
4479 return true;
4481 if (aarch64_base_register_rtx_p (op1, strict_p)
4482 && aarch64_classify_index (info, op0, mode, strict_p))
4484 info->base = op1;
4485 return true;
4489 return false;
4491 case POST_INC:
4492 case POST_DEC:
4493 case PRE_INC:
4494 case PRE_DEC:
4495 info->type = ADDRESS_REG_WB;
4496 info->base = XEXP (x, 0);
4497 info->offset = NULL_RTX;
4498 return aarch64_base_register_rtx_p (info->base, strict_p);
4500 case POST_MODIFY:
4501 case PRE_MODIFY:
4502 info->type = ADDRESS_REG_WB;
4503 info->base = XEXP (x, 0);
4504 if (GET_CODE (XEXP (x, 1)) == PLUS
4505 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4506 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4507 && aarch64_base_register_rtx_p (info->base, strict_p))
4509 HOST_WIDE_INT offset;
4510 info->offset = XEXP (XEXP (x, 1), 1);
4511 offset = INTVAL (info->offset);
4513 /* TImode and TFmode values are allowed in both pairs of X
4514 registers and individual Q registers. The available
4515 address modes are:
4516 X,X: 7-bit signed scaled offset
4517 Q: 9-bit signed offset
4518 We conservatively require an offset representable in either mode.
4520 if (mode == TImode || mode == TFmode)
4521 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4522 && offset_9bit_signed_unscaled_p (mode, offset));
4524 if (load_store_pair_p)
4525 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4526 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4527 else
4528 return offset_9bit_signed_unscaled_p (mode, offset);
4530 return false;
4532 case CONST:
4533 case SYMBOL_REF:
4534 case LABEL_REF:
4535 /* load literal: pc-relative constant pool entry. Only supported
4536 for SI mode or larger. */
4537 info->type = ADDRESS_SYMBOLIC;
4539 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4541 rtx sym, addend;
4543 split_const (x, &sym, &addend);
4544 return ((GET_CODE (sym) == LABEL_REF
4545 || (GET_CODE (sym) == SYMBOL_REF
4546 && CONSTANT_POOL_ADDRESS_P (sym)
4547 && aarch64_pcrelative_literal_loads)));
4549 return false;
4551 case LO_SUM:
4552 info->type = ADDRESS_LO_SUM;
4553 info->base = XEXP (x, 0);
4554 info->offset = XEXP (x, 1);
4555 if (allow_reg_index_p
4556 && aarch64_base_register_rtx_p (info->base, strict_p))
4558 rtx sym, offs;
4559 split_const (info->offset, &sym, &offs);
4560 if (GET_CODE (sym) == SYMBOL_REF
4561 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4563 /* The symbol and offset must be aligned to the access size. */
4564 unsigned int align;
4565 unsigned int ref_size;
4567 if (CONSTANT_POOL_ADDRESS_P (sym))
4568 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4569 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4571 tree exp = SYMBOL_REF_DECL (sym);
4572 align = TYPE_ALIGN (TREE_TYPE (exp));
4573 align = CONSTANT_ALIGNMENT (exp, align);
4575 else if (SYMBOL_REF_DECL (sym))
4576 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4577 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4578 && SYMBOL_REF_BLOCK (sym) != NULL)
4579 align = SYMBOL_REF_BLOCK (sym)->alignment;
4580 else
4581 align = BITS_PER_UNIT;
4583 ref_size = GET_MODE_SIZE (mode);
4584 if (ref_size == 0)
4585 ref_size = GET_MODE_SIZE (DImode);
4587 return ((INTVAL (offs) & (ref_size - 1)) == 0
4588 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4591 return false;
4593 default:
4594 return false;
4598 bool
4599 aarch64_symbolic_address_p (rtx x)
4601 rtx offset;
4603 split_const (x, &x, &offset);
4604 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4607 /* Classify the base of symbolic expression X. */
4609 enum aarch64_symbol_type
4610 aarch64_classify_symbolic_expression (rtx x)
4612 rtx offset;
4614 split_const (x, &x, &offset);
4615 return aarch64_classify_symbol (x, offset);
4619 /* Return TRUE if X is a legitimate address for accessing memory in
4620 mode MODE. */
4621 static bool
4622 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4624 struct aarch64_address_info addr;
4626 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4629 /* Return TRUE if X is a legitimate address for accessing memory in
4630 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4631 pair operation. */
4632 bool
4633 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4634 RTX_CODE outer_code, bool strict_p)
4636 struct aarch64_address_info addr;
4638 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4641 /* Split an out-of-range address displacement into a base and offset.
4642 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4643 to increase opportunities for sharing the base address of different sizes.
4644 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4645 static bool
4646 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4648 HOST_WIDE_INT offset = INTVAL (*disp);
4649 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4651 if (mode == TImode || mode == TFmode
4652 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4653 base = (offset + 0x100) & ~0x1ff;
4655 *off = GEN_INT (base);
4656 *disp = GEN_INT (offset - base);
4657 return true;
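A worked example (illustrative): for a 4-byte access at displacement 0x12344 the anchor is offset & ~0x3ffc, so *OFF becomes 0x10000 and *DISP becomes 0x2344, which is aligned and fits the scaled 12-bit form; for a misaligned displacement such as 0x12345 the 9-bit branch is used instead, giving *OFF = 0x12400 and *DISP = -0xbb, inside the signed 9-bit range.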
4660 /* Return TRUE if rtx X is immediate constant 0.0 */
4661 bool
4662 aarch64_float_const_zero_rtx_p (rtx x)
4664 if (GET_MODE (x) == VOIDmode)
4665 return false;
4667 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4668 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4669 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4672 /* Return the fixed registers used for condition codes. */
4674 static bool
4675 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4677 *p1 = CC_REGNUM;
4678 *p2 = INVALID_REGNUM;
4679 return true;
4682 /* Emit call insn with PAT and do aarch64-specific handling. */
4684 void
4685 aarch64_emit_call_insn (rtx pat)
4687 rtx insn = emit_call_insn (pat);
4689 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4690 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4691 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4694 machine_mode
4695 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4697 /* All floating point compares return CCFP if it is an equality
4698 comparison, and CCFPE otherwise. */
4699 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4701 switch (code)
4703 case EQ:
4704 case NE:
4705 case UNORDERED:
4706 case ORDERED:
4707 case UNLT:
4708 case UNLE:
4709 case UNGT:
4710 case UNGE:
4711 case UNEQ:
4712 case LTGT:
4713 return CCFPmode;
4715 case LT:
4716 case LE:
4717 case GT:
4718 case GE:
4719 return CCFPEmode;
4721 default:
4722 gcc_unreachable ();
4726 /* Equality comparisons of short modes against zero can be performed
4727 using the TST instruction with the appropriate bitmask. */
4728 if (y == const0_rtx && REG_P (x)
4729 && (code == EQ || code == NE)
4730 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4731 return CC_NZmode;
4733 /* Similarly, comparisons of zero_extends from shorter modes can
4734 be performed using an ANDS with an immediate mask. */
4735 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4736 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4737 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4738 && (code == EQ || code == NE))
4739 return CC_NZmode;
4741 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4742 && y == const0_rtx
4743 && (code == EQ || code == NE || code == LT || code == GE)
4744 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4745 || GET_CODE (x) == NEG
4746 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4747 && CONST_INT_P (XEXP (x, 2)))))
4748 return CC_NZmode;
4750 /* A compare with a shifted operand. Because of canonicalization,
4751 the comparison will have to be swapped when we emit the assembly
4752 code. */
4753 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4754 && (REG_P (y) || GET_CODE (y) == SUBREG)
4755 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4756 || GET_CODE (x) == LSHIFTRT
4757 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4758 return CC_SWPmode;
4760 /* Similarly for a negated operand, but we can only do this for
4761 equalities. */
4762 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4763 && (REG_P (y) || GET_CODE (y) == SUBREG)
4764 && (code == EQ || code == NE)
4765 && GET_CODE (x) == NEG)
4766 return CC_Zmode;
4768 /* A test for unsigned overflow. */
4769 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4770 && code == NE
4771 && GET_CODE (x) == PLUS
4772 && GET_CODE (y) == ZERO_EXTEND)
4773 return CC_Cmode;
4775 /* For everything else, return CCmode. */
4776 return CCmode;
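For example (an aside): comparing a QImode register against zero for equality selects CC_NZmode via the first special case above, and the comparison can then be implemented as a TST with the 0xff bitmask rather than a full-width compare, as the earlier comment notes.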
4779 static int
4780 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4783 aarch64_get_condition_code (rtx x)
4785 machine_mode mode = GET_MODE (XEXP (x, 0));
4786 enum rtx_code comp_code = GET_CODE (x);
4788 if (GET_MODE_CLASS (mode) != MODE_CC)
4789 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4790 return aarch64_get_condition_code_1 (mode, comp_code);
4793 static int
4794 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4796 switch (mode)
4798 case CCFPmode:
4799 case CCFPEmode:
4800 switch (comp_code)
4802 case GE: return AARCH64_GE;
4803 case GT: return AARCH64_GT;
4804 case LE: return AARCH64_LS;
4805 case LT: return AARCH64_MI;
4806 case NE: return AARCH64_NE;
4807 case EQ: return AARCH64_EQ;
4808 case ORDERED: return AARCH64_VC;
4809 case UNORDERED: return AARCH64_VS;
4810 case UNLT: return AARCH64_LT;
4811 case UNLE: return AARCH64_LE;
4812 case UNGT: return AARCH64_HI;
4813 case UNGE: return AARCH64_PL;
4814 default: return -1;
4816 break;
4818 case CCmode:
4819 switch (comp_code)
4821 case NE: return AARCH64_NE;
4822 case EQ: return AARCH64_EQ;
4823 case GE: return AARCH64_GE;
4824 case GT: return AARCH64_GT;
4825 case LE: return AARCH64_LE;
4826 case LT: return AARCH64_LT;
4827 case GEU: return AARCH64_CS;
4828 case GTU: return AARCH64_HI;
4829 case LEU: return AARCH64_LS;
4830 case LTU: return AARCH64_CC;
4831 default: return -1;
4833 break;
4835 case CC_SWPmode:
4836 switch (comp_code)
4838 case NE: return AARCH64_NE;
4839 case EQ: return AARCH64_EQ;
4840 case GE: return AARCH64_LE;
4841 case GT: return AARCH64_LT;
4842 case LE: return AARCH64_GE;
4843 case LT: return AARCH64_GT;
4844 case GEU: return AARCH64_LS;
4845 case GTU: return AARCH64_CC;
4846 case LEU: return AARCH64_CS;
4847 case LTU: return AARCH64_HI;
4848 default: return -1;
4850 break;
4852 case CC_NZmode:
4853 switch (comp_code)
4855 case NE: return AARCH64_NE;
4856 case EQ: return AARCH64_EQ;
4857 case GE: return AARCH64_PL;
4858 case LT: return AARCH64_MI;
4859 default: return -1;
4861 break;
4863 case CC_Zmode:
4864 switch (comp_code)
4866 case NE: return AARCH64_NE;
4867 case EQ: return AARCH64_EQ;
4868 default: return -1;
4870 break;
4872 case CC_Cmode:
4873 switch (comp_code)
4875 case NE: return AARCH64_CS;
4876 case EQ: return AARCH64_CC;
4877 default: return -1;
4879 break;
4881 default:
4882 return -1;
4885 return -1;
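A note on CC_SWPmode (illustrative): its rows are the CCmode rows with the operand order reversed. When the first RTL operand is a shifted register, aarch64_select_cc_mode returns CC_SWPmode because the emitted compare places the plain register first and the shifted operand second, so an RTL GE must be tested as LE on the flags, which is exactly the mapping in the table above.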
4888 bool
4889 aarch64_const_vec_all_same_in_range_p (rtx x,
4890 HOST_WIDE_INT minval,
4891 HOST_WIDE_INT maxval)
4893 HOST_WIDE_INT firstval;
4894 int count, i;
4896 if (GET_CODE (x) != CONST_VECTOR
4897 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4898 return false;
4900 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4901 if (firstval < minval || firstval > maxval)
4902 return false;
4904 count = CONST_VECTOR_NUNITS (x);
4905 for (i = 1; i < count; i++)
4906 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4907 return false;
4909 return true;
4912 bool
4913 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4915 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4919 /* N Z C V. */
4920 #define AARCH64_CC_V 1
4921 #define AARCH64_CC_C (1 << 1)
4922 #define AARCH64_CC_Z (1 << 2)
4923 #define AARCH64_CC_N (1 << 3)
4925 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4926 static const int aarch64_nzcv_codes[] =
4928 0, /* EQ, Z == 1. */
4929 AARCH64_CC_Z, /* NE, Z == 0. */
4930 0, /* CS, C == 1. */
4931 AARCH64_CC_C, /* CC, C == 0. */
4932 0, /* MI, N == 1. */
4933 AARCH64_CC_N, /* PL, N == 0. */
4934 0, /* VS, V == 1. */
4935 AARCH64_CC_V, /* VC, V == 0. */
4936 0, /* HI, C == 1 && Z == 0. */
4937 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4938 AARCH64_CC_V, /* GE, N == V. */
4939 0, /* LT, N != V. */
4940 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4941 0, /* LE, !(Z == 0 && N == V). */
4942 0, /* AL, Any. */
4943 0 /* NV, Any. */
4946 static void
4947 aarch64_print_operand (FILE *f, rtx x, int code)
4949 switch (code)
4951 /* An integer or symbol address without a preceding # sign. */
4952 case 'c':
4953 switch (GET_CODE (x))
4955 case CONST_INT:
4956 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4957 break;
4959 case SYMBOL_REF:
4960 output_addr_const (f, x);
4961 break;
4963 case CONST:
4964 if (GET_CODE (XEXP (x, 0)) == PLUS
4965 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4967 output_addr_const (f, x);
4968 break;
4970 /* Fall through. */
4972 default:
4973 output_operand_lossage ("Unsupported operand for code '%c'", code);
4975 break;
4977 case 'e':
4978 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4980 int n;
4982 if (!CONST_INT_P (x)
4983 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4985 output_operand_lossage ("invalid operand for '%%%c'", code);
4986 return;
4989 switch (n)
4991 case 3:
4992 fputc ('b', f);
4993 break;
4994 case 4:
4995 fputc ('h', f);
4996 break;
4997 case 5:
4998 fputc ('w', f);
4999 break;
5000 default:
5001 output_operand_lossage ("invalid operand for '%%%c'", code);
5002 return;
5005 break;
5007 case 'p':
5009 int n;
5011 /* Print N such that 2^N == X. */
5012 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5014 output_operand_lossage ("invalid operand for '%%%c'", code);
5015 return;
5018 asm_fprintf (f, "%d", n);
5020 break;
5022 case 'P':
5023 /* Print the number of non-zero bits in X (a const_int). */
5024 if (!CONST_INT_P (x))
5026 output_operand_lossage ("invalid operand for '%%%c'", code);
5027 return;
5030 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5031 break;
5033 case 'H':
5034 /* Print the higher numbered register of a pair (TImode) of regs. */
5035 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5037 output_operand_lossage ("invalid operand for '%%%c'", code);
5038 return;
5041 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5042 break;
5044 case 'M':
5045 case 'm':
5047 int cond_code;
5048 /* Print a condition (eq, ne, etc) or its inverse. */
5050 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5051 if (x == const_true_rtx)
5053 if (code == 'M')
5054 fputs ("nv", f);
5055 return;
5058 if (!COMPARISON_P (x))
5060 output_operand_lossage ("invalid operand for '%%%c'", code);
5061 return;
5064 cond_code = aarch64_get_condition_code (x);
5065 gcc_assert (cond_code >= 0);
5066 if (code == 'M')
5067 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5068 fputs (aarch64_condition_codes[cond_code], f);
5070 break;
5072 case 'b':
5073 case 'h':
5074 case 's':
5075 case 'd':
5076 case 'q':
5077 /* Print a scalar FP/SIMD register name. */
5078 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5080 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5081 return;
5083 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5084 break;
5086 case 'S':
5087 case 'T':
5088 case 'U':
5089 case 'V':
5090 /* Print the first FP/SIMD register name in a list. */
5091 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5093 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5094 return;
5096 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5097 break;
5099 case 'R':
5100 /* Print a scalar FP/SIMD register name + 1. */
5101 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5103 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5104 return;
5106 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5107 break;
5109 case 'X':
5110 /* Print bottom 16 bits of integer constant in hex. */
5111 if (!CONST_INT_P (x))
5113 output_operand_lossage ("invalid operand for '%%%c'", code);
5114 return;
5116 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5117 break;
5119 case 'w':
5120 case 'x':
5121 /* Print a general register name or the zero register (32-bit or
5122 64-bit). */
5123 if (x == const0_rtx
5124 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5126 asm_fprintf (f, "%czr", code);
5127 break;
5130 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5132 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5133 break;
5136 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5138 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5139 break;
5142 /* Fall through */
5144 case 0:
5145 /* Print a normal operand, if it's a general register, then we
5146 assume DImode. */
5147 if (x == NULL)
5149 output_operand_lossage ("missing operand");
5150 return;
5153 switch (GET_CODE (x))
5155 case REG:
5156 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5157 break;
5159 case MEM:
5160 output_address (GET_MODE (x), XEXP (x, 0));
5161 break;
5163 case CONST:
5164 case LABEL_REF:
5165 case SYMBOL_REF:
5166 output_addr_const (asm_out_file, x);
5167 break;
5169 case CONST_INT:
5170 asm_fprintf (f, "%wd", INTVAL (x));
5171 break;
5173 case CONST_VECTOR:
5174 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5176 gcc_assert (
5177 aarch64_const_vec_all_same_in_range_p (x,
5178 HOST_WIDE_INT_MIN,
5179 HOST_WIDE_INT_MAX));
5180 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5182 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5184 fputc ('0', f);
5186 else
5187 gcc_unreachable ();
5188 break;
5190 case CONST_DOUBLE:
5191 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5192 be getting CONST_DOUBLEs holding integers. */
5193 gcc_assert (GET_MODE (x) != VOIDmode);
5194 if (aarch64_float_const_zero_rtx_p (x))
5196 fputc ('0', f);
5197 break;
5199 else if (aarch64_float_const_representable_p (x))
5201 #define buf_size 20
5202 char float_buf[buf_size] = {'\0'};
5203 real_to_decimal_for_mode (float_buf,
5204 CONST_DOUBLE_REAL_VALUE (x),
5205 buf_size, buf_size,
5206 1, GET_MODE (x));
5207 asm_fprintf (asm_out_file, "%s", float_buf);
5208 break;
5209 #undef buf_size
5211 output_operand_lossage ("invalid constant");
5212 return;
5213 default:
5214 output_operand_lossage ("invalid operand");
5215 return;
5217 break;
5219 case 'A':
5220 if (GET_CODE (x) == HIGH)
5221 x = XEXP (x, 0);
5223 switch (aarch64_classify_symbolic_expression (x))
5225 case SYMBOL_SMALL_GOT_4G:
5226 asm_fprintf (asm_out_file, ":got:");
5227 break;
5229 case SYMBOL_SMALL_TLSGD:
5230 asm_fprintf (asm_out_file, ":tlsgd:");
5231 break;
5233 case SYMBOL_SMALL_TLSDESC:
5234 asm_fprintf (asm_out_file, ":tlsdesc:");
5235 break;
5237 case SYMBOL_SMALL_TLSIE:
5238 asm_fprintf (asm_out_file, ":gottprel:");
5239 break;
5241 case SYMBOL_TLSLE24:
5242 asm_fprintf (asm_out_file, ":tprel:");
5243 break;
5245 case SYMBOL_TINY_GOT:
5246 gcc_unreachable ();
5247 break;
5249 default:
5250 break;
5252 output_addr_const (asm_out_file, x);
5253 break;
5255 case 'L':
5256 switch (aarch64_classify_symbolic_expression (x))
5258 case SYMBOL_SMALL_GOT_4G:
5259 asm_fprintf (asm_out_file, ":lo12:");
5260 break;
5262 case SYMBOL_SMALL_TLSGD:
5263 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5264 break;
5266 case SYMBOL_SMALL_TLSDESC:
5267 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5268 break;
5270 case SYMBOL_SMALL_TLSIE:
5271 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5272 break;
5274 case SYMBOL_TLSLE12:
5275 asm_fprintf (asm_out_file, ":tprel_lo12:");
5276 break;
5278 case SYMBOL_TLSLE24:
5279 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5280 break;
5282 case SYMBOL_TINY_GOT:
5283 asm_fprintf (asm_out_file, ":got:");
5284 break;
5286 case SYMBOL_TINY_TLSIE:
5287 asm_fprintf (asm_out_file, ":gottprel:");
5288 break;
5290 default:
5291 break;
5293 output_addr_const (asm_out_file, x);
5294 break;
5296 case 'G':
5298 switch (aarch64_classify_symbolic_expression (x))
5300 case SYMBOL_TLSLE24:
5301 asm_fprintf (asm_out_file, ":tprel_hi12:");
5302 break;
5303 default:
5304 break;
5306 output_addr_const (asm_out_file, x);
5307 break;
5309 case 'k':
5311 HOST_WIDE_INT cond_code;
5312 /* Print nzcv. */
5314 if (!CONST_INT_P (x))
5316 output_operand_lossage ("invalid operand for '%%%c'", code);
5317 return;
5320 cond_code = INTVAL (x);
5321 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5322 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5324 break;
5326 default:
5327 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5328 return;
5332 static void
5333 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5335 struct aarch64_address_info addr;
5337 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5338 switch (addr.type)
5340 case ADDRESS_REG_IMM:
5341 if (addr.offset == const0_rtx)
5342 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5343 else
5344 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5345 INTVAL (addr.offset));
5346 return;
5348 case ADDRESS_REG_REG:
5349 if (addr.shift == 0)
5350 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5351 reg_names [REGNO (addr.offset)]);
5352 else
5353 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5354 reg_names [REGNO (addr.offset)], addr.shift);
5355 return;
5357 case ADDRESS_REG_UXTW:
5358 if (addr.shift == 0)
5359 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5360 REGNO (addr.offset) - R0_REGNUM);
5361 else
5362 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5363 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5364 return;
5366 case ADDRESS_REG_SXTW:
5367 if (addr.shift == 0)
5368 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5369 REGNO (addr.offset) - R0_REGNUM);
5370 else
5371 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5372 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5373 return;
5375 case ADDRESS_REG_WB:
5376 switch (GET_CODE (x))
5378 case PRE_INC:
5379 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5380 GET_MODE_SIZE (mode));
5381 return;
5382 case POST_INC:
5383 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5384 GET_MODE_SIZE (mode));
5385 return;
5386 case PRE_DEC:
5387 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5388 GET_MODE_SIZE (mode));
5389 return;
5390 case POST_DEC:
5391 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5392 GET_MODE_SIZE (mode));
5393 return;
5394 case PRE_MODIFY:
5395 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5396 INTVAL (addr.offset));
5397 return;
5398 case POST_MODIFY:
5399 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5400 INTVAL (addr.offset));
5401 return;
5402 default:
5403 break;
5405 break;
5407 case ADDRESS_LO_SUM:
5408 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5409 output_addr_const (f, addr.offset);
5410 asm_fprintf (f, "]");
5411 return;
5413 case ADDRESS_SYMBOLIC:
5414 break;
5417 output_addr_const (f, x);
5420 bool
5421 aarch64_label_mentioned_p (rtx x)
5423 const char *fmt;
5424 int i;
5426 if (GET_CODE (x) == LABEL_REF)
5427 return true;
5429 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5430 referencing instruction, but they are constant offsets, not
5431 symbols. */
5432 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5433 return false;
5435 fmt = GET_RTX_FORMAT (GET_CODE (x));
5436 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5438 if (fmt[i] == 'E')
5440 int j;
5442 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5443 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5444 return 1;
5446 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5447 return 1;
5450 return 0;
5453 /* Implement REGNO_REG_CLASS. */
5455 enum reg_class
5456 aarch64_regno_regclass (unsigned regno)
5458 if (GP_REGNUM_P (regno))
5459 return GENERAL_REGS;
5461 if (regno == SP_REGNUM)
5462 return STACK_REG;
5464 if (regno == FRAME_POINTER_REGNUM
5465 || regno == ARG_POINTER_REGNUM)
5466 return POINTER_REGS;
5468 if (FP_REGNUM_P (regno))
5469 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5471 return NO_REGS;
5474 static rtx
5475 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5477 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5478 where mask is selected by alignment and size of the offset.
5479 We try to pick as large a range for the offset as possible to
5480 maximize the chance of a CSE. However, for aligned addresses
5481 we limit the range to 4k so that structures with different sized
5482 elements are likely to use the same base. We need to be careful
5483 not to split a CONST for some forms of address expression, otherwise
5484 it will generate sub-optimal code. */
5486 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5488 rtx base = XEXP (x, 0);
5489 rtx offset_rtx = XEXP (x, 1);
5490 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5492 if (GET_CODE (base) == PLUS)
5494 rtx op0 = XEXP (base, 0);
5495 rtx op1 = XEXP (base, 1);
5497 /* Force any scaling into a temp for CSE. */
5498 op0 = force_reg (Pmode, op0);
5499 op1 = force_reg (Pmode, op1);
5501 /* Let the pointer register be in op0. */
5502 if (REG_POINTER (op1))
5503 std::swap (op0, op1);
5505 /* If the pointer is virtual or frame related, then we know that
5506 virtual register instantiation or register elimination is going
5507 to apply a second constant. We want the two constants folded
5508 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5509 if (virt_or_elim_regno_p (REGNO (op0)))
5511 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5512 NULL_RTX, true, OPTAB_DIRECT);
5513 return gen_rtx_PLUS (Pmode, base, op1);
5516 /* Otherwise, in order to encourage CSE (and thence loop strength
5517 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
5518 base = expand_binop (Pmode, add_optab, op0, op1,
5519 NULL_RTX, true, OPTAB_DIRECT);
5520 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5523 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5524 HOST_WIDE_INT base_offset;
5525 if (GET_MODE_SIZE (mode) > 16)
5526 base_offset = (offset + 0x400) & ~0x7f0;
5527 /* For offsets that aren't a multiple of the access size, the limit is
5528 -256...255. */
5529 else if (offset & (GET_MODE_SIZE (mode) - 1))
5531 base_offset = (offset + 0x100) & ~0x1ff;
5533 /* BLKmode typically uses LDP of X-registers. */
5534 if (mode == BLKmode)
5535 base_offset = (offset + 512) & ~0x3ff;
5537 /* Small negative offsets are supported. */
5538 else if (IN_RANGE (offset, -256, 0))
5539 base_offset = 0;
5540 else if (mode == TImode || mode == TFmode)
5541 base_offset = (offset + 0x100) & ~0x1ff;
5542 /* Use a 12-bit offset, scaled by the access size. */
5543 else
5544 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5546 if (base_offset != 0)
5548 base = plus_constant (Pmode, base, base_offset);
5549 base = force_operand (base, NULL_RTX);
5550 return plus_constant (Pmode, base, offset - base_offset);
5554 return x;
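A worked example (illustrative): for an 8-byte access at X + 0x10008 the final branch masks the offset with ~0xfff * 8, i.e. rounds it down to a multiple of 0x8000, so the address is rewritten as (X + 0x10000) + 8; the anchor X + 0x10000 can then be CSEd with neighbouring accesses while the residual 8 fits the scaled 12-bit immediate form.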
5557 /* Return the reload icode required for a constant pool in mode. */
5558 static enum insn_code
5559 aarch64_constant_pool_reload_icode (machine_mode mode)
5561 switch (mode)
5563 case SFmode:
5564 return CODE_FOR_aarch64_reload_movcpsfdi;
5566 case DFmode:
5567 return CODE_FOR_aarch64_reload_movcpdfdi;
5569 case TFmode:
5570 return CODE_FOR_aarch64_reload_movcptfdi;
5572 case V8QImode:
5573 return CODE_FOR_aarch64_reload_movcpv8qidi;
5575 case V16QImode:
5576 return CODE_FOR_aarch64_reload_movcpv16qidi;
5578 case V4HImode:
5579 return CODE_FOR_aarch64_reload_movcpv4hidi;
5581 case V8HImode:
5582 return CODE_FOR_aarch64_reload_movcpv8hidi;
5584 case V2SImode:
5585 return CODE_FOR_aarch64_reload_movcpv2sidi;
5587 case V4SImode:
5588 return CODE_FOR_aarch64_reload_movcpv4sidi;
5590 case V2DImode:
5591 return CODE_FOR_aarch64_reload_movcpv2didi;
5593 case V2DFmode:
5594 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5596 default:
5597 gcc_unreachable ();
5600 gcc_unreachable ();
5602 static reg_class_t
5603 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5604 reg_class_t rclass,
5605 machine_mode mode,
5606 secondary_reload_info *sri)
5609 /* If we have to disable direct literal pool loads and stores because the
5610 function is too big, then we need a scratch register. */
5611 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5612 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5613 || targetm.vector_mode_supported_p (GET_MODE (x)))
5614 && !aarch64_pcrelative_literal_loads)
5616 sri->icode = aarch64_constant_pool_reload_icode (mode);
5617 return NO_REGS;
5620 /* Without the TARGET_SIMD instructions we cannot move a Q register
5621 to a Q register directly. We need a scratch. */
5622 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5623 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5624 && reg_class_subset_p (rclass, FP_REGS))
5626 if (mode == TFmode)
5627 sri->icode = CODE_FOR_aarch64_reload_movtf;
5628 else if (mode == TImode)
5629 sri->icode = CODE_FOR_aarch64_reload_movti;
5630 return NO_REGS;
5633 /* A TFmode or TImode memory access should be handled via an FP_REG
5634 because AArch64 has richer addressing modes for LDR/STR instructions
5635 than LDP/STP instructions. */
5636 if (TARGET_FLOAT && rclass == GENERAL_REGS
5637 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5638 return FP_REGS;
5640 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5641 return GENERAL_REGS;
5643 return NO_REGS;
5646 static bool
5647 aarch64_can_eliminate (const int from, const int to)
5649 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5650 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5652 if (frame_pointer_needed)
5654 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5655 return true;
5656 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5657 return false;
5658 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5659 && !cfun->calls_alloca)
5660 return true;
5661 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5662 return true;
5664 return false;
5666 else
5668 /* If we decided that we didn't need a leaf frame pointer but then used
5669 LR in the function, then we'll want a frame pointer after all, so
5670 prevent this elimination to ensure a frame pointer is used. */
5671 if (to == STACK_POINTER_REGNUM
5672 && flag_omit_leaf_frame_pointer
5673 && df_regs_ever_live_p (LR_REGNUM))
5674 return false;
5677 return true;
5680 HOST_WIDE_INT
5681 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5683 aarch64_layout_frame ();
5685 if (to == HARD_FRAME_POINTER_REGNUM)
5687 if (from == ARG_POINTER_REGNUM)
5688 return cfun->machine->frame.hard_fp_offset;
5690 if (from == FRAME_POINTER_REGNUM)
5691 return cfun->machine->frame.hard_fp_offset
5692 - cfun->machine->frame.locals_offset;
5695 if (to == STACK_POINTER_REGNUM)
5697 if (from == FRAME_POINTER_REGNUM)
5698 return cfun->machine->frame.frame_size
5699 - cfun->machine->frame.locals_offset;
5702 return cfun->machine->frame.frame_size;
5705 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5706 previous frame. */
5709 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5711 if (count != 0)
5712 return const0_rtx;
5713 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5717 static void
5718 aarch64_asm_trampoline_template (FILE *f)
5720 if (TARGET_ILP32)
5722 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5723 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5725 else
5727 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5728 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5730 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5731 assemble_aligned_integer (4, const0_rtx);
5732 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5733 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5736 static void
5737 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5739 rtx fnaddr, mem, a_tramp;
5740 const int tramp_code_sz = 16;
5742 /* Don't need to copy the trailing D-words, we fill those in below. */
5743 emit_block_move (m_tramp, assemble_trampoline_template (),
5744 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5745 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5746 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5747 if (GET_MODE (fnaddr) != ptr_mode)
5748 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5749 emit_move_insn (mem, fnaddr);
5751 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5752 emit_move_insn (mem, chain_value);
5754 /* XXX We should really define a "clear_cache" pattern and use
5755 gen_clear_cache(). */
5756 a_tramp = XEXP (m_tramp, 0);
5757 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5758 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5759 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5760 ptr_mode);
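The classic source-level trigger for this machinery (a hedged illustration, not from the source) is taking the address of a GNU C nested function that uses locals of its parent; GCC then typically materialises the code stub described by the template above on the stack, and this hook fills in the target address and static chain slots.

static int apply (int (*fn) (int), int x) { return fn (x); }

int
outer (int bias)
{
  /* Nested function: needs the static chain to reach BIAS.  */
  int add_bias (int v) { return v + bias; }
  /* Passing &add_bias forces a trampoline to be built at run time.  */
  return apply (add_bias, 42);
}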
5763 static unsigned char
5764 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5766 switch (regclass)
5768 case CALLER_SAVE_REGS:
5769 case POINTER_REGS:
5770 case GENERAL_REGS:
5771 case ALL_REGS:
5772 case FP_REGS:
5773 case FP_LO_REGS:
5774 return
5775 aarch64_vector_mode_p (mode)
5776 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5777 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5778 case STACK_REG:
5779 return 1;
5781 case NO_REGS:
5782 return 0;
5784 default:
5785 break;
5787 gcc_unreachable ();
5790 static reg_class_t
5791 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5793 if (regclass == POINTER_REGS)
5794 return GENERAL_REGS;
5796 if (regclass == STACK_REG)
5798 if (REG_P(x)
5799 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5800 return regclass;
5802 return NO_REGS;
5805 /* If it's an integer immediate that MOVI can't handle, then
5806 FP_REGS is not an option, so we return NO_REGS instead. */
5807 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5808 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5809 return NO_REGS;
5811 /* Register elimination can result in a request for
5812 SP+constant->FP_REGS. We cannot support such operations, which
5813 use SP as the source and an FP_REG as the destination, so reject
5814 them outright for now. */
5815 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5817 rtx lhs = XEXP (x, 0);
5819 /* Look through a possible SUBREG introduced by ILP32. */
5820 if (GET_CODE (lhs) == SUBREG)
5821 lhs = SUBREG_REG (lhs);
5823 gcc_assert (REG_P (lhs));
5824 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5825 POINTER_REGS));
5826 return NO_REGS;
5829 return regclass;
5832 void
5833 aarch64_asm_output_labelref (FILE* f, const char *name)
5835 asm_fprintf (f, "%U%s", name);
5838 static void
5839 aarch64_elf_asm_constructor (rtx symbol, int priority)
5841 if (priority == DEFAULT_INIT_PRIORITY)
5842 default_ctor_section_asm_out_constructor (symbol, priority);
5843 else
5845 section *s;
5846 /* While priority is known to be in the range [0, 65535], and so 18 bytes
5847 would be enough, the compiler might not know that. To avoid a
5848 -Wformat-truncation false positive, use a larger size. */
5849 char buf[23];
5850 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5851 s = get_section (buf, SECTION_WRITE, NULL);
5852 switch_to_section (s);
5853 assemble_align (POINTER_SIZE);
5854 assemble_aligned_integer (POINTER_BYTES, symbol);
5858 static void
5859 aarch64_elf_asm_destructor (rtx symbol, int priority)
5861 if (priority == DEFAULT_INIT_PRIORITY)
5862 default_dtor_section_asm_out_destructor (symbol, priority);
5863 else
5865 section *s;
5866 /* While priority is known to be in the range [0, 65535], and so 18 bytes
5867 would be enough, the compiler might not know that. To avoid a
5868 -Wformat-truncation false positive, use a larger size. */
5869 char buf[23];
5870 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5871 s = get_section (buf, SECTION_WRITE, NULL);
5872 switch_to_section (s);
5873 assemble_align (POINTER_SIZE);
5874 assemble_aligned_integer (POINTER_BYTES, symbol);
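/* For illustration only (not part of the compiler): a standalone sketch of
   the prioritised section names the two functions above emit.  The five-digit,
   zero-padded suffix makes lexicographic section-name order coincide with
   numeric priority order.  Compile and run it separately.  */

#include <stdio.h>

int
main (void)
{
  char buf[23];

  /* A constructor with priority 101 is placed in ".init_array.00101".  */
  snprintf (buf, sizeof (buf), ".init_array.%.5u", 101);
  printf ("%s\n", buf);

  /* A destructor with priority 65535 is placed in ".fini_array.65535".  */
  snprintf (buf, sizeof (buf), ".fini_array.%.5u", 65535);
  printf ("%s\n", buf);
  return 0;
}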
5878 const char*
5879 aarch64_output_casesi (rtx *operands)
5881 char buf[100];
5882 char label[100];
5883 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5884 int index;
5885 static const char *const patterns[4][2] =
5888 "ldrb\t%w3, [%0,%w1,uxtw]",
5889 "add\t%3, %4, %w3, sxtb #2"
5892 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5893 "add\t%3, %4, %w3, sxth #2"
5896 "ldr\t%w3, [%0,%w1,uxtw #2]",
5897 "add\t%3, %4, %w3, sxtw #2"
5899 /* We assume that DImode is only generated when not optimizing and
5900 that we don't really need 64-bit address offsets. That would
5901 imply an object file with 8GB of code in a single function! */
5903 "ldr\t%w3, [%0,%w1,uxtw #2]",
5904 "add\t%3, %4, %w3, sxtw #2"
5908 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5910 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5912 gcc_assert (index >= 0 && index <= 3);
5915 /* Need to implement table size reduction by changing the code below. */
5915 output_asm_insn (patterns[index][0], operands);
5916 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5917 snprintf (buf, sizeof (buf),
5918 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5919 output_asm_insn (buf, operands);
5920 output_asm_insn (patterns[index][1], operands);
5921 output_asm_insn ("br\t%3", operands);
5922 assemble_label (asm_out_file, label);
5923 return "";
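/* For illustration only: with a word-sized (SImode) dispatch table, the
   templates above combine into a sequence along the lines of

	ldr	w3, [x0, w1, uxtw #2]	// load the table entry
	adr	x4, .Lrtx42		// address of the table
	add	x3, x4, w3, sxtw #2	// entry is a scaled offset from the table base
	br	x3			// jump to the case label
   .Lrtx42:
	...				// ADDR_DIFF_VEC entries follow

   where the register numbers and the .Lrtx42 label are arbitrary examples
   substituted for operands 0, 1, 3 and 4.  */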
5927 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5928 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5929 operator. */
5932 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5934 if (shift >= 0 && shift <= 3)
5936 int size;
5937 for (size = 8; size <= 32; size *= 2)
5939 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5940 if (mask == bits << shift)
5941 return size;
5944 return 0;
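/* For illustration only (not used by the compiler): a standalone restatement
   of the check above with plain 64-bit integers.  A mask is accepted when it
   is a contiguous 8-, 16- or 32-bit field placed at a shift of 0..3; anything
   else yields 0.  Compile and run it separately.  */

#include <stdio.h>

static int
ex_uxt_size (int shift, unsigned long long mask)
{
  if (shift >= 0 && shift <= 3)
    for (int size = 8; size <= 32; size *= 2)
      {
	unsigned long long bits = (1ULL << size) - 1;
	if (mask == (bits << shift))
	  return size;
      }
  return 0;
}

int
main (void)
{
  printf ("%d\n", ex_uxt_size (2, 0x3fcULL));		/* 8  -> UXTB  */
  printf ("%d\n", ex_uxt_size (0, 0xffffULL));		/* 16 -> UXTH  */
  printf ("%d\n", ex_uxt_size (1, 0x1fffffffeULL));	/* 32 -> UXTW  */
  printf ("%d\n", ex_uxt_size (2, 0xff0ULL));		/* 0  -> no match  */
  return 0;
}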
5947 /* Constant pools are per-function only when PC-relative
5948 literal loads are enabled or we are in the large memory
5949 model. */
5951 static inline bool
5952 aarch64_can_use_per_function_literal_pools_p (void)
5954 return (aarch64_pcrelative_literal_loads
5955 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5958 static bool
5959 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5961 /* FIXME: In an ideal world this would work similarly
5962 to the logic in aarch64_select_rtx_section, but that
5963 breaks bootstrap in gccgo. For now we work around
5964 this by returning false here. */
5965 return false;
5968 /* Select appropriate section for constants depending
5969 on where we place literal pools. */
5971 static section *
5972 aarch64_select_rtx_section (machine_mode mode,
5973 rtx x,
5974 unsigned HOST_WIDE_INT align)
5976 if (aarch64_can_use_per_function_literal_pools_p ())
5977 return function_section (current_function_decl);
5979 return default_elf_select_rtx_section (mode, x, align);
5982 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5983 void
5984 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5985 HOST_WIDE_INT offset)
5987 /* When using per-function literal pools, we must ensure that any code
5988 section is aligned to the minimal instruction length; otherwise the
5989 assembler reports "unaligned instructions" errors. */
5990 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5991 ASM_OUTPUT_ALIGN (f, 2);
5994 /* Costs. */
5996 /* Helper function for rtx cost calculation. Strip a shift expression
5997 from X. Returns the inner operand if successful, or the original
5998 expression on failure. */
5999 static rtx
6000 aarch64_strip_shift (rtx x)
6002 rtx op = x;
6004 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6005 we can convert both to ROR during final output. */
6006 if ((GET_CODE (op) == ASHIFT
6007 || GET_CODE (op) == ASHIFTRT
6008 || GET_CODE (op) == LSHIFTRT
6009 || GET_CODE (op) == ROTATERT
6010 || GET_CODE (op) == ROTATE)
6011 && CONST_INT_P (XEXP (op, 1)))
6012 return XEXP (op, 0);
6014 if (GET_CODE (op) == MULT
6015 && CONST_INT_P (XEXP (op, 1))
6016 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6017 return XEXP (op, 0);
6019 return x;
6022 /* Helper function for rtx cost calculation. Strip an extend
6023 expression from X. Returns the inner operand if successful, or the
6024 original expression on failure. We deal with a number of possible
6025 canonicalization variations here. */
6026 static rtx
6027 aarch64_strip_extend (rtx x)
6029 rtx op = x;
6031 /* Zero and sign extraction of a widened value. */
6032 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6033 && XEXP (op, 2) == const0_rtx
6034 && GET_CODE (XEXP (op, 0)) == MULT
6035 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6036 XEXP (op, 1)))
6037 return XEXP (XEXP (op, 0), 0);
6039 /* It can also be represented (for zero-extend) as an AND with an
6040 immediate. */
6041 if (GET_CODE (op) == AND
6042 && GET_CODE (XEXP (op, 0)) == MULT
6043 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6044 && CONST_INT_P (XEXP (op, 1))
6045 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6046 INTVAL (XEXP (op, 1))) != 0)
6047 return XEXP (XEXP (op, 0), 0);
6049 /* Now handle extended register, as this may also have an optional
6050 left shift by 1..4. */
6051 if (GET_CODE (op) == ASHIFT
6052 && CONST_INT_P (XEXP (op, 1))
6053 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6054 op = XEXP (op, 0);
6056 if (GET_CODE (op) == ZERO_EXTEND
6057 || GET_CODE (op) == SIGN_EXTEND)
6058 op = XEXP (op, 0);
6060 if (op != x)
6061 return op;
6063 return x;
6066 /* Return true iff CODE is a shift supported in combination
6067 with arithmetic instructions. */
6069 static bool
6070 aarch64_shift_p (enum rtx_code code)
6072 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6075 /* Helper function for rtx cost calculation. Calculate the cost of
6076 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6077 Return the calculated cost of the expression, recursing manually into
6078 operands where needed. */
6080 static int
6081 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6083 rtx op0, op1;
6084 const struct cpu_cost_table *extra_cost
6085 = aarch64_tune_params.insn_extra_cost;
6086 int cost = 0;
6087 bool compound_p = (outer == PLUS || outer == MINUS);
6088 machine_mode mode = GET_MODE (x);
6090 gcc_checking_assert (code == MULT);
6092 op0 = XEXP (x, 0);
6093 op1 = XEXP (x, 1);
6095 if (VECTOR_MODE_P (mode))
6096 mode = GET_MODE_INNER (mode);
6098 /* Integer multiply/fma. */
6099 if (GET_MODE_CLASS (mode) == MODE_INT)
6101 /* The multiply will be canonicalized as a shift, so cost it as such. */
6102 if (aarch64_shift_p (GET_CODE (x))
6103 || (CONST_INT_P (op1)
6104 && exact_log2 (INTVAL (op1)) > 0))
6106 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6107 || GET_CODE (op0) == SIGN_EXTEND;
6108 if (speed)
6110 if (compound_p)
6112 if (REG_P (op1))
6113 /* ARITH + shift-by-register. */
6114 cost += extra_cost->alu.arith_shift_reg;
6115 else if (is_extend)
6116 /* ARITH + extended register. We don't have a cost field
6117 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6118 cost += extra_cost->alu.extend_arith;
6119 else
6120 /* ARITH + shift-by-immediate. */
6121 cost += extra_cost->alu.arith_shift;
6123 else
6124 /* LSL (immediate). */
6125 cost += extra_cost->alu.shift;
6128 /* Strip extends as we will have costed them in the case above. */
6129 if (is_extend)
6130 op0 = aarch64_strip_extend (op0);
6132 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6134 return cost;
6137 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6138 compound and let the below cases handle it. After all, MNEG is a
6139 special-case alias of MSUB. */
6140 if (GET_CODE (op0) == NEG)
6142 op0 = XEXP (op0, 0);
6143 compound_p = true;
6146 /* Integer multiplies or FMAs have zero/sign extending variants. */
6147 if ((GET_CODE (op0) == ZERO_EXTEND
6148 && GET_CODE (op1) == ZERO_EXTEND)
6149 || (GET_CODE (op0) == SIGN_EXTEND
6150 && GET_CODE (op1) == SIGN_EXTEND))
6152 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6153 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6155 if (speed)
6157 if (compound_p)
6158 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6159 cost += extra_cost->mult[0].extend_add;
6160 else
6161 /* MUL/SMULL/UMULL. */
6162 cost += extra_cost->mult[0].extend;
6165 return cost;
6168 /* This is either an integer multiply or a MADD. In both cases
6169 we want to recurse and cost the operands. */
6170 cost += rtx_cost (op0, mode, MULT, 0, speed);
6171 cost += rtx_cost (op1, mode, MULT, 1, speed);
6173 if (speed)
6175 if (compound_p)
6176 /* MADD/MSUB. */
6177 cost += extra_cost->mult[mode == DImode].add;
6178 else
6179 /* MUL. */
6180 cost += extra_cost->mult[mode == DImode].simple;
6183 return cost;
6185 else
6187 if (speed)
6189 /* Floating-point FMA/FMUL can also support negations of the
6190 operands, unless the rounding mode is upward or downward, in
6191 which case FNMUL differs from FMUL with operand negation. */
6192 bool neg0 = GET_CODE (op0) == NEG;
6193 bool neg1 = GET_CODE (op1) == NEG;
6194 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6196 if (neg0)
6197 op0 = XEXP (op0, 0);
6198 if (neg1)
6199 op1 = XEXP (op1, 0);
6202 if (compound_p)
6203 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6204 cost += extra_cost->fp[mode == DFmode].fma;
6205 else
6206 /* FMUL/FNMUL. */
6207 cost += extra_cost->fp[mode == DFmode].mult;
6210 cost += rtx_cost (op0, mode, MULT, 0, speed);
6211 cost += rtx_cost (op1, mode, MULT, 1, speed);
6212 return cost;
6216 static int
6217 aarch64_address_cost (rtx x,
6218 machine_mode mode,
6219 addr_space_t as ATTRIBUTE_UNUSED,
6220 bool speed)
6222 enum rtx_code c = GET_CODE (x);
6223 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6224 struct aarch64_address_info info;
6225 int cost = 0;
6226 info.shift = 0;
6228 if (!aarch64_classify_address (&info, x, mode, c, false))
6230 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6232 /* This is a CONST or SYMBOL ref which will be split
6233 in a different way depending on the code model in use.
6234 Cost it through the generic infrastructure. */
6235 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6236 /* Divide through by the cost of one instruction to
6237 bring it to the same units as the address costs. */
6238 cost_symbol_ref /= COSTS_N_INSNS (1);
6239 /* The cost is then the cost of preparing the address,
6240 followed by an immediate (possibly 0) offset. */
6241 return cost_symbol_ref + addr_cost->imm_offset;
6243 else
6245 /* This is most likely a jump table from a case
6246 statement. */
6247 return addr_cost->register_offset;
6251 switch (info.type)
6253 case ADDRESS_LO_SUM:
6254 case ADDRESS_SYMBOLIC:
6255 case ADDRESS_REG_IMM:
6256 cost += addr_cost->imm_offset;
6257 break;
6259 case ADDRESS_REG_WB:
6260 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6261 cost += addr_cost->pre_modify;
6262 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6263 cost += addr_cost->post_modify;
6264 else
6265 gcc_unreachable ();
6267 break;
6269 case ADDRESS_REG_REG:
6270 cost += addr_cost->register_offset;
6271 break;
6273 case ADDRESS_REG_SXTW:
6274 cost += addr_cost->register_sextend;
6275 break;
6277 case ADDRESS_REG_UXTW:
6278 cost += addr_cost->register_zextend;
6279 break;
6281 default:
6282 gcc_unreachable ();
6286 if (info.shift > 0)
6288 /* For the sake of calculating the cost of the shifted register
6289 component, we can treat same sized modes in the same way. */
6290 switch (GET_MODE_BITSIZE (mode))
6292 case 16:
6293 cost += addr_cost->addr_scale_costs.hi;
6294 break;
6296 case 32:
6297 cost += addr_cost->addr_scale_costs.si;
6298 break;
6300 case 64:
6301 cost += addr_cost->addr_scale_costs.di;
6302 break;
6304 /* We can't tell, or this is a 128-bit vector. */
6305 default:
6306 cost += addr_cost->addr_scale_costs.ti;
6307 break;
6311 return cost;
6314 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6315 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6316 to be taken. */
6319 aarch64_branch_cost (bool speed_p, bool predictable_p)
6321 /* When optimizing for speed, use the cost of unpredictable branches. */
6322 const struct cpu_branch_cost *branch_costs =
6323 aarch64_tune_params.branch_costs;
6325 if (!speed_p || predictable_p)
6326 return branch_costs->predictable;
6327 else
6328 return branch_costs->unpredictable;
6331 /* Return true if the RTX X in mode MODE is a zero or sign extract
6332 usable in an ADD or SUB (extended register) instruction. */
6333 static bool
6334 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6336 /* Catch add with a sign extract.
6337 This is add_<optab><mode>_multp2. */
6338 if (GET_CODE (x) == SIGN_EXTRACT
6339 || GET_CODE (x) == ZERO_EXTRACT)
6341 rtx op0 = XEXP (x, 0);
6342 rtx op1 = XEXP (x, 1);
6343 rtx op2 = XEXP (x, 2);
6345 if (GET_CODE (op0) == MULT
6346 && CONST_INT_P (op1)
6347 && op2 == const0_rtx
6348 && CONST_INT_P (XEXP (op0, 1))
6349 && aarch64_is_extend_from_extract (mode,
6350 XEXP (op0, 1),
6351 op1))
6353 return true;
6356 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6357 No shift. */
6358 else if (GET_CODE (x) == SIGN_EXTEND
6359 || GET_CODE (x) == ZERO_EXTEND)
6360 return REG_P (XEXP (x, 0));
6362 return false;
6365 static bool
6366 aarch64_frint_unspec_p (unsigned int u)
6368 switch (u)
6370 case UNSPEC_FRINTZ:
6371 case UNSPEC_FRINTP:
6372 case UNSPEC_FRINTM:
6373 case UNSPEC_FRINTA:
6374 case UNSPEC_FRINTN:
6375 case UNSPEC_FRINTX:
6376 case UNSPEC_FRINTI:
6377 return true;
6379 default:
6380 return false;
6384 /* Return true iff X is an rtx that will match an extr instruction
6385 i.e. as described in the *extr<mode>5_insn family of patterns.
6386 OP0 and OP1 will be set to the operands of the shifts involved
6387 on success and will be NULL_RTX otherwise. */
6389 static bool
6390 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6392 rtx op0, op1;
6393 machine_mode mode = GET_MODE (x);
6395 *res_op0 = NULL_RTX;
6396 *res_op1 = NULL_RTX;
6398 if (GET_CODE (x) != IOR)
6399 return false;
6401 op0 = XEXP (x, 0);
6402 op1 = XEXP (x, 1);
6404 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6405 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6407 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6408 if (GET_CODE (op1) == ASHIFT)
6409 std::swap (op0, op1);
6411 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6412 return false;
6414 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6415 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6417 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6418 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6420 *res_op0 = XEXP (op0, 0);
6421 *res_op1 = XEXP (op1, 0);
6422 return true;
6426 return false;
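/* For illustration only (not used by the compiler): the source-level shape
   that aarch64_extr_rtx_p recognises is an IOR of two opposite shifts whose
   constant amounts sum to the register width.  A recent GCC at -O2 is
   expected to turn each function below into a single EXTR (or ROR when both
   inputs are the same register); the exact output depends on the compiler
   version.  Compile it separately.  */

#include <stdint.h>

/* Take a 64-bit window from the concatenation HI:LO, starting 24 bits into
   LO: 40 + 24 == 64, so this matches the EXTR shape.  */
uint64_t
ex_extr (uint64_t hi, uint64_t lo)
{
  return (hi << 40) | (lo >> 24);
}

/* The same shape with a single input is a rotate right by 24.  */
uint64_t
ex_ror (uint64_t x)
{
  return (x << 40) | (x >> 24);
}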
6429 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6430 storing it in *COST. Result is true if the total cost of the operation
6431 has now been calculated. */
6432 static bool
6433 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6435 rtx inner;
6436 rtx comparator;
6437 enum rtx_code cmpcode;
6439 if (COMPARISON_P (op0))
6441 inner = XEXP (op0, 0);
6442 comparator = XEXP (op0, 1);
6443 cmpcode = GET_CODE (op0);
6445 else
6447 inner = op0;
6448 comparator = const0_rtx;
6449 cmpcode = NE;
6452 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6454 /* Conditional branch. */
6455 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6456 return true;
6457 else
6459 if (cmpcode == NE || cmpcode == EQ)
6461 if (comparator == const0_rtx)
6463 /* TBZ/TBNZ/CBZ/CBNZ. */
6464 if (GET_CODE (inner) == ZERO_EXTRACT)
6465 /* TBZ/TBNZ. */
6466 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6467 ZERO_EXTRACT, 0, speed);
6468 else
6469 /* CBZ/CBNZ. */
6470 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6472 return true;
6475 else if (cmpcode == LT || cmpcode == GE)
6477 /* TBZ/TBNZ. */
6478 if (comparator == const0_rtx)
6479 return true;
6483 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6485 /* CCMP. */
6486 if (GET_CODE (op1) == COMPARE)
6488 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6489 if (XEXP (op1, 1) == const0_rtx)
6490 *cost += 1;
6491 if (speed)
6493 machine_mode mode = GET_MODE (XEXP (op1, 0));
6494 const struct cpu_cost_table *extra_cost
6495 = aarch64_tune_params.insn_extra_cost;
6497 if (GET_MODE_CLASS (mode) == MODE_INT)
6498 *cost += extra_cost->alu.arith;
6499 else
6500 *cost += extra_cost->fp[mode == DFmode].compare;
6502 return true;
6505 /* It's a conditional operation based on the status flags,
6506 so it must be some flavor of CSEL. */
6508 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6509 if (GET_CODE (op1) == NEG
6510 || GET_CODE (op1) == NOT
6511 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6512 op1 = XEXP (op1, 0);
6513 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6515 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6516 op1 = XEXP (op1, 0);
6517 op2 = XEXP (op2, 0);
6520 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6521 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6522 return true;
6525 /* We don't know what this is, so cost all operands. */
6526 return false;
6529 /* Check whether X is a bitfield operation of the form shift + extend that
6530 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6531 operand to which the bitfield operation is applied. Otherwise return
6532 NULL_RTX. */
6534 static rtx
6535 aarch64_extend_bitfield_pattern_p (rtx x)
6537 rtx_code outer_code = GET_CODE (x);
6538 machine_mode outer_mode = GET_MODE (x);
6540 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6541 && outer_mode != SImode && outer_mode != DImode)
6542 return NULL_RTX;
6544 rtx inner = XEXP (x, 0);
6545 rtx_code inner_code = GET_CODE (inner);
6546 machine_mode inner_mode = GET_MODE (inner);
6547 rtx op = NULL_RTX;
6549 switch (inner_code)
6551 case ASHIFT:
6552 if (CONST_INT_P (XEXP (inner, 1))
6553 && (inner_mode == QImode || inner_mode == HImode))
6554 op = XEXP (inner, 0);
6555 break;
6556 case LSHIFTRT:
6557 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6558 && (inner_mode == QImode || inner_mode == HImode))
6559 op = XEXP (inner, 0);
6560 break;
6561 case ASHIFTRT:
6562 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6563 && (inner_mode == QImode || inner_mode == HImode))
6564 op = XEXP (inner, 0);
6565 break;
6566 default:
6567 break;
6570 return op;
6573 /* Return true if the mask and a shift amount from an RTX of the form
6574 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6575 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6577 bool
6578 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6580 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6581 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6582 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6583 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
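/* For illustration only (not used by the compiler): a standalone restatement
   of the test above with plain 64-bit arithmetic.  MASK must have no bits
   below SHIFT, and the remaining bits must form one contiguous field starting
   at bit SHIFT, i.e. (mask >> shift) + 1 must be a power of two.  Compile and
   run it separately.  */

#include <stdio.h>
#include <stdbool.h>

static bool
ex_ubfiz_mask_and_shift_ok (unsigned long long mask, unsigned shift,
			    unsigned bitsize)
{
  if (shift >= bitsize)
    return false;
  if (mask & ((1ULL << shift) - 1))	/* Bits below SHIFT must be clear.  */
    return false;
  unsigned long long field = mask >> shift;
  return (field & (field + 1)) == 0;	/* FIELD is 2^n - 1.  */
}

int
main (void)
{
  /* (x << 3) & 0x7f8 keeps an 8-bit field at bit 3: a UBFIZ candidate.  */
  printf ("%d\n", ex_ubfiz_mask_and_shift_ok (0x7f8, 3, 32));	/* 1 */
  /* 0x7f0 shifted down by 3 is 0xfe, which is not contiguous from bit 0.  */
  printf ("%d\n", ex_ubfiz_mask_and_shift_ok (0x7f0, 3, 32));	/* 0 */
  return 0;
}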
6586 /* Calculate the cost of calculating X, storing it in *COST. Result
6587 is true if the total cost of the operation has now been calculated. */
6588 static bool
6589 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6590 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6592 rtx op0, op1, op2;
6593 const struct cpu_cost_table *extra_cost
6594 = aarch64_tune_params.insn_extra_cost;
6595 int code = GET_CODE (x);
6597 /* By default, assume that everything has equivalent cost to the
6598 cheapest instruction. Any additional costs are applied as a delta
6599 above this default. */
6600 *cost = COSTS_N_INSNS (1);
6602 switch (code)
6604 case SET:
6605 /* The cost depends entirely on the operands to SET. */
6606 *cost = 0;
6607 op0 = SET_DEST (x);
6608 op1 = SET_SRC (x);
6610 switch (GET_CODE (op0))
6612 case MEM:
6613 if (speed)
6615 rtx address = XEXP (op0, 0);
6616 if (VECTOR_MODE_P (mode))
6617 *cost += extra_cost->ldst.storev;
6618 else if (GET_MODE_CLASS (mode) == MODE_INT)
6619 *cost += extra_cost->ldst.store;
6620 else if (mode == SFmode)
6621 *cost += extra_cost->ldst.storef;
6622 else if (mode == DFmode)
6623 *cost += extra_cost->ldst.stored;
6625 *cost +=
6626 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6627 0, speed));
6630 *cost += rtx_cost (op1, mode, SET, 1, speed);
6631 return true;
6633 case SUBREG:
6634 if (! REG_P (SUBREG_REG (op0)))
6635 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6637 /* Fall through. */
6638 case REG:
6639 /* The cost is one per vector-register copied. */
6640 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6642 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6643 / GET_MODE_SIZE (V4SImode);
6644 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6646 /* const0_rtx is in general free, but we will use an
6647 instruction to set a register to 0. */
6648 else if (REG_P (op1) || op1 == const0_rtx)
6650 /* The cost is 1 per register copied. */
6651 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6652 / UNITS_PER_WORD;
6653 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6655 else
6656 /* Cost is just the cost of the RHS of the set. */
6657 *cost += rtx_cost (op1, mode, SET, 1, speed);
6658 return true;
6660 case ZERO_EXTRACT:
6661 case SIGN_EXTRACT:
6662 /* Bit-field insertion. Strip any redundant widening of
6663 the RHS to meet the width of the target. */
6664 if (GET_CODE (op1) == SUBREG)
6665 op1 = SUBREG_REG (op1);
6666 if ((GET_CODE (op1) == ZERO_EXTEND
6667 || GET_CODE (op1) == SIGN_EXTEND)
6668 && CONST_INT_P (XEXP (op0, 1))
6669 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6670 >= INTVAL (XEXP (op0, 1))))
6671 op1 = XEXP (op1, 0);
6673 if (CONST_INT_P (op1))
6675 /* MOV immediate is assumed to always be cheap. */
6676 *cost = COSTS_N_INSNS (1);
6678 else
6680 /* BFM. */
6681 if (speed)
6682 *cost += extra_cost->alu.bfi;
6683 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6686 return true;
6688 default:
6689 /* We can't make sense of this, assume default cost. */
6690 *cost = COSTS_N_INSNS (1);
6691 return false;
6693 return false;
6695 case CONST_INT:
6696 /* If an instruction can incorporate a constant within the
6697 instruction, the instruction's expression avoids calling
6698 rtx_cost() on the constant. If rtx_cost() is called on a
6699 constant, then it is usually because the constant must be
6700 moved into a register by one or more instructions.
6702 The exception is constant 0, which can be expressed
6703 as XZR/WZR and is therefore free. The caveat is that if
6704 we have (set (reg) (const0_rtx)) we must cost the move.
6705 However, we can catch that when we cost the SET, so
6706 we don't need to consider it here. */
6707 if (x == const0_rtx)
6708 *cost = 0;
6709 else
6711 /* To an approximation, building any other constant is
6712 proportionally expensive to the number of instructions
6713 required to build that constant. This is true whether we
6714 are compiling for SPEED or otherwise. */
6715 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6716 (NULL_RTX, x, false, mode));
6718 return true;
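/* For illustration only: the cost computed above scales with the number of
   MOV/MOVN/MOVK instructions needed to build the constant, for example
   (register choice arbitrary):

     0                    free     xzr / wzr
     0x1234               1 insn   mov  w0, #0x1234
     0x12345678           2 insns  mov  w0, #0x5678
                                   movk w0, #0x1234, lsl #16
     0xffffffffffff1234   1 insn   movn x0, #0xedcb

   Repeating patterns such as 0x5555555555555555 are also one instruction,
   since they are valid logical (bitmask) immediates.  */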
6720 case CONST_DOUBLE:
6721 if (speed)
6723 /* mov[df,sf]_aarch64. */
6724 if (aarch64_float_const_representable_p (x))
6725 /* FMOV (scalar immediate). */
6726 *cost += extra_cost->fp[mode == DFmode].fpconst;
6727 else if (!aarch64_float_const_zero_rtx_p (x))
6729 /* This will be a load from memory. */
6730 if (mode == DFmode)
6731 *cost += extra_cost->ldst.loadd;
6732 else
6733 *cost += extra_cost->ldst.loadf;
6735 else
6736 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6737 or MOV v0.s[0], wzr - neither of which is modeled by the
6738 cost tables. Just use the default cost. */
6743 return true;
6745 case MEM:
6746 if (speed)
6748 /* For loads we want the base cost of a load, plus an
6749 approximation for the additional cost of the addressing
6750 mode. */
6751 rtx address = XEXP (x, 0);
6752 if (VECTOR_MODE_P (mode))
6753 *cost += extra_cost->ldst.loadv;
6754 else if (GET_MODE_CLASS (mode) == MODE_INT)
6755 *cost += extra_cost->ldst.load;
6756 else if (mode == SFmode)
6757 *cost += extra_cost->ldst.loadf;
6758 else if (mode == DFmode)
6759 *cost += extra_cost->ldst.loadd;
6761 *cost +=
6762 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6763 0, speed));
6766 return true;
6768 case NEG:
6769 op0 = XEXP (x, 0);
6771 if (VECTOR_MODE_P (mode))
6773 if (speed)
6775 /* FNEG. */
6776 *cost += extra_cost->vect.alu;
6778 return false;
6781 if (GET_MODE_CLASS (mode) == MODE_INT)
6783 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6784 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6786 /* CSETM. */
6787 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6788 return true;
6791 /* Cost this as SUB wzr, X. */
6792 op0 = CONST0_RTX (mode);
6793 op1 = XEXP (x, 0);
6794 goto cost_minus;
6797 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6799 /* Support (neg(fma...)) as a single instruction only if
6800 sign of zeros is unimportant. This matches the decision
6801 making in aarch64.md. */
6802 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6804 /* FNMADD. */
6805 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6806 return true;
6808 if (GET_CODE (op0) == MULT)
6810 /* FNMUL. */
6811 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6812 return true;
6814 if (speed)
6815 /* FNEG. */
6816 *cost += extra_cost->fp[mode == DFmode].neg;
6817 return false;
6820 return false;
6822 case CLRSB:
6823 case CLZ:
6824 if (speed)
6826 if (VECTOR_MODE_P (mode))
6827 *cost += extra_cost->vect.alu;
6828 else
6829 *cost += extra_cost->alu.clz;
6832 return false;
6834 case COMPARE:
6835 op0 = XEXP (x, 0);
6836 op1 = XEXP (x, 1);
6838 if (op1 == const0_rtx
6839 && GET_CODE (op0) == AND)
6841 x = op0;
6842 mode = GET_MODE (op0);
6843 goto cost_logic;
6846 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6848 /* TODO: A write to the CC flags possibly costs extra, this
6849 needs encoding in the cost tables. */
6851 mode = GET_MODE (op0);
6852 /* ANDS. */
6853 if (GET_CODE (op0) == AND)
6855 x = op0;
6856 goto cost_logic;
6859 if (GET_CODE (op0) == PLUS)
6861 /* ADDS (and CMN alias). */
6862 x = op0;
6863 goto cost_plus;
6866 if (GET_CODE (op0) == MINUS)
6868 /* SUBS. */
6869 x = op0;
6870 goto cost_minus;
6873 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6874 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6875 && CONST_INT_P (XEXP (op0, 2)))
6877 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6878 Handle it here directly rather than going to cost_logic
6879 since we know the immediate generated for the TST is valid,
6880 so we can avoid creating an intermediate rtx for it just
6881 for costing purposes. */
6882 if (speed)
6883 *cost += extra_cost->alu.logical;
6885 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6886 ZERO_EXTRACT, 0, speed);
6887 return true;
6890 if (GET_CODE (op1) == NEG)
6892 /* CMN. */
6893 if (speed)
6894 *cost += extra_cost->alu.arith;
6896 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6897 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6898 return true;
6901 /* CMP.
6903 Compare can freely swap the order of operands, and
6904 canonicalization puts the more complex operation first.
6905 But the integer MINUS logic expects the shift/extend
6906 operation in op1. */
6907 if (! (REG_P (op0)
6908 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6910 op0 = XEXP (x, 1);
6911 op1 = XEXP (x, 0);
6913 goto cost_minus;
6916 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6918 /* FCMP. */
6919 if (speed)
6920 *cost += extra_cost->fp[mode == DFmode].compare;
6922 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6924 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6925 /* FCMP supports constant 0.0 for no extra cost. */
6926 return true;
6928 return false;
6931 if (VECTOR_MODE_P (mode))
6933 /* Vector compare. */
6934 if (speed)
6935 *cost += extra_cost->vect.alu;
6937 if (aarch64_float_const_zero_rtx_p (op1))
6939 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6940 cost. */
6941 return true;
6943 return false;
6945 return false;
6947 case MINUS:
6949 op0 = XEXP (x, 0);
6950 op1 = XEXP (x, 1);
6952 cost_minus:
6953 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6955 /* Detect valid immediates. */
6956 if ((GET_MODE_CLASS (mode) == MODE_INT
6957 || (GET_MODE_CLASS (mode) == MODE_CC
6958 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6959 && CONST_INT_P (op1)
6960 && aarch64_uimm12_shift (INTVAL (op1)))
6962 if (speed)
6963 /* SUB(S) (immediate). */
6964 *cost += extra_cost->alu.arith;
6965 return true;
6968 /* Look for SUB (extended register). */
6969 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6971 if (speed)
6972 *cost += extra_cost->alu.extend_arith;
6974 op1 = aarch64_strip_extend (op1);
6975 *cost += rtx_cost (op1, VOIDmode,
6976 (enum rtx_code) GET_CODE (op1), 0, speed);
6977 return true;
6980 rtx new_op1 = aarch64_strip_extend (op1);
6982 /* Cost this as an FMA-alike operation. */
6983 if ((GET_CODE (new_op1) == MULT
6984 || aarch64_shift_p (GET_CODE (new_op1)))
6985 && code != COMPARE)
6987 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6988 (enum rtx_code) code,
6989 speed);
6990 return true;
6993 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6995 if (speed)
6997 if (VECTOR_MODE_P (mode))
6999 /* Vector SUB. */
7000 *cost += extra_cost->vect.alu;
7002 else if (GET_MODE_CLASS (mode) == MODE_INT)
7004 /* SUB(S). */
7005 *cost += extra_cost->alu.arith;
7007 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7009 /* FSUB. */
7010 *cost += extra_cost->fp[mode == DFmode].addsub;
7013 return true;
7016 case PLUS:
7018 rtx new_op0;
7020 op0 = XEXP (x, 0);
7021 op1 = XEXP (x, 1);
7023 cost_plus:
7024 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7025 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7027 /* CSINC. */
7028 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7029 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7030 return true;
7033 if (GET_MODE_CLASS (mode) == MODE_INT
7034 && CONST_INT_P (op1)
7035 && aarch64_uimm12_shift (INTVAL (op1)))
7037 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7039 if (speed)
7040 /* ADD (immediate). */
7041 *cost += extra_cost->alu.arith;
7042 return true;
7045 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7047 /* Look for ADD (extended register). */
7048 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7050 if (speed)
7051 *cost += extra_cost->alu.extend_arith;
7053 op0 = aarch64_strip_extend (op0);
7054 *cost += rtx_cost (op0, VOIDmode,
7055 (enum rtx_code) GET_CODE (op0), 0, speed);
7056 return true;
7059 /* Strip any extend, leave shifts behind as we will
7060 cost them through mult_cost. */
7061 new_op0 = aarch64_strip_extend (op0);
7063 if (GET_CODE (new_op0) == MULT
7064 || aarch64_shift_p (GET_CODE (new_op0)))
7066 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7067 speed);
7068 return true;
7071 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7073 if (speed)
7075 if (VECTOR_MODE_P (mode))
7077 /* Vector ADD. */
7078 *cost += extra_cost->vect.alu;
7080 else if (GET_MODE_CLASS (mode) == MODE_INT)
7082 /* ADD. */
7083 *cost += extra_cost->alu.arith;
7085 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7087 /* FADD. */
7088 *cost += extra_cost->fp[mode == DFmode].addsub;
7091 return true;
7094 case BSWAP:
7095 *cost = COSTS_N_INSNS (1);
7097 if (speed)
7099 if (VECTOR_MODE_P (mode))
7100 *cost += extra_cost->vect.alu;
7101 else
7102 *cost += extra_cost->alu.rev;
7104 return false;
7106 case IOR:
7107 if (aarch_rev16_p (x))
7109 *cost = COSTS_N_INSNS (1);
7111 if (speed)
7113 if (VECTOR_MODE_P (mode))
7114 *cost += extra_cost->vect.alu;
7115 else
7116 *cost += extra_cost->alu.rev;
7118 return true;
7121 if (aarch64_extr_rtx_p (x, &op0, &op1))
7123 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7124 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7125 if (speed)
7126 *cost += extra_cost->alu.shift;
7128 return true;
7130 /* Fall through. */
7131 case XOR:
7132 case AND:
7133 cost_logic:
7134 op0 = XEXP (x, 0);
7135 op1 = XEXP (x, 1);
7137 if (VECTOR_MODE_P (mode))
7139 if (speed)
7140 *cost += extra_cost->vect.alu;
7141 return true;
7144 if (code == AND
7145 && GET_CODE (op0) == MULT
7146 && CONST_INT_P (XEXP (op0, 1))
7147 && CONST_INT_P (op1)
7148 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7149 INTVAL (op1)) != 0)
7151 /* This is a UBFM/SBFM. */
7152 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7153 if (speed)
7154 *cost += extra_cost->alu.bfx;
7155 return true;
7158 if (GET_MODE_CLASS (mode) == MODE_INT)
7160 if (CONST_INT_P (op1))
7162 /* We have a mask + shift version of a UBFIZ
7163 i.e. the *andim_ashift<mode>_bfiz pattern. */
7164 if (GET_CODE (op0) == ASHIFT
7165 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7166 XEXP (op0, 1)))
7168 *cost += rtx_cost (XEXP (op0, 0), mode,
7169 (enum rtx_code) code, 0, speed);
7170 if (speed)
7171 *cost += extra_cost->alu.bfx;
7173 return true;
7175 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7177 /* We possibly get the immediate for free; this is not
7178 modelled. */
7179 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7180 if (speed)
7181 *cost += extra_cost->alu.logical;
7183 return true;
7186 else
7188 rtx new_op0 = op0;
7190 /* Handle ORN, EON, or BIC. */
7191 if (GET_CODE (op0) == NOT)
7192 op0 = XEXP (op0, 0);
7194 new_op0 = aarch64_strip_shift (op0);
7196 /* If we had a shift on op0 then this is a logical-shift-
7197 by-register/immediate operation. Otherwise, this is just
7198 a logical operation. */
7199 if (speed)
7201 if (new_op0 != op0)
7203 /* Shift by immediate. */
7204 if (CONST_INT_P (XEXP (op0, 1)))
7205 *cost += extra_cost->alu.log_shift;
7206 else
7207 *cost += extra_cost->alu.log_shift_reg;
7209 else
7210 *cost += extra_cost->alu.logical;
7213 /* In both cases we want to cost both operands. */
7214 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7215 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7217 return true;
7220 return false;
7222 case NOT:
7223 x = XEXP (x, 0);
7224 op0 = aarch64_strip_shift (x);
7226 if (VECTOR_MODE_P (mode))
7228 /* Vector NOT. */
7229 *cost += extra_cost->vect.alu;
7230 return false;
7233 /* MVN-shifted-reg. */
7234 if (op0 != x)
7236 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7238 if (speed)
7239 *cost += extra_cost->alu.log_shift;
7241 return true;
7243 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
7244 Handle the second form here, taking care that 'a' in the above can
7245 be a shift. */
7246 else if (GET_CODE (op0) == XOR)
7248 rtx newop0 = XEXP (op0, 0);
7249 rtx newop1 = XEXP (op0, 1);
7250 rtx op0_stripped = aarch64_strip_shift (newop0);
7252 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7253 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7255 if (speed)
7257 if (op0_stripped != newop0)
7258 *cost += extra_cost->alu.log_shift;
7259 else
7260 *cost += extra_cost->alu.logical;
7263 return true;
7265 /* MVN. */
7266 if (speed)
7267 *cost += extra_cost->alu.logical;
7269 return false;
7271 case ZERO_EXTEND:
7273 op0 = XEXP (x, 0);
7274 /* If a value is written in SI mode, then zero extended to DI
7275 mode, the operation will in general be free as a write to
7276 a 'w' register implicitly zeroes the upper bits of an 'x'
7277 register. However, if this is
7279 (set (reg) (zero_extend (reg)))
7281 we must cost the explicit register move. */
7282 if (mode == DImode
7283 && GET_MODE (op0) == SImode
7284 && outer == SET)
7286 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7288 /* If OP_COST is non-zero, then the cost of the zero extend
7289 is effectively the cost of the inner operation. Otherwise
7290 we have a MOV instruction and we take the cost from the MOV
7291 itself. This is true independently of whether we are
7292 optimizing for space or time. */
7293 if (op_cost)
7294 *cost = op_cost;
7296 return true;
7298 else if (MEM_P (op0))
7300 /* All loads can zero extend to any size for free. */
7301 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7302 return true;
7305 op0 = aarch64_extend_bitfield_pattern_p (x);
7306 if (op0)
7308 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7309 if (speed)
7310 *cost += extra_cost->alu.bfx;
7311 return true;
7314 if (speed)
7316 if (VECTOR_MODE_P (mode))
7318 /* UMOV. */
7319 *cost += extra_cost->vect.alu;
7321 else
7323 /* We generate an AND instead of UXTB/UXTH. */
7324 *cost += extra_cost->alu.logical;
7327 return false;
7329 case SIGN_EXTEND:
7330 if (MEM_P (XEXP (x, 0)))
7332 /* LDRSH. */
7333 if (speed)
7335 rtx address = XEXP (XEXP (x, 0), 0);
7336 *cost += extra_cost->ldst.load_sign_extend;
7338 *cost +=
7339 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7340 0, speed));
7342 return true;
7345 op0 = aarch64_extend_bitfield_pattern_p (x);
7346 if (op0)
7348 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7349 if (speed)
7350 *cost += extra_cost->alu.bfx;
7351 return true;
7354 if (speed)
7356 if (VECTOR_MODE_P (mode))
7357 *cost += extra_cost->vect.alu;
7358 else
7359 *cost += extra_cost->alu.extend;
7361 return false;
7363 case ASHIFT:
7364 op0 = XEXP (x, 0);
7365 op1 = XEXP (x, 1);
7367 if (CONST_INT_P (op1))
7369 if (speed)
7371 if (VECTOR_MODE_P (mode))
7373 /* Vector shift (immediate). */
7374 *cost += extra_cost->vect.alu;
7376 else
7378 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7379 aliases. */
7380 *cost += extra_cost->alu.shift;
7384 /* We can incorporate zero/sign extend for free. */
7385 if (GET_CODE (op0) == ZERO_EXTEND
7386 || GET_CODE (op0) == SIGN_EXTEND)
7387 op0 = XEXP (op0, 0);
7389 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7390 return true;
7392 else
7394 if (speed)
7396 if (VECTOR_MODE_P (mode))
7398 /* Vector shift (register). */
7399 *cost += extra_cost->vect.alu;
7401 else
7403 /* LSLV. */
7404 *cost += extra_cost->alu.shift_reg;
7407 return false; /* All arguments need to be in registers. */
7410 case ROTATE:
7411 case ROTATERT:
7412 case LSHIFTRT:
7413 case ASHIFTRT:
7414 op0 = XEXP (x, 0);
7415 op1 = XEXP (x, 1);
7417 if (CONST_INT_P (op1))
7419 /* ASR (immediate) and friends. */
7420 if (speed)
7422 if (VECTOR_MODE_P (mode))
7423 *cost += extra_cost->vect.alu;
7424 else
7425 *cost += extra_cost->alu.shift;
7428 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7429 return true;
7431 else
7434 /* ASR (register) and friends. */
7435 if (speed)
7437 if (VECTOR_MODE_P (mode))
7438 *cost += extra_cost->vect.alu;
7439 else
7440 *cost += extra_cost->alu.shift_reg;
7442 return false; /* All arguments need to be in registers. */
7445 case SYMBOL_REF:
7447 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7448 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7450 /* LDR. */
7451 if (speed)
7452 *cost += extra_cost->ldst.load;
7454 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7455 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7457 /* ADRP, followed by ADD. */
7458 *cost += COSTS_N_INSNS (1);
7459 if (speed)
7460 *cost += 2 * extra_cost->alu.arith;
7462 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7463 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7465 /* ADR. */
7466 if (speed)
7467 *cost += extra_cost->alu.arith;
7470 if (flag_pic)
7472 /* One extra load instruction, after accessing the GOT. */
7473 *cost += COSTS_N_INSNS (1);
7474 if (speed)
7475 *cost += extra_cost->ldst.load;
7477 return true;
7479 case HIGH:
7480 case LO_SUM:
7481 /* ADRP/ADD (immediate). */
7482 if (speed)
7483 *cost += extra_cost->alu.arith;
7484 return true;
7486 case ZERO_EXTRACT:
7487 case SIGN_EXTRACT:
7488 /* UBFX/SBFX. */
7489 if (speed)
7491 if (VECTOR_MODE_P (mode))
7492 *cost += extra_cost->vect.alu;
7493 else
7494 *cost += extra_cost->alu.bfx;
7497 /* We can trust that the immediates used will be correct (there
7498 are no by-register forms), so we need only cost op0. */
7499 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7500 return true;
7502 case MULT:
7503 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7504 /* aarch64_rtx_mult_cost always handles recursion to its
7505 operands. */
7506 return true;
7508 case MOD:
7509 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
7510 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
7511 that of an unconditional negate. This case should only ever be reached
7512 through the set_smod_pow2_cheap check in expmed.c. */
7513 if (CONST_INT_P (XEXP (x, 1))
7514 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7515 && (mode == SImode || mode == DImode))
7517 /* We expand to 4 instructions. Reset the baseline. */
7518 *cost = COSTS_N_INSNS (4);
7520 if (speed)
7521 *cost += 2 * extra_cost->alu.logical
7522 + 2 * extra_cost->alu.arith;
7524 return true;
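/* For illustration only: in C terms the branchless expansion referred to
   above computes, for a power-of-two divisor d == 1 << k,

     x >= 0 ?  (x & (d - 1))
	    : -((-x) & (d - 1));

   i.e. a NEGS, two ANDs with the mask and a CSNEG, which is where the
   four-instruction baseline comes from (the negation wraps at the machine
   level, so INT_MIN is handled correctly).  */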
7527 /* Fall-through. */
7528 case UMOD:
7529 if (speed)
7531 if (VECTOR_MODE_P (mode))
7532 *cost += extra_cost->vect.alu;
7533 else if (GET_MODE_CLASS (mode) == MODE_INT)
7534 *cost += (extra_cost->mult[mode == DImode].add
7535 + extra_cost->mult[mode == DImode].idiv);
7536 else if (mode == DFmode)
7537 *cost += (extra_cost->fp[1].mult
7538 + extra_cost->fp[1].div);
7539 else if (mode == SFmode)
7540 *cost += (extra_cost->fp[0].mult
7541 + extra_cost->fp[0].div);
7543 return false; /* All arguments need to be in registers. */
7545 case DIV:
7546 case UDIV:
7547 case SQRT:
7548 if (speed)
7550 if (VECTOR_MODE_P (mode))
7551 *cost += extra_cost->vect.alu;
7552 else if (GET_MODE_CLASS (mode) == MODE_INT)
7553 /* There is no integer SQRT, so only DIV and UDIV can get
7554 here. */
7555 *cost += extra_cost->mult[mode == DImode].idiv;
7556 else
7557 *cost += extra_cost->fp[mode == DFmode].div;
7559 return false; /* All arguments need to be in registers. */
7561 case IF_THEN_ELSE:
7562 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7563 XEXP (x, 2), cost, speed);
7565 case EQ:
7566 case NE:
7567 case GT:
7568 case GTU:
7569 case LT:
7570 case LTU:
7571 case GE:
7572 case GEU:
7573 case LE:
7574 case LEU:
7576 return false; /* All arguments must be in registers. */
7578 case FMA:
7579 op0 = XEXP (x, 0);
7580 op1 = XEXP (x, 1);
7581 op2 = XEXP (x, 2);
7583 if (speed)
7585 if (VECTOR_MODE_P (mode))
7586 *cost += extra_cost->vect.alu;
7587 else
7588 *cost += extra_cost->fp[mode == DFmode].fma;
7591 /* FMSUB, FNMADD, and FNMSUB are free. */
7592 if (GET_CODE (op0) == NEG)
7593 op0 = XEXP (op0, 0);
7595 if (GET_CODE (op2) == NEG)
7596 op2 = XEXP (op2, 0);
7598 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7599 and the by-element operand as operand 0. */
7600 if (GET_CODE (op1) == NEG)
7601 op1 = XEXP (op1, 0);
7603 /* Catch vector-by-element operations. The by-element operand can
7604 either be (vec_duplicate (vec_select (x))) or just
7605 (vec_select (x)), depending on whether we are multiplying by
7606 a vector or a scalar.
7608 Canonicalization is not very good in these cases: FMA4 will put the
7609 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7610 if (GET_CODE (op0) == VEC_DUPLICATE)
7611 op0 = XEXP (op0, 0);
7612 else if (GET_CODE (op1) == VEC_DUPLICATE)
7613 op1 = XEXP (op1, 0);
7615 if (GET_CODE (op0) == VEC_SELECT)
7616 op0 = XEXP (op0, 0);
7617 else if (GET_CODE (op1) == VEC_SELECT)
7618 op1 = XEXP (op1, 0);
7620 /* If the remaining parameters are not registers,
7621 get the cost to put them into registers. */
7622 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7623 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7624 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7625 return true;
7627 case FLOAT:
7628 case UNSIGNED_FLOAT:
7629 if (speed)
7630 *cost += extra_cost->fp[mode == DFmode].fromint;
7631 return false;
7633 case FLOAT_EXTEND:
7634 if (speed)
7636 if (VECTOR_MODE_P (mode))
7638 /* Vector widening conversion. */
7639 *cost += extra_cost->vect.alu;
7641 else
7642 *cost += extra_cost->fp[mode == DFmode].widen;
7644 return false;
7646 case FLOAT_TRUNCATE:
7647 if (speed)
7649 if (VECTOR_MODE_P (mode))
7651 /* Vector narrowing conversion. */
7652 *cost += extra_cost->vect.alu;
7654 else
7655 *cost += extra_cost->fp[mode == DFmode].narrow;
7657 return false;
7659 case FIX:
7660 case UNSIGNED_FIX:
7661 x = XEXP (x, 0);
7662 /* Strip the rounding part; the rounding variants will all be
7663 implemented by the fcvt* family of instructions anyway. */
7664 if (GET_CODE (x) == UNSPEC)
7666 unsigned int uns_code = XINT (x, 1);
7668 if (uns_code == UNSPEC_FRINTA
7669 || uns_code == UNSPEC_FRINTM
7670 || uns_code == UNSPEC_FRINTN
7671 || uns_code == UNSPEC_FRINTP
7672 || uns_code == UNSPEC_FRINTZ)
7673 x = XVECEXP (x, 0, 0);
7676 if (speed)
7678 if (VECTOR_MODE_P (mode))
7679 *cost += extra_cost->vect.alu;
7680 else
7681 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7684 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7685 fixed-point fcvt. */
7686 if (GET_CODE (x) == MULT
7687 && ((VECTOR_MODE_P (mode)
7688 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7689 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7691 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7692 0, speed);
7693 return true;
7696 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7697 return true;
7699 case ABS:
7700 if (VECTOR_MODE_P (mode))
7702 /* ABS (vector). */
7703 if (speed)
7704 *cost += extra_cost->vect.alu;
7706 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7708 op0 = XEXP (x, 0);
7710 /* FABD, which is analogous to FADD. */
7711 if (GET_CODE (op0) == MINUS)
7713 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7714 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7715 if (speed)
7716 *cost += extra_cost->fp[mode == DFmode].addsub;
7718 return true;
7720 /* Simple FABS is analogous to FNEG. */
7721 if (speed)
7722 *cost += extra_cost->fp[mode == DFmode].neg;
7724 else
7726 /* Integer ABS will either be split into
7727 two arithmetic instructions, or will be an ABS
7728 (scalar), which we don't model. */
7729 *cost = COSTS_N_INSNS (2);
7730 if (speed)
7731 *cost += 2 * extra_cost->alu.arith;
7733 return false;
7735 case SMAX:
7736 case SMIN:
7737 if (speed)
7739 if (VECTOR_MODE_P (mode))
7740 *cost += extra_cost->vect.alu;
7741 else
7743 /* FMAXNM/FMINNM/FMAX/FMIN.
7744 TODO: This may not be accurate for all implementations, but
7745 we do not model this in the cost tables. */
7746 *cost += extra_cost->fp[mode == DFmode].addsub;
7749 return false;
7751 case UNSPEC:
7752 /* The floating point round to integer frint* instructions. */
7753 if (aarch64_frint_unspec_p (XINT (x, 1)))
7755 if (speed)
7756 *cost += extra_cost->fp[mode == DFmode].roundint;
7758 return false;
7761 if (XINT (x, 1) == UNSPEC_RBIT)
7763 if (speed)
7764 *cost += extra_cost->alu.rev;
7766 return false;
7768 break;
7770 case TRUNCATE:
7772 /* Decompose <su>muldi3_highpart. */
7773 if (/* (truncate:DI */
7774 mode == DImode
7775 /* (lshiftrt:TI */
7776 && GET_MODE (XEXP (x, 0)) == TImode
7777 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7778 /* (mult:TI */
7779 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7780 /* (ANY_EXTEND:TI (reg:DI))
7781 (ANY_EXTEND:TI (reg:DI))) */
7782 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7783 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7784 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7785 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7786 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7787 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7788 /* (const_int 64) */
7789 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7790 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7792 /* UMULH/SMULH. */
7793 if (speed)
7794 *cost += extra_cost->mult[mode == DImode].extend;
7795 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7796 mode, MULT, 0, speed);
7797 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7798 mode, MULT, 1, speed);
7799 return true;
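/* For illustration only: the RTL shape matched above is what taking the high
   half of a full 64x64->128 multiply produces, e.g.

     unsigned long long
     ex_umulh (unsigned long long a, unsigned long long b)
     {
       return (unsigned long long) (((unsigned __int128) a * b) >> 64);
     }

   which a recent GCC is expected to compile to a single UMULH (SMULH for the
   signed variant).  */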
7802 /* Fall through. */
7803 default:
7804 break;
7807 if (dump_file
7808 && flag_aarch64_verbose_cost)
7809 fprintf (dump_file,
7810 "\nFailed to cost RTX. Assuming default cost.\n");
7812 return true;
7815 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
7816 calculated for X. This cost is stored in *COST. Returns true
7817 if the total cost of X was calculated. */
7818 static bool
7819 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7820 int param, int *cost, bool speed)
7822 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7824 if (dump_file
7825 && flag_aarch64_verbose_cost)
7827 print_rtl_single (dump_file, x);
7828 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7829 speed ? "Hot" : "Cold",
7830 *cost, result ? "final" : "partial");
7833 return result;
7836 static int
7837 aarch64_register_move_cost (machine_mode mode,
7838 reg_class_t from_i, reg_class_t to_i)
7840 enum reg_class from = (enum reg_class) from_i;
7841 enum reg_class to = (enum reg_class) to_i;
7842 const struct cpu_regmove_cost *regmove_cost
7843 = aarch64_tune_params.regmove_cost;
7845 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7846 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7847 to = GENERAL_REGS;
7849 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7850 from = GENERAL_REGS;
7852 /* Moving between GPR and stack cost is the same as GP2GP. */
7853 if ((from == GENERAL_REGS && to == STACK_REG)
7854 || (to == GENERAL_REGS && from == STACK_REG))
7855 return regmove_cost->GP2GP;
7857 /* To/From the stack register, we move via the gprs. */
7858 if (to == STACK_REG || from == STACK_REG)
7859 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7860 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7862 if (GET_MODE_SIZE (mode) == 16)
7864 /* 128-bit operations on general registers require 2 instructions. */
7865 if (from == GENERAL_REGS && to == GENERAL_REGS)
7866 return regmove_cost->GP2GP * 2;
7867 else if (from == GENERAL_REGS)
7868 return regmove_cost->GP2FP * 2;
7869 else if (to == GENERAL_REGS)
7870 return regmove_cost->FP2GP * 2;
7872 /* When AdvSIMD instructions are disabled it is not possible to move
7873 a 128-bit value directly between Q registers. This is handled in
7874 secondary reload. A general register is used as a scratch to move
7875 the upper DI value and the lower DI value is moved directly,
7876 hence the cost is the sum of three moves. */
7877 if (! TARGET_SIMD)
7878 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7880 return regmove_cost->FP2FP;
7883 if (from == GENERAL_REGS && to == GENERAL_REGS)
7884 return regmove_cost->GP2GP;
7885 else if (from == GENERAL_REGS)
7886 return regmove_cost->GP2FP;
7887 else if (to == GENERAL_REGS)
7888 return regmove_cost->FP2GP;
7890 return regmove_cost->FP2FP;
7893 static int
7894 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7895 reg_class_t rclass ATTRIBUTE_UNUSED,
7896 bool in ATTRIBUTE_UNUSED)
7898 return aarch64_tune_params.memmov_cost;
7901 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7902 to optimize 1.0/sqrt. */
7904 static bool
7905 use_rsqrt_p (machine_mode mode)
7907 return (!flag_trapping_math
7908 && flag_unsafe_math_optimizations
7909 && ((aarch64_tune_params.approx_modes->recip_sqrt
7910 & AARCH64_APPROX_MODE (mode))
7911 || flag_mrecip_low_precision_sqrt));
7914 /* Function to decide when to use the approximate reciprocal square root
7915 builtin. */
7917 static tree
7918 aarch64_builtin_reciprocal (tree fndecl)
7920 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7922 if (!use_rsqrt_p (mode))
7923 return NULL_TREE;
7924 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7927 typedef rtx (*rsqrte_type) (rtx, rtx);
7929 /* Select reciprocal square root initial estimate insn depending on machine
7930 mode. */
7932 static rsqrte_type
7933 get_rsqrte_type (machine_mode mode)
7935 switch (mode)
7937 case DFmode: return gen_aarch64_rsqrtedf;
7938 case SFmode: return gen_aarch64_rsqrtesf;
7939 case V2DFmode: return gen_aarch64_rsqrtev2df;
7940 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7941 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7942 default: gcc_unreachable ();
7946 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7948 /* Select reciprocal square root series step insn depending on machine mode. */
7950 static rsqrts_type
7951 get_rsqrts_type (machine_mode mode)
7953 switch (mode)
7955 case DFmode: return gen_aarch64_rsqrtsdf;
7956 case SFmode: return gen_aarch64_rsqrtssf;
7957 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7958 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7959 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7960 default: gcc_unreachable ();
7964 /* Emit instruction sequence to compute either the approximate square root
7965 or its approximate reciprocal, depending on the flag RECP, and return
7966 whether the sequence was emitted or not. */
7968 bool
7969 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7971 machine_mode mode = GET_MODE (dst);
7973 if (GET_MODE_INNER (mode) == HFmode)
7974 return false;
7976 machine_mode mmsk = mode_for_vector
7977 (int_mode_for_mode (GET_MODE_INNER (mode)),
7978 GET_MODE_NUNITS (mode));
7979 bool use_approx_sqrt_p = (!recp
7980 && (flag_mlow_precision_sqrt
7981 || (aarch64_tune_params.approx_modes->sqrt
7982 & AARCH64_APPROX_MODE (mode))));
7983 bool use_approx_rsqrt_p = (recp
7984 && (flag_mrecip_low_precision_sqrt
7985 || (aarch64_tune_params.approx_modes->recip_sqrt
7986 & AARCH64_APPROX_MODE (mode))));
7988 if (!flag_finite_math_only
7989 || flag_trapping_math
7990 || !flag_unsafe_math_optimizations
7991 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7992 || optimize_function_for_size_p (cfun))
7993 return false;
7995 rtx xmsk = gen_reg_rtx (mmsk);
7996 if (!recp)
7997 /* When calculating the approximate square root, compare the argument with
7998 0.0 and create a mask. */
7999 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
8000 CONST0_RTX (mode)))));
8002 /* Estimate the approximate reciprocal square root. */
8003 rtx xdst = gen_reg_rtx (mode);
8004 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8006 /* Iterate over the series twice for SF and thrice for DF. */
8007 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8009 /* Optionally iterate over the series once less for faster performance
8010 at the cost of some accuracy. */
8011 if ((recp && flag_mrecip_low_precision_sqrt)
8012 || (!recp && flag_mlow_precision_sqrt))
8013 iterations--;
8015 /* Iterate over the series to calculate the approximate reciprocal square
8016 root. */
8017 rtx x1 = gen_reg_rtx (mode);
8018 while (iterations--)
8020 rtx x2 = gen_reg_rtx (mode);
8021 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8023 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8025 if (iterations > 0)
8026 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8029 if (!recp)
8031 /* Qualify the approximate reciprocal square root when the argument is
8032 0.0 by squashing the intermediary result to 0.0. */
8033 rtx xtmp = gen_reg_rtx (mmsk);
8034 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8035 gen_rtx_SUBREG (mmsk, xdst, 0)));
8036 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8038 /* Calculate the approximate square root. */
8039 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8042 /* Finalize the approximation. */
8043 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8045 return true;
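/* For illustration only (not used by the compiler): a standalone sketch of
   the Newton-Raphson step driven above.  FRSQRTS computes (3 - a*b) / 2, so
   each iteration refines an estimate E of 1/sqrt(D) as E * (3 - D*E*E) / 2.
   A deliberately crude starting guess stands in for the FRSQRTE hardware
   estimate; the real sequence runs 2 (SF) or 3 (DF) steps.  Compile it
   separately, linking with -lm for the reference value.  */

#include <stdio.h>
#include <math.h>

static double
ex_rsqrt_step (double d, double e)
{
  return e * (3.0 - d * e * e) / 2.0;
}

int
main (void)
{
  double d = 2.0;
  double e = 0.5;	/* Crude initial estimate of 1/sqrt(2).  */

  for (int i = 0; i < 4; i++)
    {
      e = ex_rsqrt_step (d, e);
      printf ("step %d: %.12f\n", i, e);
    }
  printf ("reference:  %.12f\n", 1.0 / sqrt (d));
  return 0;
}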
8048 typedef rtx (*recpe_type) (rtx, rtx);
8050 /* Select reciprocal initial estimate insn depending on machine mode. */
8052 static recpe_type
8053 get_recpe_type (machine_mode mode)
8055 switch (mode)
8057 case SFmode: return (gen_aarch64_frecpesf);
8058 case V2SFmode: return (gen_aarch64_frecpev2sf);
8059 case V4SFmode: return (gen_aarch64_frecpev4sf);
8060 case DFmode: return (gen_aarch64_frecpedf);
8061 case V2DFmode: return (gen_aarch64_frecpev2df);
8062 default: gcc_unreachable ();
8066 typedef rtx (*recps_type) (rtx, rtx, rtx);
8068 /* Select reciprocal series step insn depending on machine mode. */
8070 static recps_type
8071 get_recps_type (machine_mode mode)
8073 switch (mode)
8075 case SFmode: return (gen_aarch64_frecpssf);
8076 case V2SFmode: return (gen_aarch64_frecpsv2sf);
8077 case V4SFmode: return (gen_aarch64_frecpsv4sf);
8078 case DFmode: return (gen_aarch64_frecpsdf);
8079 case V2DFmode: return (gen_aarch64_frecpsv2df);
8080 default: gcc_unreachable ();
8084 /* Emit the instruction sequence to compute the approximation for the division
8085 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8087 bool
8088 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8090 machine_mode mode = GET_MODE (quo);
8092 if (GET_MODE_INNER (mode) == HFmode)
8093 return false;
8095 bool use_approx_division_p = (flag_mlow_precision_div
8096 || (aarch64_tune_params.approx_modes->division
8097 & AARCH64_APPROX_MODE (mode)));
8099 if (!flag_finite_math_only
8100 || flag_trapping_math
8101 || !flag_unsafe_math_optimizations
8102 || optimize_function_for_size_p (cfun)
8103 || !use_approx_division_p)
8104 return false;
8106 /* Estimate the approximate reciprocal. */
8107 rtx xrcp = gen_reg_rtx (mode);
8108 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8110 /* Iterate over the series twice for SF and thrice for DF. */
8111 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8113 /* Optionally run one fewer iteration of the series for faster performance,
8114 at the expense of some accuracy. */
8115 if (flag_mlow_precision_div)
8116 iterations--;
8118 /* Iterate over the series to calculate the approximate reciprocal. */
8119 rtx xtmp = gen_reg_rtx (mode);
8120 while (iterations--)
8122 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8124 if (iterations > 0)
8125 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8128 if (num != CONST1_RTX (mode))
8130 /* As the approximate reciprocal of DEN is already calculated, only
8131 calculate the approximate division when NUM is not 1.0. */
8132 rtx xnum = force_reg (mode, num);
8133 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8136 /* Finalize the approximation. */
8137 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8138 return true;
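/* Illustration only, not part of GCC: a standalone scalar sketch of the
   reciprocal refinement emitted above via the FRECPE/FRECPS patterns.
   Each step improves an estimate X of 1/DEN using x' = x * (2 - den * x),
   and the quotient is then obtained as NUM * (1/DEN).  */
static double
approx_div_sketch (double num, double den, double x, int iterations)
{
  while (iterations-- > 0)
    x = x * (2.0 - den * x);  /* One FRECPS-style refinement.  */
  return num * x;             /* Approximates num / den.  */
}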
8141 /* Return the number of instructions that can be issued per cycle. */
8142 static int
8143 aarch64_sched_issue_rate (void)
8145 return aarch64_tune_params.issue_rate;
8148 static int
8149 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8151 int issue_rate = aarch64_sched_issue_rate ();
8153 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8157 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8158 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8159 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8161 static int
8162 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8163 int ready_index)
8165 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8169 /* Vectorizer cost model target hooks. */
8171 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8172 static int
8173 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8174 tree vectype,
8175 int misalign ATTRIBUTE_UNUSED)
8177 unsigned elements;
8178 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8179 bool fp = false;
8181 if (vectype != NULL)
8182 fp = FLOAT_TYPE_P (vectype);
8184 switch (type_of_cost)
8186 case scalar_stmt:
8187 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8189 case scalar_load:
8190 return costs->scalar_load_cost;
8192 case scalar_store:
8193 return costs->scalar_store_cost;
8195 case vector_stmt:
8196 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8198 case vector_load:
8199 return costs->vec_align_load_cost;
8201 case vector_store:
8202 return costs->vec_store_cost;
8204 case vec_to_scalar:
8205 return costs->vec_to_scalar_cost;
8207 case scalar_to_vec:
8208 return costs->scalar_to_vec_cost;
8210 case unaligned_load:
8211 return costs->vec_unalign_load_cost;
8213 case unaligned_store:
8214 return costs->vec_unalign_store_cost;
8216 case cond_branch_taken:
8217 return costs->cond_taken_branch_cost;
8219 case cond_branch_not_taken:
8220 return costs->cond_not_taken_branch_cost;
8222 case vec_perm:
8223 return costs->vec_permute_cost;
8225 case vec_promote_demote:
8226 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8228 case vec_construct:
8229 elements = TYPE_VECTOR_SUBPARTS (vectype);
8230 return elements / 2 + 1;
8232 default:
8233 gcc_unreachable ();
8237 /* Implement targetm.vectorize.add_stmt_cost. */
8238 static unsigned
8239 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8240 struct _stmt_vec_info *stmt_info, int misalign,
8241 enum vect_cost_model_location where)
8243 unsigned *cost = (unsigned *) data;
8244 unsigned retval = 0;
8246 if (flag_vect_cost_model)
8248 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8249 int stmt_cost =
8250 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8252 /* Statements in an inner loop relative to the loop being
8253 vectorized are weighted more heavily. The value here is
8254 arbitrary and could potentially be improved with analysis. */
8255 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8256 count *= 50; /* FIXME */
8258 retval = (unsigned) (count * stmt_cost);
8259 cost[where] += retval;
8262 return retval;
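/* For example (with the vectorizer cost model enabled): a single vector
   statement that lies in an inner loop relative to the loop being vectorized
   has its COUNT scaled by 50, so it contributes 50 times its per-statement
   cost to the vect_body bucket.  */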
8265 static void initialize_aarch64_code_model (struct gcc_options *);
8267 /* Parse the TO_PARSE string and put the architecture struct that it
8268 selects into RES and the architectural features into ISA_FLAGS.
8269 Return an aarch64_parse_opt_result describing the parse result.
8270 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8272 static enum aarch64_parse_opt_result
8273 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8274 unsigned long *isa_flags)
8276 char *ext;
8277 const struct processor *arch;
8278 char *str = (char *) alloca (strlen (to_parse) + 1);
8279 size_t len;
8281 strcpy (str, to_parse);
8283 ext = strchr (str, '+');
8285 if (ext != NULL)
8286 len = ext - str;
8287 else
8288 len = strlen (str);
8290 if (len == 0)
8291 return AARCH64_PARSE_MISSING_ARG;
8294 /* Loop through the list of supported ARCHes to find a match. */
8295 for (arch = all_architectures; arch->name != NULL; arch++)
8297 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8299 unsigned long isa_temp = arch->flags;
8301 if (ext != NULL)
8303 /* TO_PARSE string contains at least one extension. */
8304 enum aarch64_parse_opt_result ext_res
8305 = aarch64_parse_extension (ext, &isa_temp);
8307 if (ext_res != AARCH64_PARSE_OK)
8308 return ext_res;
8310 /* Extension parsing was successful. Confirm the result
8311 arch and ISA flags. */
8312 *res = arch;
8313 *isa_flags = isa_temp;
8314 return AARCH64_PARSE_OK;
8318 /* ARCH name not found in list. */
8319 return AARCH64_PARSE_INVALID_ARG;
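/* For example, given TO_PARSE == "armv8-a+crc", the name "armv8-a" is looked
   up in all_architectures and the "+crc" tail is handed to
   aarch64_parse_extension; a string such as "+crc" with no architecture name
   fails with AARCH64_PARSE_MISSING_ARG.  */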
8322 /* Parse the TO_PARSE string and put the result tuning in RES and the
8323 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8324 describing the parse result. If there is an error parsing, RES and
8325 ISA_FLAGS are left unchanged. */
8327 static enum aarch64_parse_opt_result
8328 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8329 unsigned long *isa_flags)
8331 char *ext;
8332 const struct processor *cpu;
8333 char *str = (char *) alloca (strlen (to_parse) + 1);
8334 size_t len;
8336 strcpy (str, to_parse);
8338 ext = strchr (str, '+');
8340 if (ext != NULL)
8341 len = ext - str;
8342 else
8343 len = strlen (str);
8345 if (len == 0)
8346 return AARCH64_PARSE_MISSING_ARG;
8349 /* Loop through the list of supported CPUs to find a match. */
8350 for (cpu = all_cores; cpu->name != NULL; cpu++)
8352 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8354 unsigned long isa_temp = cpu->flags;
8357 if (ext != NULL)
8359 /* TO_PARSE string contains at least one extension. */
8360 enum aarch64_parse_opt_result ext_res
8361 = aarch64_parse_extension (ext, &isa_temp);
8363 if (ext_res != AARCH64_PARSE_OK)
8364 return ext_res;
8366 /* Extension parsing was successful. Confirm the result
8367 cpu and ISA flags. */
8368 *res = cpu;
8369 *isa_flags = isa_temp;
8370 return AARCH64_PARSE_OK;
8374 /* CPU name not found in list. */
8375 return AARCH64_PARSE_INVALID_ARG;
8378 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8379 Return an aarch64_parse_opt_result describing the parse result.
8380 If the parsing fails the RES does not change. */
8382 static enum aarch64_parse_opt_result
8383 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8385 const struct processor *cpu;
8386 char *str = (char *) alloca (strlen (to_parse) + 1);
8388 strcpy (str, to_parse);
8390 /* Loop through the list of supported CPUs to find a match. */
8391 for (cpu = all_cores; cpu->name != NULL; cpu++)
8393 if (strcmp (cpu->name, str) == 0)
8395 *res = cpu;
8396 return AARCH64_PARSE_OK;
8400 /* CPU name not found in list. */
8401 return AARCH64_PARSE_INVALID_ARG;
8404 /* Parse TOKEN, which has length LENGTH to see if it is an option
8405 described in FLAG. If it is, return the index bit for that fusion type.
8406 If not, error (printing OPTION_NAME) and return zero. */
8408 static unsigned int
8409 aarch64_parse_one_option_token (const char *token,
8410 size_t length,
8411 const struct aarch64_flag_desc *flag,
8412 const char *option_name)
8414 for (; flag->name != NULL; flag++)
8416 if (length == strlen (flag->name)
8417 && !strncmp (flag->name, token, length))
8418 return flag->flag;
8421 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8422 return 0;
8425 /* Parse OPTION which is a comma-separated list of flags to enable.
8426 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8427 default state we inherit from the CPU tuning structures. OPTION_NAME
8428 gives the top-level option we are parsing in the -moverride string,
8429 for use in error messages. */
8431 static unsigned int
8432 aarch64_parse_boolean_options (const char *option,
8433 const struct aarch64_flag_desc *flags,
8434 unsigned int initial_state,
8435 const char *option_name)
8437 const char separator = '.';
8438 const char* specs = option;
8439 const char* ntoken = option;
8440 unsigned int found_flags = initial_state;
8442 while ((ntoken = strchr (specs, separator)))
8444 size_t token_length = ntoken - specs;
8445 unsigned token_ops = aarch64_parse_one_option_token (specs,
8446 token_length,
8447 flags,
8448 option_name);
8449 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8450 in the token stream, reset the supported operations. So:
8452 adrp+add.cmp+branch.none.adrp+add
8454 would have the result of turning on only adrp+add fusion. */
8455 if (!token_ops)
8456 found_flags = 0;
8458 found_flags |= token_ops;
8459 specs = ++ntoken;
8462 /* The string ended with the separator, so it is ill-formed; report it. */
8463 if (!(*specs))
8465 error ("%s string ill-formed\n", option_name);
8466 return 0;
8469 /* We still have one more token to parse. */
8470 size_t token_length = strlen (specs);
8471 unsigned token_ops = aarch64_parse_one_option_token (specs,
8472 token_length,
8473 flags,
8474 option_name);
8475 if (!token_ops)
8476 found_flags = 0;
8478 found_flags |= token_ops;
8479 return found_flags;
8482 /* Support for overriding instruction fusion. */
8484 static void
8485 aarch64_parse_fuse_string (const char *fuse_string,
8486 struct tune_params *tune)
8488 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8489 aarch64_fusible_pairs,
8490 tune->fusible_ops,
8491 "fuse=");
8494 /* Support for overriding other tuning flags. */
8496 static void
8497 aarch64_parse_tune_string (const char *tune_string,
8498 struct tune_params *tune)
8500 tune->extra_tuning_flags
8501 = aarch64_parse_boolean_options (tune_string,
8502 aarch64_tuning_flags,
8503 tune->extra_tuning_flags,
8504 "tune=");
8507 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8508 we understand. If it is, extract the option string and hand it off to
8509 the appropriate function. */
8511 void
8512 aarch64_parse_one_override_token (const char* token,
8513 size_t length,
8514 struct tune_params *tune)
8516 const struct aarch64_tuning_override_function *fn
8517 = aarch64_tuning_override_functions;
8519 const char *option_part = strchr (token, '=');
8520 if (!option_part)
8522 error ("tuning string missing in option (%s)", token);
8523 return;
8526 /* Get the length of the option name. */
8527 length = option_part - token;
8528 /* Skip the '=' to get to the option string. */
8529 option_part++;
8531 for (; fn->name != NULL; fn++)
8533 if (!strncmp (fn->name, token, length))
8535 fn->parse_override (option_part, tune);
8536 return;
8540 error ("unknown tuning option (%s)",token);
8541 return;
8544 /* Set the default TLS size and clamp it to what the code model allows. */
8546 static void
8547 initialize_aarch64_tls_size (struct gcc_options *opts)
8549 if (aarch64_tls_size == 0)
8550 aarch64_tls_size = 24;
8552 switch (opts->x_aarch64_cmodel_var)
8554 case AARCH64_CMODEL_TINY:
8555 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
8556 needs two instructions to address, so we clamp the size to 24 bits. */
8557 if (aarch64_tls_size > 24)
8558 aarch64_tls_size = 24;
8559 break;
8560 case AARCH64_CMODEL_SMALL:
8561 /* The maximum TLS size allowed under small is 4G. */
8562 if (aarch64_tls_size > 32)
8563 aarch64_tls_size = 32;
8564 break;
8565 case AARCH64_CMODEL_LARGE:
8566 /* The maximum TLS size allowed under large is 16E.
8567 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
8568 if (aarch64_tls_size > 48)
8569 aarch64_tls_size = 48;
8570 break;
8571 default:
8572 gcc_unreachable ();
8575 return;
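/* In short: the TLS size defaults to 24 bits and is clamped to 24 bits (1M)
   under the tiny model, 32 bits (4G) under the small model and 48 bits under
   the large model.  */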
8578 /* Parse STRING looking for options in the format:
8579 string :: option:string
8580 option :: name=substring
8581 name :: {a-z}
8582 substring :: defined by option. */
8584 static void
8585 aarch64_parse_override_string (const char* input_string,
8586 struct tune_params* tune)
8588 const char separator = ':';
8589 size_t string_length = strlen (input_string) + 1;
8590 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8591 char *string = string_root;
8592 strncpy (string, input_string, string_length);
8593 string[string_length - 1] = '\0';
8595 char* ntoken = string;
8597 while ((ntoken = strchr (string, separator)))
8599 size_t token_length = ntoken - string;
8600 /* NUL-terminate this substring so it can be treated as a string. */
8601 *ntoken = '\0';
8602 aarch64_parse_one_override_token (string, token_length, tune);
8603 string = ++ntoken;
8606 /* One last option to parse. */
8607 aarch64_parse_one_override_token (string, strlen (string), tune);
8608 free (string_root);
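/* For example (illustrative), -moverride=fuse=adrp+add.cmp+branch:tune=<flag>
   is split at ':' into "fuse=adrp+add.cmp+branch" and "tune=<flag>", each of
   which is dispatched by aarch64_parse_one_override_token; <flag> stands for
   any name listed in aarch64_tuning_flags.  */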
8612 static void
8613 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8615 /* The logic here is that if we are disabling all frame pointer generation
8616 then we do not need to disable leaf frame pointer generation as a
8617 separate operation. But if we are *only* disabling leaf frame pointer
8618 generation then we set flag_omit_frame_pointer to true, but in
8619 aarch64_frame_pointer_required we return false only for leaf functions.
8621 PR 70044: We have to be careful about being called multiple times for the
8622 same function. Once we have decided to set flag_omit_frame_pointer just
8623 so that we can omit leaf frame pointers, we must then not interpret a
8624 second call as meaning that all frame pointer generation should be
8625 omitted. We do this by setting flag_omit_frame_pointer to a special,
8626 non-zero value. */
8627 if (opts->x_flag_omit_frame_pointer == 2)
8628 opts->x_flag_omit_frame_pointer = 0;
8630 if (opts->x_flag_omit_frame_pointer)
8631 opts->x_flag_omit_leaf_frame_pointer = false;
8632 else if (opts->x_flag_omit_leaf_frame_pointer)
8633 opts->x_flag_omit_frame_pointer = 2;
8635 /* If not optimizing for size, set the default
8636 alignment to what the target wants. */
8637 if (!opts->x_optimize_size)
8639 if (opts->x_align_loops <= 0)
8640 opts->x_align_loops = aarch64_tune_params.loop_align;
8641 if (opts->x_align_jumps <= 0)
8642 opts->x_align_jumps = aarch64_tune_params.jump_align;
8643 if (opts->x_align_functions <= 0)
8644 opts->x_align_functions = aarch64_tune_params.function_align;
8647 /* We default to no pc-relative literal loads. */
8649 aarch64_pcrelative_literal_loads = false;
8651 /* If -mpc-relative-literal-loads is set on the command line, this
8652 implies that the user asked for PC relative literal loads. */
8653 if (opts->x_pcrelative_literal_loads == 1)
8654 aarch64_pcrelative_literal_loads = true;
8656 /* This is PR70113. When building the Linux kernel with
8657 CONFIG_ARM64_ERRATUM_843419, support for relocations
8658 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8659 removed from the kernel to avoid loading objects with possibly
8660 offending sequences. Without -mpc-relative-literal-loads we would
8661 generate such relocations, preventing the kernel build from
8662 succeeding. */
8663 if (opts->x_pcrelative_literal_loads == 2
8664 && TARGET_FIX_ERR_A53_843419)
8665 aarch64_pcrelative_literal_loads = true;
8667 /* In the tiny memory model it makes no sense to disallow PC relative
8668 literal pool loads. */
8669 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8670 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8671 aarch64_pcrelative_literal_loads = true;
8673 /* When enabling the lower precision Newton series for the square root, also
8674 enable it for the reciprocal square root, since the latter is an
8675 intermediary step for the former. */
8676 if (flag_mlow_precision_sqrt)
8677 flag_mrecip_low_precision_sqrt = true;
8680 /* 'Unpack' the internal tuning structs and update the options
8681 in OPTS. The caller must have set up selected_tune and selected_arch
8682 as all the other target-specific codegen decisions are
8683 derived from them. */
8685 void
8686 aarch64_override_options_internal (struct gcc_options *opts)
8688 aarch64_tune_flags = selected_tune->flags;
8689 aarch64_tune = selected_tune->sched_core;
8690 /* Make a copy of the tuning parameters attached to the core, which
8691 we may later overwrite. */
8692 aarch64_tune_params = *(selected_tune->tune);
8693 aarch64_architecture_version = selected_arch->architecture_version;
8695 if (opts->x_aarch64_override_tune_string)
8696 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8697 &aarch64_tune_params);
8699 /* This target defaults to strict volatile bitfields. */
8700 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8701 opts->x_flag_strict_volatile_bitfields = 1;
8703 initialize_aarch64_code_model (opts);
8704 initialize_aarch64_tls_size (opts);
8706 int queue_depth = 0;
8707 switch (aarch64_tune_params.autoprefetcher_model)
8709 case tune_params::AUTOPREFETCHER_OFF:
8710 queue_depth = -1;
8711 break;
8712 case tune_params::AUTOPREFETCHER_WEAK:
8713 queue_depth = 0;
8714 break;
8715 case tune_params::AUTOPREFETCHER_STRONG:
8716 queue_depth = max_insn_queue_index + 1;
8717 break;
8718 default:
8719 gcc_unreachable ();
8722 /* We don't mind passing in global_options_set here as we don't use
8723 the *options_set structs anyway. */
8724 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8725 queue_depth,
8726 opts->x_param_values,
8727 global_options_set.x_param_values);
8729 /* Set the L1 cache line size. */
8730 if (selected_cpu->tune->cache_line_size != 0)
8731 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8732 selected_cpu->tune->cache_line_size,
8733 opts->x_param_values,
8734 global_options_set.x_param_values);
8736 aarch64_override_options_after_change_1 (opts);
8739 /* Print a hint with a suggestion for a core or architecture name that
8740 most closely resembles what the user passed in STR. ARCH is true if
8741 the user is asking for an architecture name. ARCH is false if the user
8742 is asking for a core name. */
8744 static void
8745 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8747 auto_vec<const char *> candidates;
8748 const struct processor *entry = arch ? all_architectures : all_cores;
8749 for (; entry->name != NULL; entry++)
8750 candidates.safe_push (entry->name);
8751 char *s;
8752 const char *hint = candidates_list_and_hint (str, s, candidates);
8753 if (hint)
8754 inform (input_location, "valid arguments are: %s;"
8755 " did you mean %qs?", s, hint);
8756 XDELETEVEC (s);
8759 /* Print a hint with a suggestion for a core name that most closely resembles
8760 what the user passed in STR. */
8762 inline static void
8763 aarch64_print_hint_for_core (const char *str)
8765 aarch64_print_hint_for_core_or_arch (str, false);
8768 /* Print a hint with a suggestion for an architecture name that most closely
8769 resembles what the user passed in STR. */
8771 inline static void
8772 aarch64_print_hint_for_arch (const char *str)
8774 aarch64_print_hint_for_core_or_arch (str, true);
8777 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8778 specified in STR and throw errors if appropriate. Put the results if
8779 they are valid in RES and ISA_FLAGS. Return whether the option is
8780 valid. */
8782 static bool
8783 aarch64_validate_mcpu (const char *str, const struct processor **res,
8784 unsigned long *isa_flags)
8786 enum aarch64_parse_opt_result parse_res
8787 = aarch64_parse_cpu (str, res, isa_flags);
8789 if (parse_res == AARCH64_PARSE_OK)
8790 return true;
8792 switch (parse_res)
8794 case AARCH64_PARSE_MISSING_ARG:
8795 error ("missing cpu name in %<-mcpu=%s%>", str);
8796 break;
8797 case AARCH64_PARSE_INVALID_ARG:
8798 error ("unknown value %qs for -mcpu", str);
8799 aarch64_print_hint_for_core (str);
8800 break;
8801 case AARCH64_PARSE_INVALID_FEATURE:
8802 error ("invalid feature modifier in %<-mcpu=%s%>", str);
8803 break;
8804 default:
8805 gcc_unreachable ();
8808 return false;
8811 /* Validate a command-line -march option. Parse the arch and extensions
8812 (if any) specified in STR and throw errors if appropriate. Put the
8813 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8814 option is valid. */
8816 static bool
8817 aarch64_validate_march (const char *str, const struct processor **res,
8818 unsigned long *isa_flags)
8820 enum aarch64_parse_opt_result parse_res
8821 = aarch64_parse_arch (str, res, isa_flags);
8823 if (parse_res == AARCH64_PARSE_OK)
8824 return true;
8826 switch (parse_res)
8828 case AARCH64_PARSE_MISSING_ARG:
8829 error ("missing arch name in %<-march=%s%>", str);
8830 break;
8831 case AARCH64_PARSE_INVALID_ARG:
8832 error ("unknown value %qs for -march", str);
8833 aarch64_print_hint_for_arch (str);
8834 break;
8835 case AARCH64_PARSE_INVALID_FEATURE:
8836 error ("invalid feature modifier in %<-march=%s%>", str);
8837 break;
8838 default:
8839 gcc_unreachable ();
8842 return false;
8845 /* Validate a command-line -mtune option. Parse the cpu
8846 specified in STR and throw errors if appropriate. Put the
8847 result, if it is valid, in RES. Return whether the option is
8848 valid. */
8850 static bool
8851 aarch64_validate_mtune (const char *str, const struct processor **res)
8853 enum aarch64_parse_opt_result parse_res
8854 = aarch64_parse_tune (str, res);
8856 if (parse_res == AARCH64_PARSE_OK)
8857 return true;
8859 switch (parse_res)
8861 case AARCH64_PARSE_MISSING_ARG:
8862 error ("missing cpu name in %<-mtune=%s%>", str);
8863 break;
8864 case AARCH64_PARSE_INVALID_ARG:
8865 error ("unknown value %qs for -mtune", str);
8866 aarch64_print_hint_for_core (str);
8867 break;
8868 default:
8869 gcc_unreachable ();
8871 return false;
8874 /* Return the CPU corresponding to the enum CPU.
8875 If it doesn't specify a cpu, return the default. */
8877 static const struct processor *
8878 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8880 if (cpu != aarch64_none)
8881 return &all_cores[cpu];
8883 /* The & 0x3f is to extract the bottom 6 bits that encode the
8884 default cpu as selected by the --with-cpu GCC configure option
8885 in config.gcc.
8886 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8887 flags mechanism should be reworked to make it more sane. */
8888 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8891 /* Return the architecture corresponding to the enum ARCH.
8892 If it doesn't specify a valid architecture, return the default. */
8894 static const struct processor *
8895 aarch64_get_arch (enum aarch64_arch arch)
8897 if (arch != aarch64_no_arch)
8898 return &all_architectures[arch];
8900 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8902 return &all_architectures[cpu->arch];
8905 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8906 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8907 tuning structs. In particular it must set selected_tune and
8908 aarch64_isa_flags that define the available ISA features and tuning
8909 decisions. It must also set selected_arch as this will be used to
8910 output the .arch asm tags for each function. */
8912 static void
8913 aarch64_override_options (void)
8915 unsigned long cpu_isa = 0;
8916 unsigned long arch_isa = 0;
8917 aarch64_isa_flags = 0;
8919 bool valid_cpu = true;
8920 bool valid_tune = true;
8921 bool valid_arch = true;
8923 selected_cpu = NULL;
8924 selected_arch = NULL;
8925 selected_tune = NULL;
8927 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8928 If either of -march or -mtune is given, they override their
8929 respective component of -mcpu. */
8930 if (aarch64_cpu_string)
8931 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8932 &cpu_isa);
8934 if (aarch64_arch_string)
8935 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8936 &arch_isa);
8938 if (aarch64_tune_string)
8939 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8941 /* If the user did not specify a processor, choose the default
8942 one for them. This will be the CPU set during configuration using
8943 --with-cpu, otherwise it is "generic". */
8944 if (!selected_cpu)
8946 if (selected_arch)
8948 selected_cpu = &all_cores[selected_arch->ident];
8949 aarch64_isa_flags = arch_isa;
8950 explicit_arch = selected_arch->arch;
8952 else
8954 /* Get default configure-time CPU. */
8955 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8956 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8959 if (selected_tune)
8960 explicit_tune_core = selected_tune->ident;
8962 /* If both -mcpu and -march are specified check that they are architecturally
8963 compatible, warn if they're not and prefer the -march ISA flags. */
8964 else if (selected_arch)
8966 if (selected_arch->arch != selected_cpu->arch)
8968 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8969 all_architectures[selected_cpu->arch].name,
8970 selected_arch->name);
8972 aarch64_isa_flags = arch_isa;
8973 explicit_arch = selected_arch->arch;
8974 explicit_tune_core = selected_tune ? selected_tune->ident
8975 : selected_cpu->ident;
8977 else
8979 /* -mcpu but no -march. */
8980 aarch64_isa_flags = cpu_isa;
8981 explicit_tune_core = selected_tune ? selected_tune->ident
8982 : selected_cpu->ident;
8983 gcc_assert (selected_cpu);
8984 selected_arch = &all_architectures[selected_cpu->arch];
8985 explicit_arch = selected_arch->arch;
8988 /* Set the arch as well, as we will need it when outputting
8989 the .arch directive in assembly. */
8990 if (!selected_arch)
8992 gcc_assert (selected_cpu);
8993 selected_arch = &all_architectures[selected_cpu->arch];
8996 if (!selected_tune)
8997 selected_tune = selected_cpu;
8999 #ifndef HAVE_AS_MABI_OPTION
9000 /* The compiler may have been configured with 2.23.* binutils, which does
9001 not have support for ILP32. */
9002 if (TARGET_ILP32)
9003 error ("Assembler does not support -mabi=ilp32");
9004 #endif
9006 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9007 sorry ("Return address signing is only supported for -mabi=lp64");
9009 /* Make sure we properly set up the explicit options. */
9010 if ((aarch64_cpu_string && valid_cpu)
9011 || (aarch64_tune_string && valid_tune))
9012 gcc_assert (explicit_tune_core != aarch64_none);
9014 if ((aarch64_cpu_string && valid_cpu)
9015 || (aarch64_arch_string && valid_arch))
9016 gcc_assert (explicit_arch != aarch64_no_arch);
9018 aarch64_override_options_internal (&global_options);
9020 /* Save these options as the default ones in case we push and pop them later
9021 while processing functions with potential target attributes. */
9022 target_option_default_node = target_option_current_node
9023 = build_target_option_node (&global_options);
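/* For example, -mcpu=cortex-a57 -march=armv8.1-a takes its ISA flags from
   armv8.1-a (after warning that the two switches conflict, since cortex-a57
   implements armv8-a) while continuing to tune for cortex-a57, as no -mtune
   was given.  */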
9026 /* Implement targetm.override_options_after_change. */
9028 static void
9029 aarch64_override_options_after_change (void)
9031 aarch64_override_options_after_change_1 (&global_options);
9034 static struct machine_function *
9035 aarch64_init_machine_status (void)
9037 struct machine_function *machine;
9038 machine = ggc_cleared_alloc<machine_function> ();
9039 return machine;
9042 void
9043 aarch64_init_expanders (void)
9045 init_machine_status = aarch64_init_machine_status;
9048 /* Set aarch64_cmodel from the -mcmodel and -fpic/-fPIC options. */
9049 static void
9050 initialize_aarch64_code_model (struct gcc_options *opts)
9052 if (opts->x_flag_pic)
9054 switch (opts->x_aarch64_cmodel_var)
9056 case AARCH64_CMODEL_TINY:
9057 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9058 break;
9059 case AARCH64_CMODEL_SMALL:
9060 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9061 aarch64_cmodel = (flag_pic == 2
9062 ? AARCH64_CMODEL_SMALL_PIC
9063 : AARCH64_CMODEL_SMALL_SPIC);
9064 #else
9065 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9066 #endif
9067 break;
9068 case AARCH64_CMODEL_LARGE:
9069 sorry ("code model %qs with -f%s", "large",
9070 opts->x_flag_pic > 1 ? "PIC" : "pic");
9071 break;
9072 default:
9073 gcc_unreachable ();
9076 else
9077 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9080 /* Implement TARGET_OPTION_SAVE. */
9082 static void
9083 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9085 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9088 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9089 using the information saved in PTR. */
9091 static void
9092 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9094 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9095 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9096 opts->x_explicit_arch = ptr->x_explicit_arch;
9097 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9098 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9100 aarch64_override_options_internal (opts);
9103 /* Implement TARGET_OPTION_PRINT. */
9105 static void
9106 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9108 const struct processor *cpu
9109 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9110 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9111 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9112 std::string extension
9113 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9115 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9116 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9117 arch->name, extension.c_str ());
9120 static GTY(()) tree aarch64_previous_fndecl;
9122 void
9123 aarch64_reset_previous_fndecl (void)
9125 aarch64_previous_fndecl = NULL;
9128 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9129 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9130 make sure optab availability predicates are recomputed when necessary. */
9132 void
9133 aarch64_save_restore_target_globals (tree new_tree)
9135 if (TREE_TARGET_GLOBALS (new_tree))
9136 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9137 else if (new_tree == target_option_default_node)
9138 restore_target_globals (&default_target_globals);
9139 else
9140 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9143 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9144 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9145 of the function, if such exists. This function may be called multiple
9146 times on a single function so use aarch64_previous_fndecl to avoid
9147 setting up identical state. */
9149 static void
9150 aarch64_set_current_function (tree fndecl)
9152 if (!fndecl || fndecl == aarch64_previous_fndecl)
9153 return;
9155 tree old_tree = (aarch64_previous_fndecl
9156 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9157 : NULL_TREE);
9159 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9161 /* If current function has no attributes but the previous one did,
9162 use the default node. */
9163 if (!new_tree && old_tree)
9164 new_tree = target_option_default_node;
9166 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9167 the default have been handled by aarch64_save_restore_target_globals from
9168 aarch64_pragma_target_parse. */
9169 if (old_tree == new_tree)
9170 return;
9172 aarch64_previous_fndecl = fndecl;
9174 /* First set the target options. */
9175 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9177 aarch64_save_restore_target_globals (new_tree);
9180 /* Enum describing the various ways we can handle attributes.
9181 In many cases we can reuse the generic option handling machinery. */
9183 enum aarch64_attr_opt_type
9185 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9186 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9187 aarch64_attr_enum, /* Attribute sets an enum variable. */
9188 aarch64_attr_custom /* Attribute requires a custom handling function. */
9191 /* All the information needed to handle a target attribute.
9192 NAME is the name of the attribute.
9193 ATTR_TYPE specifies the type of behavior of the attribute as described
9194 in the definition of enum aarch64_attr_opt_type.
9195 ALLOW_NEG is true if the attribute supports a "no-" form.
9196 HANDLER is the function that takes the attribute string and whether
9197 it is a pragma or attribute and handles the option. It is needed only
9198 when the ATTR_TYPE is aarch64_attr_custom.
9199 OPT_NUM is the enum specifying the option that the attribute modifies.
9200 This is needed for attributes that mirror the behavior of a command-line
9201 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9202 aarch64_attr_enum. */
9204 struct aarch64_attribute_info
9206 const char *name;
9207 enum aarch64_attr_opt_type attr_type;
9208 bool allow_neg;
9209 bool (*handler) (const char *, const char *);
9210 enum opt_code opt_num;
9213 /* Handle the ARCH_STR argument to the arch= target attribute.
9214 PRAGMA_OR_ATTR is used in potential error messages. */
9216 static bool
9217 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9219 const struct processor *tmp_arch = NULL;
9220 enum aarch64_parse_opt_result parse_res
9221 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9223 if (parse_res == AARCH64_PARSE_OK)
9225 gcc_assert (tmp_arch);
9226 selected_arch = tmp_arch;
9227 explicit_arch = selected_arch->arch;
9228 return true;
9231 switch (parse_res)
9233 case AARCH64_PARSE_MISSING_ARG:
9234 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9235 break;
9236 case AARCH64_PARSE_INVALID_ARG:
9237 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9238 aarch64_print_hint_for_arch (str);
9239 break;
9240 case AARCH64_PARSE_INVALID_FEATURE:
9241 error ("invalid feature modifier %qs for 'arch' target %s",
9242 str, pragma_or_attr);
9243 break;
9244 default:
9245 gcc_unreachable ();
9248 return false;
9251 /* Handle the argument CPU_STR to the cpu= target attribute.
9252 PRAGMA_OR_ATTR is used in potential error messages. */
9254 static bool
9255 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9257 const struct processor *tmp_cpu = NULL;
9258 enum aarch64_parse_opt_result parse_res
9259 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9261 if (parse_res == AARCH64_PARSE_OK)
9263 gcc_assert (tmp_cpu);
9264 selected_tune = tmp_cpu;
9265 explicit_tune_core = selected_tune->ident;
9267 selected_arch = &all_architectures[tmp_cpu->arch];
9268 explicit_arch = selected_arch->arch;
9269 return true;
9272 switch (parse_res)
9274 case AARCH64_PARSE_MISSING_ARG:
9275 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9276 break;
9277 case AARCH64_PARSE_INVALID_ARG:
9278 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9279 aarch64_print_hint_for_core (str);
9280 break;
9281 case AARCH64_PARSE_INVALID_FEATURE:
9282 error ("invalid feature modifier %qs for 'cpu' target %s",
9283 str, pragma_or_attr);
9284 break;
9285 default:
9286 gcc_unreachable ();
9289 return false;
9292 /* Handle the argument STR to the tune= target attribute.
9293 PRAGMA_OR_ATTR is used in potential error messages. */
9295 static bool
9296 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9298 const struct processor *tmp_tune = NULL;
9299 enum aarch64_parse_opt_result parse_res
9300 = aarch64_parse_tune (str, &tmp_tune);
9302 if (parse_res == AARCH64_PARSE_OK)
9304 gcc_assert (tmp_tune);
9305 selected_tune = tmp_tune;
9306 explicit_tune_core = selected_tune->ident;
9307 return true;
9310 switch (parse_res)
9312 case AARCH64_PARSE_INVALID_ARG:
9313 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9314 aarch64_print_hint_for_core (str);
9315 break;
9316 default:
9317 gcc_unreachable ();
9320 return false;
9323 /* Parse an architecture extensions target attribute string specified in STR.
9324 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9325 if successful. Update aarch64_isa_flags to reflect the ISA features
9326 modified.
9327 PRAGMA_OR_ATTR is used in potential error messages. */
9329 static bool
9330 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9332 enum aarch64_parse_opt_result parse_res;
9333 unsigned long isa_flags = aarch64_isa_flags;
9335 /* We allow "+nothing" in the beginning to clear out all architectural
9336 features if the user wants to handpick specific features. */
9337 if (strncmp ("+nothing", str, 8) == 0)
9339 isa_flags = 0;
9340 str += 8;
9343 parse_res = aarch64_parse_extension (str, &isa_flags);
9345 if (parse_res == AARCH64_PARSE_OK)
9347 aarch64_isa_flags = isa_flags;
9348 return true;
9351 switch (parse_res)
9353 case AARCH64_PARSE_MISSING_ARG:
9354 error ("missing feature modifier in target %s %qs",
9355 pragma_or_attr, str);
9356 break;
9358 case AARCH64_PARSE_INVALID_FEATURE:
9359 error ("invalid feature modifier in target %s %qs",
9360 pragma_or_attr, str);
9361 break;
9363 default:
9364 gcc_unreachable ();
9367 return false;
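/* For example, __attribute__ ((target ("+nothing+fp"))) first clears every
   architectural feature bit via the "+nothing" prefix and then enables only
   what "+fp" implies; without "+nothing" the modifiers are applied on top of
   the current aarch64_isa_flags.  */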
9370 /* The target attributes that we support. On top of these we also support just
9371 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9372 handled explicitly in aarch64_process_one_target_attr. */
9374 static const struct aarch64_attribute_info aarch64_attributes[] =
9376 { "general-regs-only", aarch64_attr_mask, false, NULL,
9377 OPT_mgeneral_regs_only },
9378 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9379 OPT_mfix_cortex_a53_835769 },
9380 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9381 OPT_mfix_cortex_a53_843419 },
9382 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9383 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9384 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9385 OPT_momit_leaf_frame_pointer },
9386 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9387 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9388 OPT_march_ },
9389 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9390 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9391 OPT_mtune_ },
9392 { "sign-return-address", aarch64_attr_enum, false, NULL,
9393 OPT_msign_return_address_ },
9394 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9397 /* Parse ARG_STR which contains the definition of one target attribute.
9398 Show appropriate errors if any or return true if the attribute is valid.
9399 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9400 we're processing a target attribute or pragma. */
9402 static bool
9403 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9405 bool invert = false;
9407 size_t len = strlen (arg_str);
9409 if (len == 0)
9411 error ("malformed target %s", pragma_or_attr);
9412 return false;
9415 char *str_to_check = (char *) alloca (len + 1);
9416 strcpy (str_to_check, arg_str);
9418 /* Skip leading whitespace. */
9419 while (*str_to_check == ' ' || *str_to_check == '\t')
9420 str_to_check++;
9422 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9423 It is easier to detect and handle it explicitly here rather than going
9424 through the machinery for the rest of the target attributes in this
9425 function. */
9426 if (*str_to_check == '+')
9427 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9429 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9431 invert = true;
9432 str_to_check += 3;
9434 char *arg = strchr (str_to_check, '=');
9436 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9437 and point ARG to "foo". */
9438 if (arg)
9440 *arg = '\0';
9441 arg++;
9443 const struct aarch64_attribute_info *p_attr;
9444 bool found = false;
9445 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9447 /* If the names don't match up, or the user has given an argument
9448 to an attribute that doesn't accept one, or didn't give an argument
9449 to an attribute that expects one, fail to match. */
9450 if (strcmp (str_to_check, p_attr->name) != 0)
9451 continue;
9453 found = true;
9454 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9455 || p_attr->attr_type == aarch64_attr_enum;
9457 if (attr_need_arg_p ^ (arg != NULL))
9459 error ("target %s %qs does not accept an argument",
9460 pragma_or_attr, str_to_check);
9461 return false;
9464 /* If the name matches but the attribute does not allow "no-" versions
9465 then we can't match. */
9466 if (invert && !p_attr->allow_neg)
9468 error ("target %s %qs does not allow a negated form",
9469 pragma_or_attr, str_to_check);
9470 return false;
9473 switch (p_attr->attr_type)
9475 /* Has a custom handler registered.
9476 For example, cpu=, arch=, tune=. */
9477 case aarch64_attr_custom:
9478 gcc_assert (p_attr->handler);
9479 if (!p_attr->handler (arg, pragma_or_attr))
9480 return false;
9481 break;
9483 /* Either set or unset a boolean option. */
9484 case aarch64_attr_bool:
9486 struct cl_decoded_option decoded;
9488 generate_option (p_attr->opt_num, NULL, !invert,
9489 CL_TARGET, &decoded);
9490 aarch64_handle_option (&global_options, &global_options_set,
9491 &decoded, input_location);
9492 break;
9494 /* Set or unset a bit in the target_flags. aarch64_handle_option
9495 should know what mask to apply given the option number. */
9496 case aarch64_attr_mask:
9498 struct cl_decoded_option decoded;
9499 /* We only need to specify the option number.
9500 aarch64_handle_option will know which mask to apply. */
9501 decoded.opt_index = p_attr->opt_num;
9502 decoded.value = !invert;
9503 aarch64_handle_option (&global_options, &global_options_set,
9504 &decoded, input_location);
9505 break;
9507 /* Use the option setting machinery to set an option to an enum. */
9508 case aarch64_attr_enum:
9510 gcc_assert (arg);
9511 bool valid;
9512 int value;
9513 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9514 &value, CL_TARGET);
9515 if (valid)
9517 set_option (&global_options, NULL, p_attr->opt_num, value,
9518 NULL, DK_UNSPECIFIED, input_location,
9519 global_dc);
9521 else
9523 error ("target %s %s=%s is not valid",
9524 pragma_or_attr, str_to_check, arg);
9526 break;
9528 default:
9529 gcc_unreachable ();
9533 /* If we reached here we either have found an attribute and validated
9534 it or didn't match any. If we matched an attribute but its arguments
9535 were malformed we will have returned false already. */
9536 return found;
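/* Illustrative examples of the forms accepted here: "strict-align" sets a
   bit in target_flags, "no-omit-leaf-frame-pointer" is the negated form of a
   boolean option, "cmodel=small" goes through the enum machinery, and
   "arch=armv8-a" is dispatched to its custom handler.  */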
9539 /* Count how many times the character C appears in
9540 NULL-terminated string STR. */
9542 static unsigned int
9543 num_occurences_in_str (char c, char *str)
9545 unsigned int res = 0;
9546 while (*str != '\0')
9548 if (*str == c)
9549 res++;
9551 str++;
9554 return res;
9557 /* Parse the tree in ARGS that contains the target attribute information
9558 and update the global target options space. PRAGMA_OR_ATTR is a string
9559 to be used in error messages, specifying whether this is processing
9560 a target attribute or a target pragma. */
9562 bool
9563 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9565 if (TREE_CODE (args) == TREE_LIST)
9569 tree head = TREE_VALUE (args);
9570 if (head)
9572 if (!aarch64_process_target_attr (head, pragma_or_attr))
9573 return false;
9575 args = TREE_CHAIN (args);
9576 } while (args);
9578 return true;
9581 if (TREE_CODE (args) != STRING_CST)
9583 error ("attribute %<target%> argument not a string");
9584 return false;
9587 size_t len = strlen (TREE_STRING_POINTER (args));
9588 char *str_to_check = (char *) alloca (len + 1);
9589 strcpy (str_to_check, TREE_STRING_POINTER (args));
9591 if (len == 0)
9593 error ("malformed target %s value", pragma_or_attr);
9594 return false;
9597 /* Used to catch empty strings between commas, i.e.
9598 attribute ((target ("attr1,,attr2"))). */
9599 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9601 /* Handle multiple target attributes separated by ','. */
9602 char *token = strtok (str_to_check, ",");
9604 unsigned int num_attrs = 0;
9605 while (token)
9607 num_attrs++;
9608 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9610 error ("target %s %qs is invalid", pragma_or_attr, token);
9611 return false;
9614 token = strtok (NULL, ",");
9617 if (num_attrs != num_commas + 1)
9619 error ("malformed target %s list %qs",
9620 pragma_or_attr, TREE_STRING_POINTER (args));
9621 return false;
9624 return true;
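/* For example, __attribute__ ((target ("arch=armv8-a,strict-align"))) is
   split at the commas and each piece is passed to
   aarch64_process_one_target_attr; an empty piece, as in "attr1,,attr2", is
   caught by the comma-count check above.  */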
9627 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9628 process attribute ((target ("..."))). */
9630 static bool
9631 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9633 struct cl_target_option cur_target;
9634 bool ret;
9635 tree old_optimize;
9636 tree new_target, new_optimize;
9637 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9639 /* If what we're processing is the current pragma string then the
9640 target option node is already stored in target_option_current_node
9641 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9642 having to re-parse the string. This is especially useful to keep
9643 arm_neon.h compile times down since that header contains a lot
9644 of intrinsics enclosed in pragmas. */
9645 if (!existing_target && args == current_target_pragma)
9647 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9648 return true;
9650 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9652 old_optimize = build_optimization_node (&global_options);
9653 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9655 /* If the function changed the optimization levels as well as setting
9656 target options, start with the optimizations specified. */
9657 if (func_optimize && func_optimize != old_optimize)
9658 cl_optimization_restore (&global_options,
9659 TREE_OPTIMIZATION (func_optimize));
9661 /* Save the current target options to restore at the end. */
9662 cl_target_option_save (&cur_target, &global_options);
9664 /* If fndecl already has some target attributes applied to it, unpack
9665 them so that we add this attribute on top of them, rather than
9666 overwriting them. */
9667 if (existing_target)
9669 struct cl_target_option *existing_options
9670 = TREE_TARGET_OPTION (existing_target);
9672 if (existing_options)
9673 cl_target_option_restore (&global_options, existing_options);
9675 else
9676 cl_target_option_restore (&global_options,
9677 TREE_TARGET_OPTION (target_option_current_node));
9680 ret = aarch64_process_target_attr (args, "attribute");
9682 /* Set up any additional state. */
9683 if (ret)
9685 aarch64_override_options_internal (&global_options);
9686 /* Initialize SIMD builtins if we haven't already.
9687 Set current_target_pragma to NULL for the duration so that
9688 the builtin initialization code doesn't try to tag the functions
9689 being built with the attributes specified by any current pragma, thus
9690 going into an infinite recursion. */
9691 if (TARGET_SIMD)
9693 tree saved_current_target_pragma = current_target_pragma;
9694 current_target_pragma = NULL;
9695 aarch64_init_simd_builtins ();
9696 current_target_pragma = saved_current_target_pragma;
9698 new_target = build_target_option_node (&global_options);
9700 else
9701 new_target = NULL;
9703 new_optimize = build_optimization_node (&global_options);
9705 if (fndecl && ret)
9707 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9709 if (old_optimize != new_optimize)
9710 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9713 cl_target_option_restore (&global_options, &cur_target);
9715 if (old_optimize != new_optimize)
9716 cl_optimization_restore (&global_options,
9717 TREE_OPTIMIZATION (old_optimize));
9718 return ret;
9721 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9722 tri-bool options (yes, no, don't care) and the default value is
9723 DEF, determine whether to reject inlining. */
9725 static bool
9726 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9727 int dont_care, int def)
9729 /* If the callee doesn't care, always allow inlining. */
9730 if (callee == dont_care)
9731 return true;
9733 /* If the caller doesn't care, always allow inlining. */
9734 if (caller == dont_care)
9735 return true;
9737 /* Otherwise, allow inlining if either the callee and caller values
9738 agree, or if the callee is using the default value. */
9739 return (callee == caller || callee == def);
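/* For example, with DONT_CARE == 2 and DEF == 1 (the values used for the
   leaf-frame-pointer check below): a callee value of 2 always permits
   inlining, a callee value of 1 matches the default and is also accepted,
   while caller == 1 with callee == 0 disagrees on a non-default value and
   blocks inlining.  */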
9742 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9743 to inline CALLEE into CALLER based on target-specific info.
9744 Make sure that the caller and callee have compatible architectural
9745 features. Then go through the other possible target attributes
9746 and see if they can block inlining. Try not to reject always_inline
9747 callees unless they are incompatible architecturally. */
9749 static bool
9750 aarch64_can_inline_p (tree caller, tree callee)
9752 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9753 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9755 /* If callee has no option attributes, then it is ok to inline. */
9756 if (!callee_tree)
9757 return true;
9759 struct cl_target_option *caller_opts
9760 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9761 : target_option_default_node);
9763 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9766 /* Callee's ISA flags should be a subset of the caller's. */
9767 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9768 != callee_opts->x_aarch64_isa_flags)
9769 return false;
9771 /* Allow non-strict aligned functions to be inlined into strict
9772 aligned ones. */
9773 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9774 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9775 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9776 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9777 return false;
9779 bool always_inline = lookup_attribute ("always_inline",
9780 DECL_ATTRIBUTES (callee));
9782 /* If the architectural features match up and the callee is always_inline
9783 then the other attributes don't matter. */
9784 if (always_inline)
9785 return true;
9787 if (caller_opts->x_aarch64_cmodel_var
9788 != callee_opts->x_aarch64_cmodel_var)
9789 return false;
9791 if (caller_opts->x_aarch64_tls_dialect
9792 != callee_opts->x_aarch64_tls_dialect)
9793 return false;
9795 /* Honour explicit requests to workaround errata. */
9796 if (!aarch64_tribools_ok_for_inlining_p (
9797 caller_opts->x_aarch64_fix_a53_err835769,
9798 callee_opts->x_aarch64_fix_a53_err835769,
9799 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9800 return false;
9802 if (!aarch64_tribools_ok_for_inlining_p (
9803 caller_opts->x_aarch64_fix_a53_err843419,
9804 callee_opts->x_aarch64_fix_a53_err843419,
9805 2, TARGET_FIX_ERR_A53_843419))
9806 return false;
9808 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9809 caller and callee and they don't match up, reject inlining. */
9810 if (!aarch64_tribools_ok_for_inlining_p (
9811 caller_opts->x_flag_omit_leaf_frame_pointer,
9812 callee_opts->x_flag_omit_leaf_frame_pointer,
9813 2, 1))
9814 return false;
9816 /* If the callee has specific tuning overrides, respect them. */
9817 if (callee_opts->x_aarch64_override_tune_string != NULL
9818 && caller_opts->x_aarch64_override_tune_string == NULL)
9819 return false;
9821 /* If the user specified tuning override strings for the
9822 caller and callee and they don't match up, reject inlining.
9823 We just do a string compare here, we don't analyze the meaning
9824 of the string, as it would be too costly for little gain. */
9825 if (callee_opts->x_aarch64_override_tune_string
9826 && caller_opts->x_aarch64_override_tune_string
9827 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9828 caller_opts->x_aarch64_override_tune_string) != 0))
9829 return false;
9831 return true;
9834 /* Return true if SYMBOL_REF X binds locally. */
9836 static bool
9837 aarch64_symbol_binds_local_p (const_rtx x)
9839 return (SYMBOL_REF_DECL (x)
9840 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9841 : SYMBOL_REF_LOCAL_P (x));
9844 /* Return true if SYMBOL_REF X is thread-local.  */
9845 static bool
9846 aarch64_tls_symbol_p (rtx x)
9848 if (! TARGET_HAVE_TLS)
9849 return false;
9851 if (GET_CODE (x) != SYMBOL_REF)
9852 return false;
9854 return SYMBOL_REF_TLS_MODEL (x) != 0;
9857 /* Classify a TLS symbol into one of the TLS kinds. */
9858 enum aarch64_symbol_type
9859 aarch64_classify_tls_symbol (rtx x)
9861 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9863 switch (tls_kind)
9865 case TLS_MODEL_GLOBAL_DYNAMIC:
9866 case TLS_MODEL_LOCAL_DYNAMIC:
9867 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9869 case TLS_MODEL_INITIAL_EXEC:
9870 switch (aarch64_cmodel)
9872 case AARCH64_CMODEL_TINY:
9873 case AARCH64_CMODEL_TINY_PIC:
9874 return SYMBOL_TINY_TLSIE;
9875 default:
9876 return SYMBOL_SMALL_TLSIE;
9879 case TLS_MODEL_LOCAL_EXEC:
9880 if (aarch64_tls_size == 12)
9881 return SYMBOL_TLSLE12;
9882 else if (aarch64_tls_size == 24)
9883 return SYMBOL_TLSLE24;
9884 else if (aarch64_tls_size == 32)
9885 return SYMBOL_TLSLE32;
9886 else if (aarch64_tls_size == 48)
9887 return SYMBOL_TLSLE48;
9888 else
9889 gcc_unreachable ();
9891 case TLS_MODEL_EMULATED:
9892 case TLS_MODEL_NONE:
9893 return SYMBOL_FORCE_TO_MEM;
9895 default:
9896 gcc_unreachable ();
9900 /* Return the method that should be used to access SYMBOL_REF or
9901 LABEL_REF X. */
9903 enum aarch64_symbol_type
9904 aarch64_classify_symbol (rtx x, rtx offset)
9906 if (GET_CODE (x) == LABEL_REF)
9908 switch (aarch64_cmodel)
9910 case AARCH64_CMODEL_LARGE:
9911 return SYMBOL_FORCE_TO_MEM;
9913 case AARCH64_CMODEL_TINY_PIC:
9914 case AARCH64_CMODEL_TINY:
9915 return SYMBOL_TINY_ABSOLUTE;
9917 case AARCH64_CMODEL_SMALL_SPIC:
9918 case AARCH64_CMODEL_SMALL_PIC:
9919 case AARCH64_CMODEL_SMALL:
9920 return SYMBOL_SMALL_ABSOLUTE;
9922 default:
9923 gcc_unreachable ();
9927 if (GET_CODE (x) == SYMBOL_REF)
9929 if (aarch64_tls_symbol_p (x))
9930 return aarch64_classify_tls_symbol (x);
9932 switch (aarch64_cmodel)
9934 case AARCH64_CMODEL_TINY:
9935 /* When we retrieve symbol + offset address, we have to make sure
9936 the offset does not cause overflow of the final address. But
9937 we have no way of knowing the address of symbol at compile time
9938 so we can't accurately say if the distance between the PC and
9939 symbol + offset is outside the addressable range of +/-1M in the
9940 TINY code model. So we rely on images not being greater than
9941 1M and cap the offset at 1M and anything beyond 1M will have to
9942 be loaded using an alternative mechanism. Furthermore if the
9943 symbol is a weak reference to something that isn't known to
9944 resolve to a symbol in this module, then force to memory. */
9945 if ((SYMBOL_REF_WEAK (x)
9946 && !aarch64_symbol_binds_local_p (x))
9947 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9948 return SYMBOL_FORCE_TO_MEM;
9949 return SYMBOL_TINY_ABSOLUTE;
9951 case AARCH64_CMODEL_SMALL:
9952 /* Same reasoning as the tiny code model, but the offset cap here is
9953 4G. */
9954 if ((SYMBOL_REF_WEAK (x)
9955 && !aarch64_symbol_binds_local_p (x))
9956 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9957 HOST_WIDE_INT_C (4294967264)))
9958 return SYMBOL_FORCE_TO_MEM;
9959 return SYMBOL_SMALL_ABSOLUTE;
9961 case AARCH64_CMODEL_TINY_PIC:
9962 if (!aarch64_symbol_binds_local_p (x))
9963 return SYMBOL_TINY_GOT;
9964 return SYMBOL_TINY_ABSOLUTE;
9966 case AARCH64_CMODEL_SMALL_SPIC:
9967 case AARCH64_CMODEL_SMALL_PIC:
9968 if (!aarch64_symbol_binds_local_p (x))
9969 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9970 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9971 return SYMBOL_SMALL_ABSOLUTE;
9973 case AARCH64_CMODEL_LARGE:
9974 /* This is alright even in PIC code as the constant
9975 pool reference is always PC relative and within
9976 the same translation unit. */
9977 if (CONSTANT_POOL_ADDRESS_P (x))
9978 return SYMBOL_SMALL_ABSOLUTE;
9979 else
9980 return SYMBOL_FORCE_TO_MEM;
9982 default:
9983 gcc_unreachable ();
9987 /* By default push everything into the constant pool. */
9988 return SYMBOL_FORCE_TO_MEM;
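/* As a rough illustration of the offset caps above (a sketch only; the
   declarations and numbers are illustrative, not taken from this file),
   consider compiling with -mcmodel=tiny:

     extern int table[];
     int *p = &table[100];     // offset 400 bytes, within +/-1M
                               //   -> SYMBOL_TINY_ABSOLUTE (single ADR)
     int *q = &table[600000];  // offset 2400000 bytes, beyond the 1M cap
                               //   -> SYMBOL_FORCE_TO_MEM (address loaded
                               //      from a literal pool entry)

   Under -mcmodel=small the same reasoning applies, with the ~4G cap
   checked by the IN_RANGE test above.  */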
9991 bool
9992 aarch64_constant_address_p (rtx x)
9994 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9997 bool
9998 aarch64_legitimate_pic_operand_p (rtx x)
10000 if (GET_CODE (x) == SYMBOL_REF
10001 || (GET_CODE (x) == CONST
10002 && GET_CODE (XEXP (x, 0)) == PLUS
10003 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10004 return false;
10006 return true;
10009 /* Return true if X is floating-point +0.0, or an SFmode/DFmode
10010 constant representable as a quarter-precision (FMOV) immediate.  */
10011 static bool
10012 aarch64_valid_floating_const (machine_mode mode, rtx x)
10014 if (!CONST_DOUBLE_P (x))
10015 return false;
10017 if (aarch64_float_const_zero_rtx_p (x))
10018 return true;
10020 /* Apart from 0.0, handled above, only SFmode and DFmode constants are supported.  */
10021 if (!(mode == SFmode || mode == DFmode))
10022 return false;
10024 return aarch64_float_const_representable_p (x);
10027 static bool
10028 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10030 /* Do not allow vector struct mode constants. We could support
10031 0 and -1 easily, but they need support in aarch64-simd.md. */
10032 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10033 return false;
10035 /* This could probably go away because
10036 we now decompose CONST_INTs according to expand_mov_immediate. */
10037 if ((GET_CODE (x) == CONST_VECTOR
10038 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10039 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
10040 return !targetm.cannot_force_const_mem (mode, x);
10042 if (GET_CODE (x) == HIGH
10043 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10044 return true;
10046 return aarch64_constant_address_p (x);
10050 aarch64_load_tp (rtx target)
10052 if (!target
10053 || GET_MODE (target) != Pmode
10054 || !register_operand (target, Pmode))
10055 target = gen_reg_rtx (Pmode);
10057 /* Can return in any reg. */
10058 emit_insn (gen_aarch64_load_tp_hard (target));
10059 return target;
10062 /* On AAPCS systems, this is the "struct __va_list". */
10063 static GTY(()) tree va_list_type;
10065 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10066 Return the type to use as __builtin_va_list.
10068 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10070 struct __va_list
10072 void *__stack;
10073 void *__gr_top;
10074 void *__vr_top;
10075 int __gr_offs;
10076 int __vr_offs;
10077 }; */
10079 static tree
10080 aarch64_build_builtin_va_list (void)
10082 tree va_list_name;
10083 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10085 /* Create the type. */
10086 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10087 /* Give it the required name. */
10088 va_list_name = build_decl (BUILTINS_LOCATION,
10089 TYPE_DECL,
10090 get_identifier ("__va_list"),
10091 va_list_type);
10092 DECL_ARTIFICIAL (va_list_name) = 1;
10093 TYPE_NAME (va_list_type) = va_list_name;
10094 TYPE_STUB_DECL (va_list_type) = va_list_name;
10096 /* Create the fields. */
10097 f_stack = build_decl (BUILTINS_LOCATION,
10098 FIELD_DECL, get_identifier ("__stack"),
10099 ptr_type_node);
10100 f_grtop = build_decl (BUILTINS_LOCATION,
10101 FIELD_DECL, get_identifier ("__gr_top"),
10102 ptr_type_node);
10103 f_vrtop = build_decl (BUILTINS_LOCATION,
10104 FIELD_DECL, get_identifier ("__vr_top"),
10105 ptr_type_node);
10106 f_groff = build_decl (BUILTINS_LOCATION,
10107 FIELD_DECL, get_identifier ("__gr_offs"),
10108 integer_type_node);
10109 f_vroff = build_decl (BUILTINS_LOCATION,
10110 FIELD_DECL, get_identifier ("__vr_offs"),
10111 integer_type_node);
10113 /* Tell tree-stdarg pass about our internal offset fields.
10114 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10115 purposes, to identify whether the code is updating the va_list internal
10116 offset fields in an irregular way.  */
10117 va_list_gpr_counter_field = f_groff;
10118 va_list_fpr_counter_field = f_vroff;
10120 DECL_ARTIFICIAL (f_stack) = 1;
10121 DECL_ARTIFICIAL (f_grtop) = 1;
10122 DECL_ARTIFICIAL (f_vrtop) = 1;
10123 DECL_ARTIFICIAL (f_groff) = 1;
10124 DECL_ARTIFICIAL (f_vroff) = 1;
10126 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10127 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10128 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10129 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10130 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10132 TYPE_FIELDS (va_list_type) = f_stack;
10133 DECL_CHAIN (f_stack) = f_grtop;
10134 DECL_CHAIN (f_grtop) = f_vrtop;
10135 DECL_CHAIN (f_vrtop) = f_groff;
10136 DECL_CHAIN (f_groff) = f_vroff;
10138 /* Compute its layout. */
10139 layout_type (va_list_type);
10141 return va_list_type;
10144 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10145 static void
10146 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10148 const CUMULATIVE_ARGS *cum;
10149 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10150 tree stack, grtop, vrtop, groff, vroff;
10151 tree t;
10152 int gr_save_area_size = cfun->va_list_gpr_size;
10153 int vr_save_area_size = cfun->va_list_fpr_size;
10154 int vr_offset;
10156 cum = &crtl->args.info;
10157 if (cfun->va_list_gpr_size)
10158 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10159 cfun->va_list_gpr_size);
10160 if (cfun->va_list_fpr_size)
10161 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10162 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10164 if (!TARGET_FLOAT)
10166 gcc_assert (cum->aapcs_nvrn == 0);
10167 vr_save_area_size = 0;
10170 f_stack = TYPE_FIELDS (va_list_type_node);
10171 f_grtop = DECL_CHAIN (f_stack);
10172 f_vrtop = DECL_CHAIN (f_grtop);
10173 f_groff = DECL_CHAIN (f_vrtop);
10174 f_vroff = DECL_CHAIN (f_groff);
10176 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10177 NULL_TREE);
10178 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10179 NULL_TREE);
10180 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10181 NULL_TREE);
10182 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10183 NULL_TREE);
10184 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10185 NULL_TREE);
10187 /* Emit code to initialize STACK, which points to the next varargs stack
10188 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10189 by named arguments. STACK is 8-byte aligned. */
10190 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10191 if (cum->aapcs_stack_size > 0)
10192 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10193 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10194 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10196 /* Emit code to initialize GRTOP, the top of the GR save area.
10197 virtual_incoming_args_rtx should have been 16 byte aligned. */
10198 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10199 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10200 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10202 /* Emit code to initialize VRTOP, the top of the VR save area.
10203 This address is gr_save_area_bytes below GRTOP, rounded
10204 down to the next 16-byte boundary. */
10205 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10206 vr_offset = ROUND_UP (gr_save_area_size,
10207 STACK_BOUNDARY / BITS_PER_UNIT);
10209 if (vr_offset)
10210 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10211 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10212 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10214 /* Emit code to initialize GROFF, the offset from GRTOP of the
10215 next GPR argument. */
10216 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10217 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10218 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10220 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10221 of the next VR argument.  */
10222 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10223 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10224 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
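/* Worked example (a sketch only, assuming the usual AAPCS64 parameters of
   8 GP argument registers x0-x7 and 8 FP/SIMD argument registers v0-v7,
   UNITS_PER_WORD == 8, UNITS_PER_VREG == 16, and ignoring any trimming of
   the save areas by the tree-stdarg pass):

     void f (int a, ...);   // one named GP argument, no named FP arguments

   aarch64_setup_incoming_varargs (below) saves x1-x7 (56 bytes) and
   q0-q7 (128 bytes) just below the incoming argument pointer, so the
   MODIFY_EXPRs built above initialize

     __stack   = incoming argument pointer (first stack-passed vararg)
     __gr_top  = incoming argument pointer
     __vr_top  = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128.  */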
10227 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10229 static tree
10230 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10231 gimple_seq *post_p ATTRIBUTE_UNUSED)
10233 tree addr;
10234 bool indirect_p;
10235 bool is_ha; /* is HFA or HVA. */
10236 bool dw_align; /* double-word align. */
10237 machine_mode ag_mode = VOIDmode;
10238 int nregs;
10239 machine_mode mode;
10241 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10242 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10243 HOST_WIDE_INT size, rsize, adjust, align;
10244 tree t, u, cond1, cond2;
10246 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10247 if (indirect_p)
10248 type = build_pointer_type (type);
10250 mode = TYPE_MODE (type);
10252 f_stack = TYPE_FIELDS (va_list_type_node);
10253 f_grtop = DECL_CHAIN (f_stack);
10254 f_vrtop = DECL_CHAIN (f_grtop);
10255 f_groff = DECL_CHAIN (f_vrtop);
10256 f_vroff = DECL_CHAIN (f_groff);
10258 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10259 f_stack, NULL_TREE);
10260 size = int_size_in_bytes (type);
10261 struct aarch64_fn_arg_alignment aa
10262 = aarch64_function_arg_alignment (mode, type);
10263 align = aa.alignment / BITS_PER_UNIT;
10265 dw_align = false;
10266 adjust = 0;
10267 if (aarch64_vfp_is_call_or_return_candidate (mode,
10268 type,
10269 &ag_mode,
10270 &nregs,
10271 &is_ha))
10273 /* TYPE passed in fp/simd registers. */
10274 if (!TARGET_FLOAT)
10275 aarch64_err_no_fpadvsimd (mode, "varargs");
10277 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10278 unshare_expr (valist), f_vrtop, NULL_TREE);
10279 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10280 unshare_expr (valist), f_vroff, NULL_TREE);
10282 rsize = nregs * UNITS_PER_VREG;
10284 if (is_ha)
10286 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10287 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10289 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10290 && size < UNITS_PER_VREG)
10292 adjust = UNITS_PER_VREG - size;
10295 else
10297 /* TYPE passed in general registers. */
10298 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10299 unshare_expr (valist), f_grtop, NULL_TREE);
10300 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10301 unshare_expr (valist), f_groff, NULL_TREE);
10302 rsize = ROUND_UP (size, UNITS_PER_WORD);
10303 nregs = rsize / UNITS_PER_WORD;
10305 if (align > 8)
10306 dw_align = true;
10308 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10309 && size < UNITS_PER_WORD)
10311 adjust = UNITS_PER_WORD - size;
10315 /* Get a local temporary for the field value. */
10316 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10318 /* Emit code to branch if off >= 0. */
10319 t = build2 (GE_EXPR, boolean_type_node, off,
10320 build_int_cst (TREE_TYPE (off), 0));
10321 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10323 if (dw_align)
10325 /* Emit: offs = (offs + 15) & -16. */
10326 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10327 build_int_cst (TREE_TYPE (off), 15));
10328 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10329 build_int_cst (TREE_TYPE (off), -16));
10330 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10332 else
10333 roundup = NULL;
10335 /* Update ap.__[g|v]r_offs */
10336 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10337 build_int_cst (TREE_TYPE (off), rsize));
10338 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10340 /* String up. */
10341 if (roundup)
10342 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10344 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10345 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10346 build_int_cst (TREE_TYPE (f_off), 0));
10347 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10349 /* String up: make sure the assignment happens before the use. */
10350 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10351 COND_EXPR_ELSE (cond1) = t;
10353 /* Prepare the trees handling the argument that is passed on the stack;
10354 the top level node will store in ON_STACK. */
10355 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10356 if (align > 8)
10358 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10359 t = fold_convert (intDI_type_node, arg);
10360 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10361 build_int_cst (TREE_TYPE (t), 15));
10362 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10363 build_int_cst (TREE_TYPE (t), -16));
10364 t = fold_convert (TREE_TYPE (arg), t);
10365 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10367 else
10368 roundup = NULL;
10369 /* Advance ap.__stack */
10370 t = fold_convert (intDI_type_node, arg);
10371 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10372 build_int_cst (TREE_TYPE (t), size + 7));
10373 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10374 build_int_cst (TREE_TYPE (t), -8));
10375 t = fold_convert (TREE_TYPE (arg), t);
10376 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10377 /* String up roundup and advance. */
10378 if (roundup)
10379 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10380 /* String up with arg */
10381 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10382 /* Big-endianness related address adjustment. */
10383 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10384 && size < UNITS_PER_WORD)
10386 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10387 size_int (UNITS_PER_WORD - size));
10388 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10391 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10392 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10394 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10395 t = off;
10396 if (adjust)
10397 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10398 build_int_cst (TREE_TYPE (off), adjust));
10400 t = fold_convert (sizetype, t);
10401 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10403 if (is_ha)
10405 /* type ha; // treat as "struct {ftype field[n];}"
10406 ... [computing offs]
10407 for (i = 0; i <nregs; ++i, offs += 16)
10408 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10409 return ha; */
10410 int i;
10411 tree tmp_ha, field_t, field_ptr_t;
10413 /* Declare a local variable. */
10414 tmp_ha = create_tmp_var_raw (type, "ha");
10415 gimple_add_tmp_var (tmp_ha);
10417 /* Establish the base type. */
10418 switch (ag_mode)
10420 case SFmode:
10421 field_t = float_type_node;
10422 field_ptr_t = float_ptr_type_node;
10423 break;
10424 case DFmode:
10425 field_t = double_type_node;
10426 field_ptr_t = double_ptr_type_node;
10427 break;
10428 case TFmode:
10429 field_t = long_double_type_node;
10430 field_ptr_t = long_double_ptr_type_node;
10431 break;
10432 case HFmode:
10433 field_t = aarch64_fp16_type_node;
10434 field_ptr_t = aarch64_fp16_ptr_type_node;
10435 break;
10436 case V2SImode:
10437 case V4SImode:
10439 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10440 field_t = build_vector_type_for_mode (innertype, ag_mode);
10441 field_ptr_t = build_pointer_type (field_t);
10443 break;
10444 default:
10445 gcc_assert (0);
10448 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area).  */
10449 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10450 addr = t;
10451 t = fold_convert (field_ptr_t, addr);
10452 t = build2 (MODIFY_EXPR, field_t,
10453 build1 (INDIRECT_REF, field_t, tmp_ha),
10454 build1 (INDIRECT_REF, field_t, t));
10456 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10457 for (i = 1; i < nregs; ++i)
10459 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10460 u = fold_convert (field_ptr_t, addr);
10461 u = build2 (MODIFY_EXPR, field_t,
10462 build2 (MEM_REF, field_t, tmp_ha,
10463 build_int_cst (field_ptr_t,
10464 (i *
10465 int_size_in_bytes (field_t)))),
10466 build1 (INDIRECT_REF, field_t, u));
10467 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10470 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10471 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10474 COND_EXPR_ELSE (cond2) = t;
10475 addr = fold_convert (build_pointer_type (type), cond1);
10476 addr = build_va_arg_indirect_ref (addr);
10478 if (indirect_p)
10479 addr = build_va_arg_indirect_ref (addr);
10481 return addr;
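/* A minimal C-level sketch of the gimplified sequence above for a
   general-register argument with no special alignment (illustrative only:
   the big-endian adjustment, the HFA path and the pass-by-reference case
   are omitted, and the field names follow the __va_list layout documented
   earlier in this file):

     void *
     va_arg_gr (struct __va_list *ap, int rsize)   // rsize = size rounded up to 8
     {
       int off = ap->__gr_offs;
       if (off >= 0)                    // GP save area already exhausted
         goto on_stack;
       ap->__gr_offs = off + rsize;
       if (ap->__gr_offs > 0)           // would run past the save area
         goto on_stack;
       return (char *) ap->__gr_top + off;

     on_stack:
       {
         void *addr = ap->__stack;
         ap->__stack = (char *) ap->__stack + rsize;
         return addr;
       }
     }
*/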
10484 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10486 static void
10487 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10488 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10489 int no_rtl)
10491 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10492 CUMULATIVE_ARGS local_cum;
10493 int gr_saved = cfun->va_list_gpr_size;
10494 int vr_saved = cfun->va_list_fpr_size;
10496 /* The caller has advanced CUM up to, but not beyond, the last named
10497 argument. Advance a local copy of CUM past the last "real" named
10498 argument, to find out how many registers are left over. */
10499 local_cum = *cum;
10500 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
10502 /* Find out how many registers we need to save.
10503 Honor the tree-stdarg analysis results.  */
10504 if (cfun->va_list_gpr_size)
10505 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10506 cfun->va_list_gpr_size / UNITS_PER_WORD);
10507 if (cfun->va_list_fpr_size)
10508 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10509 cfun->va_list_fpr_size / UNITS_PER_VREG);
10511 if (!TARGET_FLOAT)
10513 gcc_assert (local_cum.aapcs_nvrn == 0);
10514 vr_saved = 0;
10517 if (!no_rtl)
10519 if (gr_saved > 0)
10521 rtx ptr, mem;
10523 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10524 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10525 - gr_saved * UNITS_PER_WORD);
10526 mem = gen_frame_mem (BLKmode, ptr);
10527 set_mem_alias_set (mem, get_varargs_alias_set ());
10529 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10530 mem, gr_saved);
10532 if (vr_saved > 0)
10534 /* We can't use move_block_from_reg, because it will use
10535 the wrong mode, storing D regs only. */
10536 machine_mode mode = TImode;
10537 int off, i, vr_start;
10539 /* Set OFF to the offset from virtual_incoming_args_rtx of
10540 the first vector register. The VR save area lies below
10541 the GR one, and is aligned to 16 bytes. */
10542 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10543 STACK_BOUNDARY / BITS_PER_UNIT);
10544 off -= vr_saved * UNITS_PER_VREG;
10546 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10547 for (i = 0; i < vr_saved; ++i)
10549 rtx ptr, mem;
10551 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10552 mem = gen_frame_mem (mode, ptr);
10553 set_mem_alias_set (mem, get_varargs_alias_set ());
10554 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10555 off += UNITS_PER_VREG;
10560 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10561 any complication of having crtl->args.pretend_args_size changed. */
10562 cfun->machine->frame.saved_varargs_size
10563 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10564 STACK_BOUNDARY / BITS_PER_UNIT)
10565 + vr_saved * UNITS_PER_VREG);
10568 static void
10569 aarch64_conditional_register_usage (void)
10571 int i;
10572 if (!TARGET_FLOAT)
10574 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10576 fixed_regs[i] = 1;
10577 call_used_regs[i] = 1;
10582 /* Walk down the type tree of TYPE counting consecutive base elements.
10583 If *MODEP is VOIDmode, then set it to the first valid floating point
10584 type. If a non-floating point type is found, or if a floating point
10585 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10586 otherwise return the count in the sub-tree. */
10587 static int
10588 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10590 machine_mode mode;
10591 HOST_WIDE_INT size;
10593 switch (TREE_CODE (type))
10595 case REAL_TYPE:
10596 mode = TYPE_MODE (type);
10597 if (mode != DFmode && mode != SFmode
10598 && mode != TFmode && mode != HFmode)
10599 return -1;
10601 if (*modep == VOIDmode)
10602 *modep = mode;
10604 if (*modep == mode)
10605 return 1;
10607 break;
10609 case COMPLEX_TYPE:
10610 mode = TYPE_MODE (TREE_TYPE (type));
10611 if (mode != DFmode && mode != SFmode
10612 && mode != TFmode && mode != HFmode)
10613 return -1;
10615 if (*modep == VOIDmode)
10616 *modep = mode;
10618 if (*modep == mode)
10619 return 2;
10621 break;
10623 case VECTOR_TYPE:
10624 /* Use V2SImode and V4SImode as representatives of all 64-bit
10625 and 128-bit vector types. */
10626 size = int_size_in_bytes (type);
10627 switch (size)
10629 case 8:
10630 mode = V2SImode;
10631 break;
10632 case 16:
10633 mode = V4SImode;
10634 break;
10635 default:
10636 return -1;
10639 if (*modep == VOIDmode)
10640 *modep = mode;
10642 /* Vector modes are considered to be opaque: two vectors are
10643 equivalent for the purposes of being homogeneous aggregates
10644 if they are the same size. */
10645 if (*modep == mode)
10646 return 1;
10648 break;
10650 case ARRAY_TYPE:
10652 int count;
10653 tree index = TYPE_DOMAIN (type);
10655 /* Can't handle incomplete types nor sizes that are not
10656 fixed. */
10657 if (!COMPLETE_TYPE_P (type)
10658 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10659 return -1;
10661 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10662 if (count == -1
10663 || !index
10664 || !TYPE_MAX_VALUE (index)
10665 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10666 || !TYPE_MIN_VALUE (index)
10667 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10668 || count < 0)
10669 return -1;
10671 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10672 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10674 /* There must be no padding. */
10675 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10676 return -1;
10678 return count;
10681 case RECORD_TYPE:
10683 int count = 0;
10684 int sub_count;
10685 tree field;
10687 /* Can't handle incomplete types nor sizes that are not
10688 fixed. */
10689 if (!COMPLETE_TYPE_P (type)
10690 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10691 return -1;
10693 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10695 if (TREE_CODE (field) != FIELD_DECL)
10696 continue;
10698 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10699 if (sub_count < 0)
10700 return -1;
10701 count += sub_count;
10704 /* There must be no padding. */
10705 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10706 return -1;
10708 return count;
10711 case UNION_TYPE:
10712 case QUAL_UNION_TYPE:
10714 /* These aren't very interesting except in a degenerate case. */
10715 int count = 0;
10716 int sub_count;
10717 tree field;
10719 /* Can't handle incomplete types nor sizes that are not
10720 fixed. */
10721 if (!COMPLETE_TYPE_P (type)
10722 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10723 return -1;
10725 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10727 if (TREE_CODE (field) != FIELD_DECL)
10728 continue;
10730 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10731 if (sub_count < 0)
10732 return -1;
10733 count = count > sub_count ? count : sub_count;
10736 /* There must be no padding. */
10737 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10738 return -1;
10740 return count;
10743 default:
10744 break;
10747 return -1;
10750 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10751 type as described in AAPCS64 \S 4.1.2.
10753 See the comment above aarch64_composite_type_p for the notes on MODE. */
10755 static bool
10756 aarch64_short_vector_p (const_tree type,
10757 machine_mode mode)
10759 HOST_WIDE_INT size = -1;
10761 if (type && TREE_CODE (type) == VECTOR_TYPE)
10762 size = int_size_in_bytes (type);
10763 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10764 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10765 size = GET_MODE_SIZE (mode);
10767 return (size == 8 || size == 16);
10770 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10771 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10772 array types. The C99 floating-point complex types are also considered
10773 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10774 types, which are GCC extensions and out of the scope of AAPCS64, are
10775 treated as composite types here as well.
10777 Note that MODE itself is not sufficient in determining whether a type
10778 is such a composite type or not. This is because
10779 stor-layout.c:compute_record_mode may have already changed the MODE
10780 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10781 structure with only one field may have its MODE set to the mode of the
10782 field. Also an integer mode whose size matches the size of the
10783 RECORD_TYPE type may be used to substitute the original mode
10784 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10785 solely relied on. */
10787 static bool
10788 aarch64_composite_type_p (const_tree type,
10789 machine_mode mode)
10791 if (aarch64_short_vector_p (type, mode))
10792 return false;
10794 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10795 return true;
10797 if (mode == BLKmode
10798 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10799 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10800 return true;
10802 return false;
10805 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10806 shall be passed or returned in simd/fp register(s) (providing these
10807 parameter passing registers are available).
10809 Upon successful return, *COUNT returns the number of needed registers,
10810 *BASE_MODE returns the mode of the individual register and, when IS_HA
10811 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10812 floating-point aggregate or a homogeneous short-vector aggregate. */
10814 static bool
10815 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10816 const_tree type,
10817 machine_mode *base_mode,
10818 int *count,
10819 bool *is_ha)
10821 machine_mode new_mode = VOIDmode;
10822 bool composite_p = aarch64_composite_type_p (type, mode);
10824 if (is_ha != NULL) *is_ha = false;
10826 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10827 || aarch64_short_vector_p (type, mode))
10829 *count = 1;
10830 new_mode = mode;
10832 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10834 if (is_ha != NULL) *is_ha = true;
10835 *count = 2;
10836 new_mode = GET_MODE_INNER (mode);
10838 else if (type && composite_p)
10840 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10842 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10844 if (is_ha != NULL) *is_ha = true;
10845 *count = ag_count;
10847 else
10848 return false;
10850 else
10851 return false;
10853 *base_mode = new_mode;
10854 return true;
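/* Examples (illustrative only, assuming HA_MAX_NUM_FLDS == 4, the AAPCS64
   limit on the number of members of a homogeneous aggregate):

     float                          -> *count = 1, *base_mode = SFmode
     _Complex double                -> *count = 2, *base_mode = DFmode, *is_ha = true
     struct { float x, y, z; }      -> *count = 3, *base_mode = SFmode, *is_ha = true
     struct { double d[4]; }        -> *count = 4, *base_mode = DFmode, *is_ha = true
     struct { double d[5]; }        -> more than 4 members, returns false
     struct { float f; double d; }  -> mixed base types, returns false.  */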
10857 /* Implement TARGET_STRUCT_VALUE_RTX. */
10859 static rtx
10860 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10861 int incoming ATTRIBUTE_UNUSED)
10863 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10866 /* Implements target hook vector_mode_supported_p. */
10867 static bool
10868 aarch64_vector_mode_supported_p (machine_mode mode)
10870 if (TARGET_SIMD
10871 && (mode == V4SImode || mode == V8HImode
10872 || mode == V16QImode || mode == V2DImode
10873 || mode == V2SImode || mode == V4HImode
10874 || mode == V8QImode || mode == V2SFmode
10875 || mode == V4SFmode || mode == V2DFmode
10876 || mode == V4HFmode || mode == V8HFmode
10877 || mode == V1DFmode))
10878 return true;
10880 return false;
10883 /* Return appropriate SIMD container
10884 for MODE within a vector of WIDTH bits. */
10885 static machine_mode
10886 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10888 gcc_assert (width == 64 || width == 128);
10889 if (TARGET_SIMD)
10891 if (width == 128)
10892 switch (mode)
10894 case DFmode:
10895 return V2DFmode;
10896 case SFmode:
10897 return V4SFmode;
10898 case HFmode:
10899 return V8HFmode;
10900 case SImode:
10901 return V4SImode;
10902 case HImode:
10903 return V8HImode;
10904 case QImode:
10905 return V16QImode;
10906 case DImode:
10907 return V2DImode;
10908 default:
10909 break;
10911 else
10912 switch (mode)
10914 case SFmode:
10915 return V2SFmode;
10916 case HFmode:
10917 return V4HFmode;
10918 case SImode:
10919 return V2SImode;
10920 case HImode:
10921 return V4HImode;
10922 case QImode:
10923 return V8QImode;
10924 default:
10925 break;
10928 return word_mode;
10931 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10932 static machine_mode
10933 aarch64_preferred_simd_mode (machine_mode mode)
10935 return aarch64_simd_container_mode (mode, 128);
10938 /* Return the bitmask of possible vector sizes for the vectorizer
10939 to iterate over. */
10940 static unsigned int
10941 aarch64_autovectorize_vector_sizes (void)
10943 return (16 | 8);
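/* A few illustrative mappings produced by aarch64_simd_container_mode
   (not an exhaustive list):

     element mode   128-bit container   64-bit container
     QImode         V16QImode           V8QImode
     HImode         V8HImode            V4HImode
     SImode         V4SImode            V2SImode
     DImode         V2DImode            word_mode (no 64-bit container)
     SFmode         V4SFmode            V2SFmode

   aarch64_preferred_simd_mode always picks the 128-bit container, and the
   (16 | 8) bitmask above tells the vectorizer to try 128-bit vectors first
   and then fall back to 64-bit ones.  */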
10946 /* Implement TARGET_MANGLE_TYPE. */
10948 static const char *
10949 aarch64_mangle_type (const_tree type)
10951 /* The AArch64 ABI documents say that "__va_list" has to be
10952 mangled as if it were in the "std" namespace.  */
10953 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10954 return "St9__va_list";
10956 /* Half-precision float. */
10957 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10958 return "Dh";
10960 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10961 builtin types. */
10962 if (TYPE_NAME (type) != NULL)
10963 return aarch64_mangle_builtin_type (type);
10965 /* Use the default mangling. */
10966 return NULL;
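/* For illustration (assuming the standard Itanium C++ ABI mangling of the
   surrounding signature; the function names are hypothetical):

     void f (__fp16);             // mangled as _Z1fDh
     void g (__builtin_va_list);  // mangled as _Z1gSt9__va_list
     void h (int8x8_t);           // builtin vector type, handled by
                                  // aarch64_mangle_builtin_type

   Ordinary types fall through to the default mangling (NULL return).  */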
10969 /* Find the first rtx_insn before insn that will generate an assembly
10970 instruction. */
10972 static rtx_insn *
10973 aarch64_prev_real_insn (rtx_insn *insn)
10975 if (!insn)
10976 return NULL;
10980 insn = prev_real_insn (insn);
10982 while (insn && recog_memoized (insn) < 0);
10984 return insn;
10987 static bool
10988 is_madd_op (enum attr_type t1)
10990 unsigned int i;
10991 /* A number of these may be AArch32 only. */
10992 enum attr_type mlatypes[] = {
10993 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10994 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10995 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10998 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11000 if (t1 == mlatypes[i])
11001 return true;
11004 return false;
11007 /* Check if there is a register dependency between a load and the insn
11008 for which we hold recog_data. */
11010 static bool
11011 dep_between_memop_and_curr (rtx memop)
11013 rtx load_reg;
11014 int opno;
11016 gcc_assert (GET_CODE (memop) == SET);
11018 if (!REG_P (SET_DEST (memop)))
11019 return false;
11021 load_reg = SET_DEST (memop);
11022 for (opno = 1; opno < recog_data.n_operands; opno++)
11024 rtx operand = recog_data.operand[opno];
11025 if (REG_P (operand)
11026 && reg_overlap_mentioned_p (load_reg, operand))
11027 return true;
11030 return false;
11034 /* When working around the Cortex-A53 erratum 835769,
11035 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11036 instruction and has a preceding memory instruction such that a NOP
11037 should be inserted between them. */
11039 bool
11040 aarch64_madd_needs_nop (rtx_insn* insn)
11042 enum attr_type attr_type;
11043 rtx_insn *prev;
11044 rtx body;
11046 if (!TARGET_FIX_ERR_A53_835769)
11047 return false;
11049 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11050 return false;
11052 attr_type = get_attr_type (insn);
11053 if (!is_madd_op (attr_type))
11054 return false;
11056 prev = aarch64_prev_real_insn (insn);
11057 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11058 Restore recog state to INSN to avoid state corruption. */
11059 extract_constrain_insn_cached (insn);
11061 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11062 return false;
11064 body = single_set (prev);
11066 /* If the previous insn is a memory op and there is no dependency between
11067 it and the DImode madd, emit a NOP between them. If body is NULL then we
11068 have a complex memory operation, probably a load/store pair.
11069 Be conservative for now and emit a NOP. */
11070 if (GET_MODE (recog_data.operand[0]) == DImode
11071 && (!body || !dep_between_memop_and_curr (body)))
11072 return true;
11074 return false;
11079 /* Implement FINAL_PRESCAN_INSN. */
11081 void
11082 aarch64_final_prescan_insn (rtx_insn *insn)
11084 if (aarch64_madd_needs_nop (insn))
11085 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
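/* For example, when compiling with -mfix-cortex-a53-835769, a sequence
   such as (register choices are illustrative only):

       ldr   x1, [x2]
       madd  x0, x3, x4, x5

   is emitted as

       ldr   x1, [x2]
       nop   // between mem op and mult-accumulate
       madd  x0, x3, x4, x5

   because the 64-bit multiply-accumulate directly follows a memory
   operation that it does not depend on.  */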
11089 /* Return the equivalent letter for size. */
11090 static char
11091 sizetochar (int size)
11093 switch (size)
11095 case 64: return 'd';
11096 case 32: return 's';
11097 case 16: return 'h';
11098 case 8 : return 'b';
11099 default: gcc_unreachable ();
11103 /* Return true iff x is a uniform vector of floating-point
11104 constants, and the constant can be represented in
11105 quarter-precision form.  Note that, as aarch64_float_const_representable_p
11106 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11107 static bool
11108 aarch64_vect_float_const_representable_p (rtx x)
11110 rtx elt;
11111 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11112 && const_vec_duplicate_p (x, &elt)
11113 && aarch64_float_const_representable_p (elt));
11116 /* Return true if OP is a valid SIMD immediate; if INFO is nonnull, fill it in with the encoding details.  */
11117 bool
11118 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11119 struct simd_immediate_info *info)
11121 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11122 matches = 1; \
11123 for (i = 0; i < idx; i += (STRIDE)) \
11124 if (!(TEST)) \
11125 matches = 0; \
11126 if (matches) \
11128 immtype = (CLASS); \
11129 elsize = (ELSIZE); \
11130 eshift = (SHIFT); \
11131 emvn = (NEG); \
11132 break; \
11135 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11136 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11137 unsigned char bytes[16];
11138 int immtype = -1, matches;
11139 unsigned int invmask = inverse ? 0xff : 0;
11140 int eshift, emvn;
11142 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11144 if (! (aarch64_simd_imm_zero_p (op, mode)
11145 || aarch64_vect_float_const_representable_p (op)))
11146 return false;
11148 if (info)
11150 info->value = CONST_VECTOR_ELT (op, 0);
11151 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11152 info->mvn = false;
11153 info->shift = 0;
11156 return true;
11159 /* Splat vector constant out into a byte vector. */
11160 for (i = 0; i < n_elts; i++)
11162 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11163 it must be laid out in the vector register in reverse order. */
11164 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11165 unsigned HOST_WIDE_INT elpart;
11167 gcc_assert (CONST_INT_P (el));
11168 elpart = INTVAL (el);
11170 for (unsigned int byte = 0; byte < innersize; byte++)
11172 bytes[idx++] = (elpart & 0xff) ^ invmask;
11173 elpart >>= BITS_PER_UNIT;
11178 /* Sanity check. */
11179 gcc_assert (idx == GET_MODE_SIZE (mode));
11183 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11184 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11186 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11187 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11189 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11190 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11192 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11193 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11195 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11197 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11199 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11200 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11202 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11203 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11205 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11206 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11208 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11209 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11211 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11213 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11215 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11216 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11218 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11219 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11221 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11222 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11224 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11225 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11227 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11229 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11230 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11232 while (0);
11234 if (immtype == -1)
11235 return false;
11237 if (info)
11239 info->element_width = elsize;
11240 info->mvn = emvn != 0;
11241 info->shift = eshift;
11243 unsigned HOST_WIDE_INT imm = 0;
11245 if (immtype >= 12 && immtype <= 15)
11246 info->msl = true;
11248 /* Un-invert bytes of recognized vector, if necessary. */
11249 if (invmask != 0)
11250 for (i = 0; i < idx; i++)
11251 bytes[i] ^= invmask;
11253 if (immtype == 17)
11255 /* FIXME: Broken on 32-bit H_W_I hosts. */
11256 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11258 for (i = 0; i < 8; i++)
11259 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11260 << (i * BITS_PER_UNIT);
11263 info->value = GEN_INT (imm);
11265 else
11267 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11268 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11270 /* Construct 'abcdefgh' because the assembler cannot handle
11271 generic constants. */
11272 if (info->mvn)
11273 imm = ~imm;
11274 imm = (imm >> info->shift) & 0xff;
11275 info->value = GEN_INT (imm);
11279 return true;
11280 #undef CHECK
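/* A few examples of the encodings recognised above (a sketch only; the
   assembly shown is the natural MOVI/MVNI rendering of the returned
   value/shift/mvn fields, not output produced by this function):

     V16QImode, all elements 0x45        -> element_width 8,  shift 0,
                                            value 0x45   (movi v0.16b, #0x45)
     V4SImode,  all elements 0x00ab0000  -> element_width 32, shift 16,
                                            value 0xab   (movi v0.4s, #0xab, lsl #16)
     V4SImode,  all elements 0xfffffffe  -> element_width 32, shift 0, mvn,
                                            value 0x01   (mvni v0.4s, #1)
     V4SImode,  all elements 0x00ab00cd  -> no pattern matches, returns false.  */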
11283 /* Check if immediate shift constants are within range.  */
11284 bool
11285 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11287 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11288 if (left)
11289 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11290 else
11291 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11294 /* Return true if X is a uniform vector where all elements
11295 are either the floating-point constant 0.0 or the
11296 integer constant 0. */
11297 bool
11298 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11300 return x == CONST0_RTX (mode);
11304 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11305 operation of width WIDTH at bit position POS. */
11308 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11310 gcc_assert (CONST_INT_P (width));
11311 gcc_assert (CONST_INT_P (pos));
11313 unsigned HOST_WIDE_INT mask
11314 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11315 return GEN_INT (mask << UINTVAL (pos));
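/* For example, WIDTH == 8 and POS == 16 give

     mask = ((((unsigned HOST_WIDE_INT) 1 << 8) - 1) << 16) == 0x00ff0000

   i.e. the CONST_INT selecting bits 16..23 of the source.  */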
11318 bool
11319 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11321 HOST_WIDE_INT imm = INTVAL (x);
11322 int i;
11324 for (i = 0; i < 8; i++)
11326 unsigned int byte = imm & 0xff;
11327 if (byte != 0xff && byte != 0)
11328 return false;
11329 imm >>= 8;
11332 return true;
11335 bool
11336 aarch64_mov_operand_p (rtx x, machine_mode mode)
11338 if (GET_CODE (x) == HIGH
11339 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11340 return true;
11342 if (CONST_INT_P (x))
11343 return true;
11345 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11346 return true;
11348 return aarch64_classify_symbolic_expression (x)
11349 == SYMBOL_TINY_ABSOLUTE;
11352 /* Return a CONST_VECTOR in which every element is the CONST_INT VAL.  */
11354 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11356 int nunits = GET_MODE_NUNITS (mode);
11357 rtvec v = rtvec_alloc (nunits);
11358 int i;
11360 rtx cache = GEN_INT (val);
11362 for (i=0; i < nunits; i++)
11363 RTVEC_ELT (v, i) = cache;
11365 return gen_rtx_CONST_VECTOR (mode, v);
11368 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11370 bool
11371 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11373 machine_mode vmode;
11375 gcc_assert (!VECTOR_MODE_P (mode));
11376 vmode = aarch64_preferred_simd_mode (mode);
11377 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11378 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11381 /* Construct and return a PARALLEL RTX vector with elements numbering the
11382 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11383 the vector - from the perspective of the architecture. This does not
11384 line up with GCC's perspective on lane numbers, so we end up with
11385 different masks depending on our target endian-ness. The diagram
11386 below may help. We must draw the distinction when building masks
11387 which select one half of the vector. An instruction selecting
11388 architectural low-lanes for a big-endian target, must be described using
11389 a mask selecting GCC high-lanes.
11391 Big-Endian Little-Endian
11393 GCC 0 1 2 3 3 2 1 0
11394 | x | x | x | x | | x | x | x | x |
11395 Architecture 3 2 1 0 3 2 1 0
11397 Low Mask: { 2, 3 } { 0, 1 }
11398 High Mask: { 0, 1 } { 2, 3 }
11402 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11404 int nunits = GET_MODE_NUNITS (mode);
11405 rtvec v = rtvec_alloc (nunits / 2);
11406 int high_base = nunits / 2;
11407 int low_base = 0;
11408 int base;
11409 rtx t1;
11410 int i;
11412 if (BYTES_BIG_ENDIAN)
11413 base = high ? low_base : high_base;
11414 else
11415 base = high ? high_base : low_base;
11417 for (i = 0; i < nunits / 2; i++)
11418 RTVEC_ELT (v, i) = GEN_INT (base + i);
11420 t1 = gen_rtx_PARALLEL (mode, v);
11421 return t1;
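/* Worked example for V4SImode (nunits == 4, so high_base == 2):

                       HIGH == false        HIGH == true
     little-endian     (parallel [0 1])     (parallel [2 3])
     big-endian        (parallel [2 3])     (parallel [0 1])

   matching the Low/High mask diagram above.  */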
11424 /* Check OP for validity as a PARALLEL RTX vector with elements
11425 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11426 from the perspective of the architecture. See the diagram above
11427 aarch64_simd_vect_par_cnst_half for more details. */
11429 bool
11430 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11431 bool high)
11433 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11434 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11435 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11436 int i = 0;
11438 if (!VECTOR_MODE_P (mode))
11439 return false;
11441 if (count_op != count_ideal)
11442 return false;
11444 for (i = 0; i < count_ideal; i++)
11446 rtx elt_op = XVECEXP (op, 0, i);
11447 rtx elt_ideal = XVECEXP (ideal, 0, i);
11449 if (!CONST_INT_P (elt_op)
11450 || INTVAL (elt_ideal) != INTVAL (elt_op))
11451 return false;
11453 return true;
11456 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11457 HIGH (exclusive). */
11458 void
11459 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11460 const_tree exp)
11462 HOST_WIDE_INT lane;
11463 gcc_assert (CONST_INT_P (operand));
11464 lane = INTVAL (operand);
11466 if (lane < low || lane >= high)
11468 if (exp)
11469 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11470 else
11471 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11475 /* Return TRUE if OP is a valid vector addressing mode. */
11476 bool
11477 aarch64_simd_mem_operand_p (rtx op)
11479 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11480 || REG_P (XEXP (op, 0)));
11483 /* Emit a register copy from operand to operand, taking care not to
11484 early-clobber source registers in the process.
11486 COUNT is the number of components into which the copy needs to be
11487 decomposed. */
11488 void
11489 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11490 unsigned int count)
11492 unsigned int i;
11493 int rdest = REGNO (operands[0]);
11494 int rsrc = REGNO (operands[1]);
11496 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11497 || rdest < rsrc)
11498 for (i = 0; i < count; i++)
11499 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11500 gen_rtx_REG (mode, rsrc + i));
11501 else
11502 for (i = 0; i < count; i++)
11503 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11504 gen_rtx_REG (mode, rsrc + count - i - 1));
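/* For example, moving an OImode value (COUNT == 2) from q1:q2 to q2:q3
   overlaps with RDEST > RSRC, so the copy is emitted high to low
   (q2 -> q3 first, then q1 -> q2); copying q2:q3 to q1:q2 instead is
   emitted low to high.  The register numbers here are illustrative only.  */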
11507 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11508 one of VSTRUCT modes: OI, CI, or XI. */
11510 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11512 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11515 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11516 alignment of a vector to 128 bits. */
11517 static HOST_WIDE_INT
11518 aarch64_simd_vector_alignment (const_tree type)
11520 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11521 return MIN (align, 128);
11524 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11525 static bool
11526 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11528 if (is_packed)
11529 return false;
11531 /* We guarantee alignment for vectors up to 128 bits.  */
11532 if (tree_int_cst_compare (TYPE_SIZE (type),
11533 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11534 return false;
11536 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11537 return true;
11540 /* Return true if the vector misalignment factor is supported by the
11541 target. */
11542 static bool
11543 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11544 const_tree type, int misalignment,
11545 bool is_packed)
11547 if (TARGET_SIMD && STRICT_ALIGNMENT)
11549 /* Return false if the movmisalign pattern is not supported for this mode.  */
11550 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11551 return false;
11553 if (misalignment == -1)
11555 /* Misalignment factor is unknown at compile time but we know
11556 it's word aligned. */
11557 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11559 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11561 if (element_size != 64)
11562 return true;
11564 return false;
11567 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11568 is_packed);
11571 /* If VALS is a vector constant that can be loaded into a register
11572 using DUP, generate instructions to do so and return an RTX to
11573 assign to the register. Otherwise return NULL_RTX. */
11574 static rtx
11575 aarch64_simd_dup_constant (rtx vals)
11577 machine_mode mode = GET_MODE (vals);
11578 machine_mode inner_mode = GET_MODE_INNER (mode);
11579 rtx x;
11581 if (!const_vec_duplicate_p (vals, &x))
11582 return NULL_RTX;
11584 /* We can load this constant by using DUP and a constant in a
11585 single ARM register. This will be cheaper than a vector
11586 load. */
11587 x = copy_to_mode_reg (inner_mode, x);
11588 return gen_rtx_VEC_DUPLICATE (mode, x);
11592 /* Generate code to load VALS, which is a PARALLEL containing only
11593 constants (for vec_init) or CONST_VECTOR, efficiently into a
11594 register. Returns an RTX to copy into the register, or NULL_RTX
11595 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11596 static rtx
11597 aarch64_simd_make_constant (rtx vals)
11599 machine_mode mode = GET_MODE (vals);
11600 rtx const_dup;
11601 rtx const_vec = NULL_RTX;
11602 int n_elts = GET_MODE_NUNITS (mode);
11603 int n_const = 0;
11604 int i;
11606 if (GET_CODE (vals) == CONST_VECTOR)
11607 const_vec = vals;
11608 else if (GET_CODE (vals) == PARALLEL)
11610 /* A CONST_VECTOR must contain only CONST_INTs and
11611 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11612 Only store valid constants in a CONST_VECTOR. */
11613 for (i = 0; i < n_elts; ++i)
11615 rtx x = XVECEXP (vals, 0, i);
11616 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11617 n_const++;
11619 if (n_const == n_elts)
11620 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11622 else
11623 gcc_unreachable ();
11625 if (const_vec != NULL_RTX
11626 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11627 /* Load using MOVI/MVNI. */
11628 return const_vec;
11629 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11630 /* Loaded using DUP. */
11631 return const_dup;
11632 else if (const_vec != NULL_RTX)
11633 /* Load from constant pool. We can not take advantage of single-cycle
11634 LD1 because we need a PC-relative addressing mode. */
11635 return const_vec;
11636 else
11637 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11638 We can not construct an initializer. */
11639 return NULL_RTX;
11642 /* Expand a vector initialisation sequence, such that TARGET is
11643 initialised to contain VALS. */
11645 void
11646 aarch64_expand_vector_init (rtx target, rtx vals)
11648 machine_mode mode = GET_MODE (target);
11649 machine_mode inner_mode = GET_MODE_INNER (mode);
11650 /* The number of vector elements. */
11651 int n_elts = GET_MODE_NUNITS (mode);
11652 /* The number of vector elements which are not constant. */
11653 int n_var = 0;
11654 rtx any_const = NULL_RTX;
11655 /* The first element of vals. */
11656 rtx v0 = XVECEXP (vals, 0, 0);
11657 bool all_same = true;
11659 /* Count the number of variable elements to initialise. */
11660 for (int i = 0; i < n_elts; ++i)
11662 rtx x = XVECEXP (vals, 0, i);
11663 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11664 ++n_var;
11665 else
11666 any_const = x;
11668 all_same &= rtx_equal_p (x, v0);
11671 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11672 how best to handle this. */
11673 if (n_var == 0)
11675 rtx constant = aarch64_simd_make_constant (vals);
11676 if (constant != NULL_RTX)
11678 emit_move_insn (target, constant);
11679 return;
11683 /* Splat a single non-constant element if we can. */
11684 if (all_same)
11686 rtx x = copy_to_mode_reg (inner_mode, v0);
11687 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11688 return;
11691 /* Initialise a vector which is part-variable. We want to first try
11692 to build those lanes which are constant in the most efficient way we
11693 can. */
11694 if (n_var != n_elts)
11696 rtx copy = copy_rtx (vals);
11698 /* Load constant part of vector. We really don't care what goes into the
11699 parts we will overwrite, but we're more likely to be able to load the
11700 constant efficiently if it has fewer, larger, repeating parts
11701 (see aarch64_simd_valid_immediate). */
11702 for (int i = 0; i < n_elts; i++)
11704 rtx x = XVECEXP (vals, 0, i);
11705 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11706 continue;
11707 rtx subst = any_const;
11708 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11710 /* Look in the copied vector, as more elements are const. */
11711 rtx test = XVECEXP (copy, 0, i ^ bit);
11712 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11714 subst = test;
11715 break;
11718 XVECEXP (copy, 0, i) = subst;
11720 aarch64_expand_vector_init (target, copy);
11723 /* Insert the variable lanes directly. */
11725 enum insn_code icode = optab_handler (vec_set_optab, mode);
11726 gcc_assert (icode != CODE_FOR_nothing);
11728 for (int i = 0; i < n_elts; i++)
11730 rtx x = XVECEXP (vals, 0, i);
11731 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11732 continue;
11733 x = copy_to_mode_reg (inner_mode, x);
11734 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
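/* For example, initialising a V4SImode vector with { x, 1, 2, 3 } where
   only X is non-constant: the constant-building loop above rewrites lane 0
   with the constant found at lane 0 ^ 2, giving { 2, 1, 2, 3 }, which is
   loaded first (recursively, as an all-constant init); lane 0 is then
   overwritten with X through the vec_set pattern (an INS instruction).
   The lane values here are illustrative only.  */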
11738 static unsigned HOST_WIDE_INT
11739 aarch64_shift_truncation_mask (machine_mode mode)
11741 return
11742 (!SHIFT_COUNT_TRUNCATED
11743 || aarch64_vector_mode_supported_p (mode)
11744 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11747 /* Select a format to encode pointers in exception handling data. */
11749 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11751 int type;
11752 switch (aarch64_cmodel)
11754 case AARCH64_CMODEL_TINY:
11755 case AARCH64_CMODEL_TINY_PIC:
11756 case AARCH64_CMODEL_SMALL:
11757 case AARCH64_CMODEL_SMALL_PIC:
11758 case AARCH64_CMODEL_SMALL_SPIC:
11759 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11760 for everything. */
11761 type = DW_EH_PE_sdata4;
11762 break;
11763 default:
11764 /* No assumptions here. 8-byte relocs required. */
11765 type = DW_EH_PE_sdata8;
11766 break;
11768 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11771 /* The last .arch and .tune assembly strings that we printed. */
11772 static std::string aarch64_last_printed_arch_string;
11773 static std::string aarch64_last_printed_tune_string;
11775 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11776 by the function fndecl. */
11778 void
11779 aarch64_declare_function_name (FILE *stream, const char* name,
11780 tree fndecl)
11782 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11784 struct cl_target_option *targ_options;
11785 if (target_parts)
11786 targ_options = TREE_TARGET_OPTION (target_parts);
11787 else
11788 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11789 gcc_assert (targ_options);
11791 const struct processor *this_arch
11792 = aarch64_get_arch (targ_options->x_explicit_arch);
11794 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11795 std::string extension
11796 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11797 this_arch->flags);
11798 /* Only update the assembler .arch string if it is distinct from the last
11799 such string we printed. */
11800 std::string to_print = this_arch->name + extension;
11801 if (to_print != aarch64_last_printed_arch_string)
11803 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11804 aarch64_last_printed_arch_string = to_print;
11807 /* Print the cpu name we're tuning for in the comments; it might be
11808 useful to readers of the generated asm.  Do it only when it changes
11809 from function to function and verbose assembly is requested.  */
11810 const struct processor *this_tune
11811 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11813 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11815 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11816 this_tune->name);
11817 aarch64_last_printed_tune_string = this_tune->name;
11820 /* Don't forget the type directive for ELF. */
11821 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11822 ASM_OUTPUT_LABEL (stream, name);
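/* For example, compiling with -march=armv8-a+crc -mtune=cortex-a57 and
   verbose assembly (-dA) might produce a header along the lines of
   (a sketch only; the exact extension spelling depends on
   aarch64_get_extension_string_for_isa_flags, and "foo" is a placeholder):

         .arch armv8-a+crc
         // .tune cortex-a57
         .type   foo, %function
     foo:

   The .arch line is re-emitted later only if a target attribute or pragma
   changes the ISA for some function.  */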
11825 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11827 static void
11828 aarch64_start_file (void)
11830 struct cl_target_option *default_options
11831 = TREE_TARGET_OPTION (target_option_default_node);
11833 const struct processor *default_arch
11834 = aarch64_get_arch (default_options->x_explicit_arch);
11835 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11836 std::string extension
11837 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11838 default_arch->flags);
11840 aarch64_last_printed_arch_string = default_arch->name + extension;
11841 aarch64_last_printed_tune_string = "";
11842 asm_fprintf (asm_out_file, "\t.arch %s\n",
11843 aarch64_last_printed_arch_string.c_str ());
11845 default_file_start ();
11848 /* Emit load exclusive. */
11850 static void
11851 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11852 rtx mem, rtx model_rtx)
11854 rtx (*gen) (rtx, rtx, rtx);
11856 switch (mode)
11858 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11859 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11860 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11861 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11862 default:
11863 gcc_unreachable ();
11866 emit_insn (gen (rval, mem, model_rtx));
11869 /* Emit store exclusive. */
11871 static void
11872 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11873 rtx rval, rtx mem, rtx model_rtx)
11875 rtx (*gen) (rtx, rtx, rtx, rtx);
11877 switch (mode)
11879 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11880 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11881 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11882 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11883 default:
11884 gcc_unreachable ();
11887 emit_insn (gen (bval, rval, mem, model_rtx));
11890 /* Emit jump instruction INSN and mark it as unlikely to be taken. */
11892 static void
11893 aarch64_emit_unlikely_jump (rtx insn)
11895 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11897 rtx_insn *jump = emit_jump_insn (insn);
11898 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
11901 /* Expand a compare and swap pattern. */
11903 void
11904 aarch64_expand_compare_and_swap (rtx operands[])
11906 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11907 machine_mode mode, cmp_mode;
11908 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11909 int idx;
11910 gen_cas_fn gen;
11911 const gen_cas_fn split_cas[] =
11913 gen_aarch64_compare_and_swapqi,
11914 gen_aarch64_compare_and_swaphi,
11915 gen_aarch64_compare_and_swapsi,
11916 gen_aarch64_compare_and_swapdi
11918 const gen_cas_fn atomic_cas[] =
11920 gen_aarch64_compare_and_swapqi_lse,
11921 gen_aarch64_compare_and_swaphi_lse,
11922 gen_aarch64_compare_and_swapsi_lse,
11923 gen_aarch64_compare_and_swapdi_lse
11926 bval = operands[0];
11927 rval = operands[1];
11928 mem = operands[2];
11929 oldval = operands[3];
11930 newval = operands[4];
11931 is_weak = operands[5];
11932 mod_s = operands[6];
11933 mod_f = operands[7];
11934 mode = GET_MODE (mem);
11935 cmp_mode = mode;
11937 /* Normally the succ memory model must be stronger than fail, but in the
11938 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11939 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11941 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11942 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11943 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11945 switch (mode)
11947 case QImode:
11948 case HImode:
11949 /* For short modes, we're going to perform the comparison in SImode,
11950 so do the zero-extension now. */
11951 cmp_mode = SImode;
11952 rval = gen_reg_rtx (SImode);
11953 oldval = convert_modes (SImode, mode, oldval, true);
11954 /* Fall through. */
11956 case SImode:
11957 case DImode:
11958 /* Force the value into a register if needed. */
11959 if (!aarch64_plus_operand (oldval, mode))
11960 oldval = force_reg (cmp_mode, oldval);
11961 break;
11963 default:
11964 gcc_unreachable ();
11967 switch (mode)
11969 case QImode: idx = 0; break;
11970 case HImode: idx = 1; break;
11971 case SImode: idx = 2; break;
11972 case DImode: idx = 3; break;
11973 default:
11974 gcc_unreachable ();
11976 if (TARGET_LSE)
11977 gen = atomic_cas[idx];
11978 else
11979 gen = split_cas[idx];
11981 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11983 if (mode == QImode || mode == HImode)
11984 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11986 x = gen_rtx_REG (CCmode, CC_REGNUM);
11987 x = gen_rtx_EQ (SImode, x, const0_rtx);
11988 emit_insn (gen_rtx_SET (bval, x));
11991 /* Test whether the target supports using an atomic load-operate instruction
11992 for operation CODE. Returns FALSE if the operation isn't supported by the
11993 architecture. */
11997 bool
11998 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12000 if (!TARGET_LSE)
12001 return false;
12003 switch (code)
12005 case SET:
12006 case AND:
12007 case IOR:
12008 case XOR:
12009 case MINUS:
12010 case PLUS:
12011 return true;
12012 default:
12013 return false;
12017 /* Emit a barrier appropriate for memory model MODEL at the end of a
12018 sequence implementing an atomic operation. */
12020 static void
12021 aarch64_emit_post_barrier (enum memmodel model)
12023 const enum memmodel base_model = memmodel_base (model);
12025 if (is_mm_sync (model)
12026 && (base_model == MEMMODEL_ACQUIRE
12027 || base_model == MEMMODEL_ACQ_REL
12028 || base_model == MEMMODEL_SEQ_CST))
12030 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
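/* For example, a __sync_fetch_and_add (whose model is flagged as a __sync
   one with SEQ_CST semantics) gets a trailing full barrier from here,
   whereas the equivalent __atomic_fetch_add with __ATOMIC_SEQ_CST does not
   take this path at all.  */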
12034 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12035 for the data in memory. EXPECTED is the value expected to be in memory.
12036 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12037 is the memory ordering to use. */
12039 void
12040 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12041 rtx expected, rtx desired,
12042 rtx model)
12044 rtx (*gen) (rtx, rtx, rtx, rtx);
12045 machine_mode mode;
12047 mode = GET_MODE (mem);
12049 switch (mode)
12051 case QImode: gen = gen_aarch64_atomic_casqi; break;
12052 case HImode: gen = gen_aarch64_atomic_cashi; break;
12053 case SImode: gen = gen_aarch64_atomic_cassi; break;
12054 case DImode: gen = gen_aarch64_atomic_casdi; break;
12055 default:
12056 gcc_unreachable ();
12059 /* Move the expected value into the CAS destination register. */
12060 emit_insn (gen_rtx_SET (rval, expected));
12062 /* Emit the CAS. */
12063 emit_insn (gen (rval, mem, desired, model));
12065 /* Compare the expected value with the value loaded by the CAS, to establish
12066 whether the swap was made. */
12067 aarch64_gen_compare_reg (EQ, rval, expected);
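/* A sketch of what this amounts to on an LSE target (register names and
   the exact CAS variant depend on the operands and memory model):
        mov   w0, w_expected
        casal w0, w_desired, [x_mem]
        cmp   w0, w_expected
   leaving success/failure in the condition flags for the caller.  */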
12070 /* Split a compare and swap pattern. */
12072 void
12073 aarch64_split_compare_and_swap (rtx operands[])
12075 rtx rval, mem, oldval, newval, scratch;
12076 machine_mode mode;
12077 bool is_weak;
12078 rtx_code_label *label1, *label2;
12079 rtx x, cond;
12080 enum memmodel model;
12081 rtx model_rtx;
12083 rval = operands[0];
12084 mem = operands[1];
12085 oldval = operands[2];
12086 newval = operands[3];
12087 is_weak = (operands[4] != const0_rtx);
12088 model_rtx = operands[5];
12089 scratch = operands[7];
12090 mode = GET_MODE (mem);
12091 model = memmodel_from_int (INTVAL (model_rtx));
12093 label1 = NULL;
12094 if (!is_weak)
12096 label1 = gen_label_rtx ();
12097 emit_label (label1);
12099 label2 = gen_label_rtx ();
12101 /* The initial load can be relaxed for a __sync operation since a final
12102 barrier will be emitted to stop code hoisting. */
12103 if (is_mm_sync (model))
12104 aarch64_emit_load_exclusive (mode, rval, mem,
12105 GEN_INT (MEMMODEL_RELAXED));
12106 else
12107 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12109 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12110 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12111 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12112 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12113 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12115 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12117 if (!is_weak)
12119 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12120 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12121 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12122 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12124 else
12126 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12127 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12128 emit_insn (gen_rtx_SET (cond, x));
12131 emit_label (label2);
12133 /* Emit any final barrier needed for a __sync operation. */
12134 if (is_mm_sync (model))
12135 aarch64_emit_post_barrier (model);
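/* Rough shape of the sequence produced for a strong word-sized CAS
   (illustrative register names; the acquire/release suffixes on the
   exclusives follow the memory model):
       .Lretry:
            ldaxr  w_old, [x_mem]
            cmp    w_old, w_expected
            b.ne   .Ldone
            stlxr  w_tmp, w_new, [x_mem]
            cbnz   w_tmp, .Lretry
       .Ldone:
   The weak form drops the backwards branch and instead compares the
   store-exclusive status with zero to set the flags.  */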
12138 /* Emit a BIC instruction. */
12140 static void
12141 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12143 rtx shift_rtx = GEN_INT (shift);
12144 rtx (*gen) (rtx, rtx, rtx, rtx);
12146 switch (mode)
12148 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12149 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12150 default:
12151 gcc_unreachable ();
12154 emit_insn (gen (dst, s2, shift_rtx, s1));
12157 /* Emit an atomic swap. */
12159 static void
12160 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12161 rtx mem, rtx model)
12163 rtx (*gen) (rtx, rtx, rtx, rtx);
12165 switch (mode)
12167 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12168 case HImode: gen = gen_aarch64_atomic_swphi; break;
12169 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12170 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12171 default:
12172 gcc_unreachable ();
12175 emit_insn (gen (dst, mem, value, model));
12178 /* Operations supported by aarch64_emit_atomic_load_op. */
12180 enum aarch64_atomic_load_op_code
12182 AARCH64_LDOP_PLUS, /* A + B */
12183 AARCH64_LDOP_XOR, /* A ^ B */
12184 AARCH64_LDOP_OR, /* A | B */
12185 AARCH64_LDOP_BIC /* A & ~B */
12188 /* Emit an atomic load-operate. */
12190 static void
12191 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12192 machine_mode mode, rtx dst, rtx src,
12193 rtx mem, rtx model)
12195 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12196 const aarch64_atomic_load_op_fn plus[] =
12198 gen_aarch64_atomic_loadaddqi,
12199 gen_aarch64_atomic_loadaddhi,
12200 gen_aarch64_atomic_loadaddsi,
12201 gen_aarch64_atomic_loadadddi
12203 const aarch64_atomic_load_op_fn eor[] =
12205 gen_aarch64_atomic_loadeorqi,
12206 gen_aarch64_atomic_loadeorhi,
12207 gen_aarch64_atomic_loadeorsi,
12208 gen_aarch64_atomic_loadeordi
12210 const aarch64_atomic_load_op_fn ior[] =
12212 gen_aarch64_atomic_loadsetqi,
12213 gen_aarch64_atomic_loadsethi,
12214 gen_aarch64_atomic_loadsetsi,
12215 gen_aarch64_atomic_loadsetdi
12217 const aarch64_atomic_load_op_fn bic[] =
12219 gen_aarch64_atomic_loadclrqi,
12220 gen_aarch64_atomic_loadclrhi,
12221 gen_aarch64_atomic_loadclrsi,
12222 gen_aarch64_atomic_loadclrdi
12224 aarch64_atomic_load_op_fn gen;
12225 int idx = 0;
12227 switch (mode)
12229 case QImode: idx = 0; break;
12230 case HImode: idx = 1; break;
12231 case SImode: idx = 2; break;
12232 case DImode: idx = 3; break;
12233 default:
12234 gcc_unreachable ();
12237 switch (code)
12239 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12240 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12241 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12242 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12243 default:
12244 gcc_unreachable ();
12247 emit_insn (gen (dst, mem, src, model));
12250 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12251 location to store the data read from memory. OUT_RESULT is the location to
12252 store the result of the operation. MEM is the memory location to read and
12253 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12254 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12255 be NULL. */
12257 void
12258 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12259 rtx mem, rtx value, rtx model_rtx)
12261 machine_mode mode = GET_MODE (mem);
12262 machine_mode wmode = (mode == DImode ? DImode : SImode);
12263 const bool short_mode = (mode < SImode);
12264 aarch64_atomic_load_op_code ldop_code;
12265 rtx src;
12266 rtx x;
12268 if (out_data)
12269 out_data = gen_lowpart (mode, out_data);
12271 if (out_result)
12272 out_result = gen_lowpart (mode, out_result);
12274 /* Make sure the value is in a register, putting it into a destination
12275 register if it needs to be manipulated. */
12276 if (!register_operand (value, mode)
12277 || code == AND || code == MINUS)
12279 src = out_result ? out_result : out_data;
12280 emit_move_insn (src, gen_lowpart (mode, value));
12282 else
12283 src = value;
12284 gcc_assert (register_operand (src, mode));
12286 /* Preprocess the data for the operation as necessary. If the operation is
12287 a SET then emit a swap instruction and finish. */
12288 switch (code)
12290 case SET:
12291 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12292 return;
12294 case MINUS:
12295 /* Negate the value and treat it as a PLUS. */
12297 rtx neg_src;
12299 /* Resize the value if necessary. */
12300 if (short_mode)
12301 src = gen_lowpart (wmode, src);
12303 neg_src = gen_rtx_NEG (wmode, src);
12304 emit_insn (gen_rtx_SET (src, neg_src));
12306 if (short_mode)
12307 src = gen_lowpart (mode, src);
12309 /* Fall-through. */
12310 case PLUS:
12311 ldop_code = AARCH64_LDOP_PLUS;
12312 break;
12314 case IOR:
12315 ldop_code = AARCH64_LDOP_OR;
12316 break;
12318 case XOR:
12319 ldop_code = AARCH64_LDOP_XOR;
12320 break;
12322 case AND:
12324 rtx not_src;
12326 /* Resize the value if necessary. */
12327 if (short_mode)
12328 src = gen_lowpart (wmode, src);
12330 not_src = gen_rtx_NOT (wmode, src);
12331 emit_insn (gen_rtx_SET (src, not_src));
12333 if (short_mode)
12334 src = gen_lowpart (mode, src);
12336 ldop_code = AARCH64_LDOP_BIC;
12337 break;
12339 default:
12340 /* The operation can't be done with atomic instructions. */
12341 gcc_unreachable ();
12344 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12346 /* If necessary, calculate the data in memory after the update by redoing the
12347 operation from values in registers. */
12348 if (!out_result)
12349 return;
12351 if (short_mode)
12353 src = gen_lowpart (wmode, src);
12354 out_data = gen_lowpart (wmode, out_data);
12355 out_result = gen_lowpart (wmode, out_result);
12358 x = NULL_RTX;
12360 switch (code)
12362 case MINUS:
12363 case PLUS:
12364 x = gen_rtx_PLUS (wmode, out_data, src);
12365 break;
12366 case IOR:
12367 x = gen_rtx_IOR (wmode, out_data, src);
12368 break;
12369 case XOR:
12370 x = gen_rtx_XOR (wmode, out_data, src);
12371 break;
12372 case AND:
12373 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12374 return;
12375 default:
12376 gcc_unreachable ();
12379 emit_set_insn (out_result, x);
12381 return;
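/* Worked example of the AND handling above: there is no atomic load-AND
   instruction, so "fetch-and-AND with V" is emitted as an inversion of V
   followed by LDCLR (atomic bit clear), using A & V == A & ~(~V); when the
   updated value is also required it is rebuilt afterwards with a BIC on
   the loaded data.  */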
12384 /* Split an atomic operation. */
12386 void
12387 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12388 rtx value, rtx model_rtx, rtx cond)
12390 machine_mode mode = GET_MODE (mem);
12391 machine_mode wmode = (mode == DImode ? DImode : SImode);
12392 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12393 const bool is_sync = is_mm_sync (model);
12394 rtx_code_label *label;
12395 rtx x;
12397 /* Split the atomic operation into a sequence. */
12398 label = gen_label_rtx ();
12399 emit_label (label);
12401 if (new_out)
12402 new_out = gen_lowpart (wmode, new_out);
12403 if (old_out)
12404 old_out = gen_lowpart (wmode, old_out);
12405 else
12406 old_out = new_out;
12407 value = simplify_gen_subreg (wmode, value, mode, 0);
12409 /* The initial load can be relaxed for a __sync operation since a final
12410 barrier will be emitted to stop code hoisting. */
12411 if (is_sync)
12412 aarch64_emit_load_exclusive (mode, old_out, mem,
12413 GEN_INT (MEMMODEL_RELAXED));
12414 else
12415 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12417 switch (code)
12419 case SET:
12420 new_out = value;
12421 break;
12423 case NOT:
12424 x = gen_rtx_AND (wmode, old_out, value);
12425 emit_insn (gen_rtx_SET (new_out, x));
12426 x = gen_rtx_NOT (wmode, new_out);
12427 emit_insn (gen_rtx_SET (new_out, x));
12428 break;
12430 case MINUS:
12431 if (CONST_INT_P (value))
12433 value = GEN_INT (-INTVAL (value));
12434 code = PLUS;
12436 /* Fall through. */
12438 default:
12439 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12440 emit_insn (gen_rtx_SET (new_out, x));
12441 break;
12444 aarch64_emit_store_exclusive (mode, cond, mem,
12445 gen_lowpart (mode, new_out), model_rtx);
12447 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12448 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12449 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12450 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12452 /* Emit any final barrier needed for a __sync operation. */
12453 if (is_sync)
12454 aarch64_emit_post_barrier (model);
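/* Sketch of the loop this produces for, say, an atomic add (placeholder
   register names; exclusive-access suffixes depend on the memory model):
       .L:  ldxr  w_old, [x_mem]
            add   w_new, w_old, w_val
            stxr  w_ok, w_new, [x_mem]
            cbnz  w_ok, .L
   plus a trailing full barrier for the __sync variants.  */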
12457 static void
12458 aarch64_init_libfuncs (void)
12460 /* Half-precision float operations. The compiler handles all operations
12461 with NULL libfuncs by converting to SFmode. */
12463 /* Conversions. */
12464 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12465 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12467 /* Arithmetic. */
12468 set_optab_libfunc (add_optab, HFmode, NULL);
12469 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12470 set_optab_libfunc (smul_optab, HFmode, NULL);
12471 set_optab_libfunc (neg_optab, HFmode, NULL);
12472 set_optab_libfunc (sub_optab, HFmode, NULL);
12474 /* Comparisons. */
12475 set_optab_libfunc (eq_optab, HFmode, NULL);
12476 set_optab_libfunc (ne_optab, HFmode, NULL);
12477 set_optab_libfunc (lt_optab, HFmode, NULL);
12478 set_optab_libfunc (le_optab, HFmode, NULL);
12479 set_optab_libfunc (ge_optab, HFmode, NULL);
12480 set_optab_libfunc (gt_optab, HFmode, NULL);
12481 set_optab_libfunc (unord_optab, HFmode, NULL);
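/* Net effect, as a sketch: because the HFmode arithmetic and comparison
   optabs are left NULL above, an __fp16 addition is performed by widening
   both operands to SFmode, operating there, and narrowing the result; the
   two conversion libcalls are only used when no hardware half-precision
   conversion is available.  */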
12484 /* Target hook for c_mode_for_suffix. */
12485 static machine_mode
12486 aarch64_c_mode_for_suffix (char suffix)
12488 if (suffix == 'q')
12489 return TFmode;
12491 return VOIDmode;
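/* So a literal such as 1.0q gets TFmode, the same 128-bit IEEE format used
   for long double on this target; every other suffix falls back to the
   generic handling.  */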
12494 /* We can only represent floating point constants which will fit in
12495 "quarter-precision" values. These values are characterised by
12496 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
12499 (-1)^s * (n/16) * 2^r
12501 Where:
12502 's' is the sign bit.
12503 'n' is an integer in the range 16 <= n <= 31.
12504 'r' is an integer in the range -3 <= r <= 4. */
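/* For example, 1.5 is (-1)^0 * (24/16) * 2^0 and -31.0 is
   (-1)^1 * (31/16) * 2^4, both representable; 0.1 has no exact n/16 * 2^r
   form, and 0.0 is excluded because n is at least 16.  */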
12506 /* Return true iff X can be represented by a quarter-precision
12507 floating point immediate operand X. Note, we cannot represent 0.0. */
12508 bool
12509 aarch64_float_const_representable_p (rtx x)
12511 /* This represents our current view of how many bits
12512 make up the mantissa. */
12513 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12514 int exponent;
12515 unsigned HOST_WIDE_INT mantissa, mask;
12516 REAL_VALUE_TYPE r, m;
12517 bool fail;
12519 if (!CONST_DOUBLE_P (x))
12520 return false;
12522 /* We don't support HFmode constants yet. */
12523 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12524 return false;
12526 r = *CONST_DOUBLE_REAL_VALUE (x);
12528 /* We cannot represent infinities, NaNs or +/-zero. We won't
12529 know if we have +zero until we analyse the mantissa, but we
12530 can reject the other invalid values. */
12531 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12532 || REAL_VALUE_MINUS_ZERO (r))
12533 return false;
12535 /* Extract exponent. */
12536 r = real_value_abs (&r);
12537 exponent = REAL_EXP (&r);
12539 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12540 highest (sign) bit, with a fixed binary point at bit point_pos.
12541 The low half of the result holds the low part of the mantissa, the high
12542 half the high part. WARNING: If we ever have a representation using more
12543 than 2 * H_W_I - 1 bits for the mantissa, this can fail (low bits will be lost). */
12544 real_ldexp (&m, &r, point_pos - exponent);
12545 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12547 /* If the low part of the mantissa has bits set we cannot represent
12548 the value. */
12549 if (w.ulow () != 0)
12550 return false;
12551 /* We have rejected the lower HOST_WIDE_INT, so update our
12552 understanding of how many bits lie in the mantissa and
12553 look only at the high HOST_WIDE_INT. */
12554 mantissa = w.elt (1);
12555 point_pos -= HOST_BITS_PER_WIDE_INT;
12557 /* We can only represent values with a mantissa of the form 1.xxxx. */
12558 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12559 if ((mantissa & mask) != 0)
12560 return false;
12562 /* Having filtered unrepresentable values, we may now remove all
12563 but the highest 5 bits. */
12564 mantissa >>= point_pos - 5;
12566 /* We cannot represent the value 0.0, so reject it. This is handled
12567 elsewhere. */
12568 if (mantissa == 0)
12569 return false;
12571 /* Then, as bit 4 is always set, we can mask it off, leaving
12572 the mantissa in the range [0, 15]. */
12573 mantissa &= ~(1 << 4);
12574 gcc_assert (mantissa <= 15);
12576 /* GCC internally does not use IEEE754-like encoding (where normalized
12577 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
12578 Our mantissa values are shifted 4 places to the left relative to
12579 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12580 by 5 places to correct for GCC's representation. */
12581 exponent = 5 - exponent;
12583 return (exponent >= 0 && exponent <= 7);
12586 char*
12587 aarch64_output_simd_mov_immediate (rtx const_vector,
12588 machine_mode mode,
12589 unsigned width)
12591 bool is_valid;
12592 static char templ[40];
12593 const char *mnemonic;
12594 const char *shift_op;
12595 unsigned int lane_count = 0;
12596 char element_char;
12598 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12600 /* This will return true to show const_vector is legal for use as an
12601 AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12602 also update INFO to show how the immediate should be generated. */
12603 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12604 gcc_assert (is_valid);
12606 element_char = sizetochar (info.element_width);
12607 lane_count = width / info.element_width;
12609 mode = GET_MODE_INNER (mode);
12610 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12612 gcc_assert (info.shift == 0 && ! info.mvn);
12613 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12614 move immediate path. */
12615 if (aarch64_float_const_zero_rtx_p (info.value))
12616 info.value = GEN_INT (0);
12617 else
12619 const unsigned int buf_size = 20;
12620 char float_buf[buf_size] = {'\0'};
12621 real_to_decimal_for_mode (float_buf,
12622 CONST_DOUBLE_REAL_VALUE (info.value),
12623 buf_size, buf_size, 1, mode);
12625 if (lane_count == 1)
12626 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12627 else
12628 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12629 lane_count, element_char, float_buf);
12630 return templ;
12634 mnemonic = info.mvn ? "mvni" : "movi";
12635 shift_op = info.msl ? "msl" : "lsl";
12637 gcc_assert (CONST_INT_P (info.value));
12638 if (lane_count == 1)
12639 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12640 mnemonic, UINTVAL (info.value));
12641 else if (info.shift)
12642 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12643 ", %s %d", mnemonic, lane_count, element_char,
12644 UINTVAL (info.value), shift_op, info.shift);
12645 else
12646 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12647 mnemonic, lane_count, element_char, UINTVAL (info.value));
12648 return templ;
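/* By way of example (the exact split is whatever aarch64_simd_valid_immediate
   chose), a V4SI vector with 0x100 in every lane would come back from here
   as something like "movi\t%0.4s, 0x1, lsl 8".  */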
12651 char*
12652 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12653 machine_mode mode)
12655 machine_mode vmode;
12657 gcc_assert (!VECTOR_MODE_P (mode));
12658 vmode = aarch64_simd_container_mode (mode, 64);
12659 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12660 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12663 /* Split operands into moves from op[1] + op[2] into op[0]. */
12665 void
12666 aarch64_split_combinev16qi (rtx operands[3])
12668 unsigned int dest = REGNO (operands[0]);
12669 unsigned int src1 = REGNO (operands[1]);
12670 unsigned int src2 = REGNO (operands[2]);
12671 machine_mode halfmode = GET_MODE (operands[1]);
12672 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12673 rtx destlo, desthi;
12675 gcc_assert (halfmode == V16QImode);
12677 if (src1 == dest && src2 == dest + halfregs)
12679 /* No-op move. Can't split to nothing; emit something. */
12680 emit_note (NOTE_INSN_DELETED);
12681 return;
12684 /* Preserve register attributes for variable tracking. */
12685 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12686 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12687 GET_MODE_SIZE (halfmode));
12689 /* Special case of reversed high/low parts. */
12690 if (reg_overlap_mentioned_p (operands[2], destlo)
12691 && reg_overlap_mentioned_p (operands[1], desthi))
12693 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12694 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12695 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12697 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12699 /* Try to avoid unnecessary moves if part of the result
12700 is in the right place already. */
12701 if (src1 != dest)
12702 emit_move_insn (destlo, operands[1]);
12703 if (src2 != dest + halfregs)
12704 emit_move_insn (desthi, operands[2]);
12706 else
12708 if (src2 != dest + halfregs)
12709 emit_move_insn (desthi, operands[2]);
12710 if (src1 != dest)
12711 emit_move_insn (destlo, operands[1]);
12715 /* vec_perm support. */
12717 #define MAX_VECT_LEN 16
12719 struct expand_vec_perm_d
12721 rtx target, op0, op1;
12722 unsigned char perm[MAX_VECT_LEN];
12723 machine_mode vmode;
12724 unsigned char nelt;
12725 bool one_vector_p;
12726 bool testing_p;
12729 /* Generate a variable permutation. */
12731 static void
12732 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12734 machine_mode vmode = GET_MODE (target);
12735 bool one_vector_p = rtx_equal_p (op0, op1);
12737 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12738 gcc_checking_assert (GET_MODE (op0) == vmode);
12739 gcc_checking_assert (GET_MODE (op1) == vmode);
12740 gcc_checking_assert (GET_MODE (sel) == vmode);
12741 gcc_checking_assert (TARGET_SIMD);
12743 if (one_vector_p)
12745 if (vmode == V8QImode)
12747 /* Expand the argument to a V16QI mode by duplicating it. */
12748 rtx pair = gen_reg_rtx (V16QImode);
12749 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12750 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12752 else
12754 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12757 else
12759 rtx pair;
12761 if (vmode == V8QImode)
12763 pair = gen_reg_rtx (V16QImode);
12764 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12765 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12767 else
12769 pair = gen_reg_rtx (OImode);
12770 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12771 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12776 void
12777 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12779 machine_mode vmode = GET_MODE (target);
12780 unsigned int nelt = GET_MODE_NUNITS (vmode);
12781 bool one_vector_p = rtx_equal_p (op0, op1);
12782 rtx mask;
12784 /* The TBL instruction does not use a modulo index, so we must take care
12785 of that ourselves. */
12786 mask = aarch64_simd_gen_const_vector_dup (vmode,
12787 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12788 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12790 /* For big-endian, we also need to reverse the index within the vector
12791 (but not which vector). */
12792 if (BYTES_BIG_ENDIAN)
12794 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12795 if (!one_vector_p)
12796 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12797 sel = expand_simple_binop (vmode, XOR, sel, mask,
12798 NULL, 0, OPTAB_LIB_WIDEN);
12800 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
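/* Example of the masking above: with two V16QI inputs a selector element
   of 37 is reduced to 37 & 31 == 5, i.e. byte 5 of the first vector, and
   with a single input the mask is 15; TBL itself would instead return 0
   for an out-of-range index, which is not the modulo behaviour the
   vec_perm interface requires.  */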
12803 /* Recognize patterns suitable for the TRN instructions. */
12804 static bool
12805 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12807 unsigned int i, odd, mask, nelt = d->nelt;
12808 rtx out, in0, in1, x;
12809 rtx (*gen) (rtx, rtx, rtx);
12810 machine_mode vmode = d->vmode;
12812 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12813 return false;
12815 /* Note that these are little-endian tests.
12816 We correct for big-endian later. */
12817 if (d->perm[0] == 0)
12818 odd = 0;
12819 else if (d->perm[0] == 1)
12820 odd = 1;
12821 else
12822 return false;
12823 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12825 for (i = 0; i < nelt; i += 2)
12827 if (d->perm[i] != i + odd)
12828 return false;
12829 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12830 return false;
12833 /* Success! */
12834 if (d->testing_p)
12835 return true;
12837 in0 = d->op0;
12838 in1 = d->op1;
12839 if (BYTES_BIG_ENDIAN)
12841 x = in0, in0 = in1, in1 = x;
12842 odd = !odd;
12844 out = d->target;
12846 if (odd)
12848 switch (vmode)
12850 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12851 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12852 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12853 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12854 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12855 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12856 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12857 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12858 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12859 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12860 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12861 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12862 default:
12863 return false;
12866 else
12868 switch (vmode)
12870 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12871 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12872 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12873 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12874 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12875 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12876 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12877 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12878 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12879 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12880 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12881 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12882 default:
12883 return false;
12887 emit_insn (gen (out, in0, in1));
12888 return true;
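/* Example selectors this accepts for V4SI with two inputs (before the
   big-endian swap): { 0, 4, 2, 6 } maps to TRN1 and { 1, 5, 3, 7 } to
   TRN2.  */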
12891 /* Recognize patterns suitable for the UZP instructions. */
12892 static bool
12893 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12895 unsigned int i, odd, mask, nelt = d->nelt;
12896 rtx out, in0, in1, x;
12897 rtx (*gen) (rtx, rtx, rtx);
12898 machine_mode vmode = d->vmode;
12900 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12901 return false;
12903 /* Note that these are little-endian tests.
12904 We correct for big-endian later. */
12905 if (d->perm[0] == 0)
12906 odd = 0;
12907 else if (d->perm[0] == 1)
12908 odd = 1;
12909 else
12910 return false;
12911 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12913 for (i = 0; i < nelt; i++)
12915 unsigned elt = (i * 2 + odd) & mask;
12916 if (d->perm[i] != elt)
12917 return false;
12920 /* Success! */
12921 if (d->testing_p)
12922 return true;
12924 in0 = d->op0;
12925 in1 = d->op1;
12926 if (BYTES_BIG_ENDIAN)
12928 x = in0, in0 = in1, in1 = x;
12929 odd = !odd;
12931 out = d->target;
12933 if (odd)
12935 switch (vmode)
12937 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12938 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12939 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12940 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12941 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12942 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12943 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12944 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12945 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12946 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12947 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12948 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12949 default:
12950 return false;
12953 else
12955 switch (vmode)
12957 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12958 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12959 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12960 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12961 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12962 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12963 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12964 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12965 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12966 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12967 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12968 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12969 default:
12970 return false;
12974 emit_insn (gen (out, in0, in1));
12975 return true;
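/* Example for V4SI with two inputs: { 0, 2, 4, 6 } (the even elements)
   maps to UZP1 and { 1, 3, 5, 7 } (the odd elements) to UZP2.  */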
12978 /* Recognize patterns suitable for the ZIP instructions. */
12979 static bool
12980 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12982 unsigned int i, high, mask, nelt = d->nelt;
12983 rtx out, in0, in1, x;
12984 rtx (*gen) (rtx, rtx, rtx);
12985 machine_mode vmode = d->vmode;
12987 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12988 return false;
12990 /* Note that these are little-endian tests.
12991 We correct for big-endian later. */
12992 high = nelt / 2;
12993 if (d->perm[0] == high)
12994 /* Do Nothing. */
12996 else if (d->perm[0] == 0)
12997 high = 0;
12998 else
12999 return false;
13000 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13002 for (i = 0; i < nelt / 2; i++)
13004 unsigned elt = (i + high) & mask;
13005 if (d->perm[i * 2] != elt)
13006 return false;
13007 elt = (elt + nelt) & mask;
13008 if (d->perm[i * 2 + 1] != elt)
13009 return false;
13012 /* Success! */
13013 if (d->testing_p)
13014 return true;
13016 in0 = d->op0;
13017 in1 = d->op1;
13018 if (BYTES_BIG_ENDIAN)
13020 x = in0, in0 = in1, in1 = x;
13021 high = !high;
13023 out = d->target;
13025 if (high)
13027 switch (vmode)
13029 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
13030 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
13031 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
13032 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
13033 case V4SImode: gen = gen_aarch64_zip2v4si; break;
13034 case V2SImode: gen = gen_aarch64_zip2v2si; break;
13035 case V2DImode: gen = gen_aarch64_zip2v2di; break;
13036 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13037 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13038 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13039 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13040 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
13041 default:
13042 return false;
13045 else
13047 switch (vmode)
13049 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
13050 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
13051 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
13052 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
13053 case V4SImode: gen = gen_aarch64_zip1v4si; break;
13054 case V2SImode: gen = gen_aarch64_zip1v2si; break;
13055 case V2DImode: gen = gen_aarch64_zip1v2di; break;
13056 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13057 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13058 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13059 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13060 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13061 default:
13062 return false;
13066 emit_insn (gen (out, in0, in1));
13067 return true;
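/* Example for V4SI with two inputs: { 0, 4, 1, 5 } interleaves the low
   halves (ZIP1) and { 2, 6, 3, 7 } the high halves (ZIP2).  */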
13070 /* Recognize patterns for the EXT insn. */
13072 static bool
13073 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13075 unsigned int i, nelt = d->nelt;
13076 rtx (*gen) (rtx, rtx, rtx, rtx);
13077 rtx offset;
13079 unsigned int location = d->perm[0]; /* Always < nelt. */
13081 /* Check if the extracted indices are increasing by one. */
13082 for (i = 1; i < nelt; i++)
13084 unsigned int required = location + i;
13085 if (d->one_vector_p)
13087 /* We'll pass the same vector in twice, so allow indices to wrap. */
13088 required &= (nelt - 1);
13090 if (d->perm[i] != required)
13091 return false;
13094 switch (d->vmode)
13096 case V16QImode: gen = gen_aarch64_extv16qi; break;
13097 case V8QImode: gen = gen_aarch64_extv8qi; break;
13098 case V4HImode: gen = gen_aarch64_extv4hi; break;
13099 case V8HImode: gen = gen_aarch64_extv8hi; break;
13100 case V2SImode: gen = gen_aarch64_extv2si; break;
13101 case V4SImode: gen = gen_aarch64_extv4si; break;
13102 case V4HFmode: gen = gen_aarch64_extv4hf; break;
13103 case V8HFmode: gen = gen_aarch64_extv8hf; break;
13104 case V2SFmode: gen = gen_aarch64_extv2sf; break;
13105 case V4SFmode: gen = gen_aarch64_extv4sf; break;
13106 case V2DImode: gen = gen_aarch64_extv2di; break;
13107 case V2DFmode: gen = gen_aarch64_extv2df; break;
13108 default:
13109 return false;
13112 /* Success! */
13113 if (d->testing_p)
13114 return true;
13116 /* The case where (location == 0) is a no-op for both big- and little-endian,
13117 and is removed by the mid-end at optimization levels -O1 and higher. */
13119 if (BYTES_BIG_ENDIAN && (location != 0))
13121 /* After setup, we want the high elements of the first vector (stored
13122 at the LSB end of the register), and the low elements of the second
13123 vector (stored at the MSB end of the register). So swap. */
13124 std::swap (d->op0, d->op1);
13125 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13126 location = nelt - location;
13129 offset = GEN_INT (location);
13130 emit_insn (gen (d->target, d->op0, d->op1, offset));
13131 return true;
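/* Example: on V16QI a selector of { 3, 4, ..., 18 } becomes EXT with an
   offset of 3; with a single input the indices simply wrap around 15.  */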
13134 /* Recognize patterns for the REV insns. */
13136 static bool
13137 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13139 unsigned int i, j, diff, nelt = d->nelt;
13140 rtx (*gen) (rtx, rtx);
13142 if (!d->one_vector_p)
13143 return false;
13145 diff = d->perm[0];
13146 switch (diff)
13148 case 7:
13149 switch (d->vmode)
13151 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13152 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13153 default:
13154 return false;
13156 break;
13157 case 3:
13158 switch (d->vmode)
13160 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13161 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13162 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13163 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13164 default:
13165 return false;
13167 break;
13168 case 1:
13169 switch (d->vmode)
13171 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13172 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13173 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13174 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13175 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13176 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13177 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13178 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13179 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13180 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13181 default:
13182 return false;
13184 break;
13185 default:
13186 return false;
13189 for (i = 0; i < nelt ; i += diff + 1)
13190 for (j = 0; j <= diff; j += 1)
13192 /* This is guaranteed to be true as the value of diff
13193 is 7, 3 or 1 and we should have enough elements in the
13194 queue to generate this. Getting a vector mask with a
13195 value of diff other than these values implies that
13196 something is wrong by the time we get here. */
13197 gcc_assert (i + j < nelt);
13198 if (d->perm[i + j] != i + diff - j)
13199 return false;
13202 /* Success! */
13203 if (d->testing_p)
13204 return true;
13206 emit_insn (gen (d->target, d->op0));
13207 return true;
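/* Examples: { 1, 0, 3, 2 } on V4SI is REV64 (swap within each 64-bit
   chunk), and { 3, 2, 1, 0, 7, 6, 5, 4, ... } on V16QI is REV32.  */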
13210 static bool
13211 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13213 rtx (*gen) (rtx, rtx, rtx);
13214 rtx out = d->target;
13215 rtx in0;
13216 machine_mode vmode = d->vmode;
13217 unsigned int i, elt, nelt = d->nelt;
13218 rtx lane;
13220 elt = d->perm[0];
13221 for (i = 1; i < nelt; i++)
13223 if (elt != d->perm[i])
13224 return false;
13227 /* The generic preparation in aarch64_expand_vec_perm_const_1
13228 swaps the operand order and the permute indices if it finds
13229 d->perm[0] to be in the second operand. Thus, we can always
13230 use d->op0 and need not do any extra arithmetic to get the
13231 correct lane number. */
13232 in0 = d->op0;
13233 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13235 switch (vmode)
13237 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13238 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13239 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13240 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13241 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13242 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13243 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13244 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13245 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13246 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13247 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13248 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13249 default:
13250 return false;
13253 emit_insn (gen (out, in0, lane));
13254 return true;
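/* Example: a selector of { 2, 2, 2, 2 } on V4SI is handled as a single
   DUP from lane 2 of the (possibly swapped) first operand.  */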
13257 static bool
13258 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13260 rtx rperm[MAX_VECT_LEN], sel;
13261 machine_mode vmode = d->vmode;
13262 unsigned int i, nelt = d->nelt;
13264 if (d->testing_p)
13265 return true;
13267 /* Generic code will try constant permutation twice: once with the
13268 original mode and again with the elements lowered to QImode.
13269 So wait and don't do the selector expansion ourselves. */
13270 if (vmode != V8QImode && vmode != V16QImode)
13271 return false;
13273 for (i = 0; i < nelt; ++i)
13275 int nunits = GET_MODE_NUNITS (vmode);
13277 /* If big-endian and two vectors we end up with a weird mixed-endian
13278 mode on NEON. Reverse the index within each word but not the word
13279 itself. */
13280 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13281 : d->perm[i]);
13283 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13284 sel = force_reg (vmode, sel);
13286 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13287 return true;
13290 static bool
13291 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13293 /* The pattern matching functions above are written to look for a small
13294 number to begin the sequence (0, 1, N/2). If we begin with an index
13295 from the second operand, we can swap the operands. */
13296 if (d->perm[0] >= d->nelt)
13298 unsigned i, nelt = d->nelt;
13300 gcc_assert (nelt == (nelt & -nelt));
13301 for (i = 0; i < nelt; ++i)
13302 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13304 std::swap (d->op0, d->op1);
13307 if (TARGET_SIMD)
13309 if (aarch64_evpc_rev (d))
13310 return true;
13311 else if (aarch64_evpc_ext (d))
13312 return true;
13313 else if (aarch64_evpc_dup (d))
13314 return true;
13315 else if (aarch64_evpc_zip (d))
13316 return true;
13317 else if (aarch64_evpc_uzp (d))
13318 return true;
13319 else if (aarch64_evpc_trn (d))
13320 return true;
13321 return aarch64_evpc_tbl (d);
13323 return false;
13326 /* Expand a vec_perm_const pattern. */
13328 bool
13329 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13331 struct expand_vec_perm_d d;
13332 int i, nelt, which;
13334 d.target = target;
13335 d.op0 = op0;
13336 d.op1 = op1;
13338 d.vmode = GET_MODE (target);
13339 gcc_assert (VECTOR_MODE_P (d.vmode));
13340 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13341 d.testing_p = false;
13343 for (i = which = 0; i < nelt; ++i)
13345 rtx e = XVECEXP (sel, 0, i);
13346 int ei = INTVAL (e) & (2 * nelt - 1);
13347 which |= (ei < nelt ? 1 : 2);
13348 d.perm[i] = ei;
13351 switch (which)
13353 default:
13354 gcc_unreachable ();
13356 case 3:
13357 d.one_vector_p = false;
13358 if (!rtx_equal_p (op0, op1))
13359 break;
13361 /* The elements of PERM do not suggest that only the first operand
13362 is used, but both operands are identical. Allow easier matching
13363 of the permutation by folding the permutation into the single
13364 input vector. */
13365 /* Fall Through. */
13366 case 2:
13367 for (i = 0; i < nelt; ++i)
13368 d.perm[i] &= nelt - 1;
13369 d.op0 = op1;
13370 d.one_vector_p = true;
13371 break;
13373 case 1:
13374 d.op1 = op0;
13375 d.one_vector_p = true;
13376 break;
13379 return aarch64_expand_vec_perm_const_1 (&d);
13382 static bool
13383 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13384 const unsigned char *sel)
13386 struct expand_vec_perm_d d;
13387 unsigned int i, nelt, which;
13388 bool ret;
13390 d.vmode = vmode;
13391 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13392 d.testing_p = true;
13393 memcpy (d.perm, sel, nelt);
13395 /* Calculate whether all elements are in one vector. */
13396 for (i = which = 0; i < nelt; ++i)
13398 unsigned char e = d.perm[i];
13399 gcc_assert (e < 2 * nelt);
13400 which |= (e < nelt ? 1 : 2);
13403 /* If all elements are from the second vector, reindex as if from the
13404 first vector. */
13405 if (which == 2)
13406 for (i = 0; i < nelt; ++i)
13407 d.perm[i] -= nelt;
13409 /* Check whether the mask can be applied to a single vector. */
13410 d.one_vector_p = (which != 3);
13412 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13413 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13414 if (!d.one_vector_p)
13415 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13417 start_sequence ();
13418 ret = aarch64_expand_vec_perm_const_1 (&d);
13419 end_sequence ();
13421 return ret;
13424 rtx
13425 aarch64_reverse_mask (enum machine_mode mode)
13427 /* We have to reverse each vector because we don't have
13428 a permuted load that can reverse-load according to ABI rules. */
13429 rtx mask;
13430 rtvec v = rtvec_alloc (16);
13431 int i, j;
13432 int nunits = GET_MODE_NUNITS (mode);
13433 int usize = GET_MODE_UNIT_SIZE (mode);
13435 gcc_assert (BYTES_BIG_ENDIAN);
13436 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13438 for (i = 0; i < nunits; i++)
13439 for (j = 0; j < usize; j++)
13440 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13441 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13442 return force_reg (V16QImode, mask);
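/* Example: for V4SI the byte selector built here is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. each
   32-bit element is byte-reversed in place.  */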
13445 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13446 However, due to issues with register allocation it is preferable to avoid
13447 tying integer scalar and FP scalar modes. Executing integer operations
13448 in general registers is better than treating them as scalar vector
13449 operations. This reduces latency and avoids redundant int<->FP moves.
13450 So tie modes if they are either the same class, or vector modes with
13451 other vector modes, vector structs or any scalar mode.
13454 bool
13455 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13457 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13458 return true;
13460 /* We specifically want to allow elements of "structure" modes to
13461 be tieable to the structure. This more general condition allows
13462 other rarer situations too. */
13463 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13464 return true;
13466 /* Also allow any scalar modes with vectors. */
13467 if (aarch64_vector_mode_supported_p (mode1)
13468 || aarch64_vector_mode_supported_p (mode2))
13469 return true;
13471 return false;
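/* For example, DImode ties with SImode (same class) and V4SImode ties with
   DFmode (vector with scalar), but DImode does not tie with DFmode, which
   is exactly the int<->FP pairing the comment above wants to avoid.  */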
13474 /* Return a new RTX holding the result of moving POINTER forward by
13475 AMOUNT bytes. */
13477 static rtx
13478 aarch64_move_pointer (rtx pointer, int amount)
13480 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13482 return adjust_automodify_address (pointer, GET_MODE (pointer),
13483 next, amount);
13486 /* Return a new RTX holding the result of moving POINTER forward by the
13487 size of the mode it points to. */
13489 static rtx
13490 aarch64_progress_pointer (rtx pointer)
13492 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13494 return aarch64_move_pointer (pointer, amount);
13497 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13498 MODE bytes. */
13500 static void
13501 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13502 machine_mode mode)
13504 rtx reg = gen_reg_rtx (mode);
13506 /* "Cast" the pointers to the correct mode. */
13507 *src = adjust_address (*src, mode, 0);
13508 *dst = adjust_address (*dst, mode, 0);
13509 /* Emit the memcpy. */
13510 emit_move_insn (reg, *src);
13511 emit_move_insn (*dst, reg);
13512 /* Move the pointers forward. */
13513 *src = aarch64_progress_pointer (*src);
13514 *dst = aarch64_progress_pointer (*dst);
13517 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13518 we succeed, otherwise return false. */
13520 bool
13521 aarch64_expand_movmem (rtx *operands)
13523 unsigned int n;
13524 rtx dst = operands[0];
13525 rtx src = operands[1];
13526 rtx base;
13527 bool speed_p = !optimize_function_for_size_p (cfun);
13529 /* When optimizing for size, give a better estimate of the length of a
13530 memcpy call, but use the default otherwise. */
13531 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13533 /* We can't do anything smart if the amount to copy is not constant. */
13534 if (!CONST_INT_P (operands[2]))
13535 return false;
13537 n = UINTVAL (operands[2]);
13539 /* Try to keep the number of instructions low. For cases below 16 bytes we
13540 need to make at most two moves. For cases above 16 bytes it will be one
13541 move for each 16 byte chunk, then at most two additional moves. */
13542 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13543 return false;
13545 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13546 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13548 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13549 src = adjust_automodify_address (src, VOIDmode, base, 0);
13551 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13552 1-byte chunk. */
13553 if (n < 4)
13555 if (n >= 2)
13557 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13558 n -= 2;
13561 if (n == 1)
13562 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13564 return true;
13567 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13568 4-byte chunk, partially overlapping with the previously copied chunk. */
13569 if (n < 8)
13571 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13572 n -= 4;
13573 if (n > 0)
13575 int move = n - 4;
13577 src = aarch64_move_pointer (src, move);
13578 dst = aarch64_move_pointer (dst, move);
13579 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13581 return true;
13584 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13585 them, then (if applicable) an 8-byte chunk. */
13586 while (n >= 8)
13588 if (n / 16)
13590 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13591 n -= 16;
13593 else
13595 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13596 n -= 8;
13600 /* Finish the final bytes of the copy. We can always do this in one
13601 instruction. We either copy the exact amount we need, or partially
13602 overlap with the previous chunk we copied and copy 8 bytes. */
13603 if (n == 0)
13604 return true;
13605 else if (n == 1)
13606 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13607 else if (n == 2)
13608 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13609 else if (n == 4)
13610 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13611 else
13613 if (n == 3)
13615 src = aarch64_move_pointer (src, -1);
13616 dst = aarch64_move_pointer (dst, -1);
13617 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13619 else
13621 int move = n - 8;
13623 src = aarch64_move_pointer (src, move);
13624 dst = aarch64_move_pointer (dst, move);
13625 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13629 return true;
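/* Worked example: a 13-byte copy becomes one 8-byte load/store for bytes
   0-7 followed by a second 8-byte load/store for bytes 5-12, overlapping
   by three bytes rather than issuing separate 4-, 2- and 1-byte tail
   copies.  */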
13632 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13633 SImode stores. Handle the case when the constant has identical
13634 bottom and top halves. This is beneficial when the two stores can be
13635 merged into an STP and we avoid synthesising potentially expensive
13636 immediates twice. Return true if such a split is possible. */
13638 bool
13639 aarch64_split_dimode_const_store (rtx dst, rtx src)
13641 rtx lo = gen_lowpart (SImode, src);
13642 rtx hi = gen_highpart_mode (SImode, DImode, src);
13644 bool size_p = optimize_function_for_size_p (cfun);
13646 if (!rtx_equal_p (lo, hi))
13647 return false;
13649 unsigned int orig_cost
13650 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13651 unsigned int lo_cost
13652 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13654 /* We want to transform:
13655 MOV x1, 49370
13656 MOVK x1, 0x140, lsl 16
13657 MOVK x1, 0xc0da, lsl 32
13658 MOVK x1, 0x140, lsl 48
13659 STR x1, [x0]
13660 into:
13661 MOV w1, 49370
13662 MOVK w1, 0x140, lsl 16
13663 STP w1, w1, [x0]
13664 So we want to perform this only when we save two instructions
13665 or more. When optimizing for size, however, accept any code size
13666 savings we can. */
13667 if (size_p && orig_cost <= lo_cost)
13668 return false;
13670 if (!size_p
13671 && (orig_cost <= lo_cost + 1))
13672 return false;
13674 rtx mem_lo = adjust_address (dst, SImode, 0);
13675 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13676 return false;
13678 rtx tmp_reg = gen_reg_rtx (SImode);
13679 aarch64_expand_mov_immediate (tmp_reg, lo);
13680 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13681 /* Don't emit an explicit store pair as this may not always be profitable.
13682 Let the sched-fusion logic decide whether to merge them. */
13683 emit_move_insn (mem_lo, tmp_reg);
13684 emit_move_insn (mem_hi, tmp_reg);
13686 return true;
13689 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13691 static unsigned HOST_WIDE_INT
13692 aarch64_asan_shadow_offset (void)
13694 return (HOST_WIDE_INT_1 << 36);
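/* With ASan's usual 1/8th shadow mapping this places the shadow byte for
   an address A at (A >> 3) + (1 << 36); the generic asan.c code combines
   the offset with that scaling.  */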
13697 static bool
13698 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13699 unsigned int align,
13700 enum by_pieces_operation op,
13701 bool speed_p)
13703 /* STORE_BY_PIECES can be used when copying a constant string, but
13704 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13705 For now we always fail this and let the move_by_pieces code copy
13706 the string from read-only memory. */
13707 if (op == STORE_BY_PIECES)
13708 return false;
13710 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13713 static rtx
13714 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13715 int code, tree treeop0, tree treeop1)
13717 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13718 rtx op0, op1;
13719 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13720 insn_code icode;
13721 struct expand_operand ops[4];
13723 start_sequence ();
13724 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13726 op_mode = GET_MODE (op0);
13727 if (op_mode == VOIDmode)
13728 op_mode = GET_MODE (op1);
13730 switch (op_mode)
13732 case QImode:
13733 case HImode:
13734 case SImode:
13735 cmp_mode = SImode;
13736 icode = CODE_FOR_cmpsi;
13737 break;
13739 case DImode:
13740 cmp_mode = DImode;
13741 icode = CODE_FOR_cmpdi;
13742 break;
13744 case SFmode:
13745 cmp_mode = SFmode;
13746 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13747 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13748 break;
13750 case DFmode:
13751 cmp_mode = DFmode;
13752 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13753 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13754 break;
13756 default:
13757 end_sequence ();
13758 return NULL_RTX;
13761 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13762 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13763 if (!op0 || !op1)
13765 end_sequence ();
13766 return NULL_RTX;
13768 *prep_seq = get_insns ();
13769 end_sequence ();
13771 create_fixed_operand (&ops[0], op0);
13772 create_fixed_operand (&ops[1], op1);
13774 start_sequence ();
13775 if (!maybe_expand_insn (icode, 2, ops))
13777 end_sequence ();
13778 return NULL_RTX;
13780 *gen_seq = get_insns ();
13781 end_sequence ();
13783 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13784 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13787 static rtx
13788 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13789 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13791 rtx op0, op1, target;
13792 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13793 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13794 insn_code icode;
13795 struct expand_operand ops[6];
13796 int aarch64_cond;
13798 push_to_sequence (*prep_seq);
13799 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13801 op_mode = GET_MODE (op0);
13802 if (op_mode == VOIDmode)
13803 op_mode = GET_MODE (op1);
13805 switch (op_mode)
13807 case QImode:
13808 case HImode:
13809 case SImode:
13810 cmp_mode = SImode;
13811 icode = CODE_FOR_ccmpsi;
13812 break;
13814 case DImode:
13815 cmp_mode = DImode;
13816 icode = CODE_FOR_ccmpdi;
13817 break;
13819 case SFmode:
13820 cmp_mode = SFmode;
13821 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13822 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13823 break;
13825 case DFmode:
13826 cmp_mode = DFmode;
13827 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13828 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13829 break;
13831 default:
13832 end_sequence ();
13833 return NULL_RTX;
13836 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13837 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13838 if (!op0 || !op1)
13840 end_sequence ();
13841 return NULL_RTX;
13843 *prep_seq = get_insns ();
13844 end_sequence ();
13846 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13847 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13849 if (bit_code != AND)
13851 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13852 GET_MODE (XEXP (prev, 0))),
13853 VOIDmode, XEXP (prev, 0), const0_rtx);
13854 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13857 create_fixed_operand (&ops[0], XEXP (prev, 0));
13858 create_fixed_operand (&ops[1], target);
13859 create_fixed_operand (&ops[2], op0);
13860 create_fixed_operand (&ops[3], op1);
13861 create_fixed_operand (&ops[4], prev);
13862 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13864 push_to_sequence (*gen_seq);
13865 if (!maybe_expand_insn (icode, 6, ops))
13867 end_sequence ();
13868 return NULL_RTX;
13871 *gen_seq = get_insns ();
13872 end_sequence ();
13874 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13877 #undef TARGET_GEN_CCMP_FIRST
13878 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13880 #undef TARGET_GEN_CCMP_NEXT
13881 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13883 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13884 instruction fusion of some sort. */
13886 static bool
13887 aarch64_macro_fusion_p (void)
13889 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13893 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13894 should be kept together during scheduling. */
13896 static bool
13897 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13899 rtx set_dest;
13900 rtx prev_set = single_set (prev);
13901 rtx curr_set = single_set (curr);
13902 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13903 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13905 if (!aarch64_macro_fusion_p ())
13906 return false;
13908 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13910 /* We are trying to match:
13911 prev (mov) == (set (reg r0) (const_int imm16))
13912 curr (movk) == (set (zero_extract (reg r0)
13913 (const_int 16)
13914 (const_int 16))
13915 (const_int imm16_1)) */
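/* In assembly terms this is, illustratively (register and immediates
   chosen arbitrarily):
     mov  w0, #0x1234
     movk w0, #0x5678, lsl 16  */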
13917 set_dest = SET_DEST (curr_set);
13919 if (GET_CODE (set_dest) == ZERO_EXTRACT
13920 && CONST_INT_P (SET_SRC (curr_set))
13921 && CONST_INT_P (SET_SRC (prev_set))
13922 && CONST_INT_P (XEXP (set_dest, 2))
13923 && INTVAL (XEXP (set_dest, 2)) == 16
13924 && REG_P (XEXP (set_dest, 0))
13925 && REG_P (SET_DEST (prev_set))
13926 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13928 return true;
13932 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13935 /* We're trying to match:
13936 prev (adrp) == (set (reg r1)
13937 (high (symbol_ref ("SYM"))))
13938 curr (add) == (set (reg r0)
13939 (lo_sum (reg r1)
13940 (symbol_ref ("SYM"))))
13941 Note that r0 need not necessarily be the same as r1, especially
13942 during pre-regalloc scheduling. */
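/* In assembly terms this is the usual address-materialization pair,
   illustratively (registers chosen arbitrarily):
     adrp x1, sym
     add  x0, x1, :lo12:sym  */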
13944 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13945 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13947 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13948 && REG_P (XEXP (SET_SRC (curr_set), 0))
13949 && REGNO (XEXP (SET_SRC (curr_set), 0))
13950 == REGNO (SET_DEST (prev_set))
13951 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13952 XEXP (SET_SRC (curr_set), 1)))
13953 return true;
13957 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13960 /* We're trying to match:
13961 prev (movk) == (set (zero_extract (reg r0)
13962 (const_int 16)
13963 (const_int 32))
13964 (const_int imm16_1))
13965 curr (movk) == (set (zero_extract (reg r0)
13966 (const_int 16)
13967 (const_int 48))
13968 (const_int imm16_2)) */
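/* In assembly terms, illustratively (register and immediates chosen
   arbitrarily):
     movk x0, #0x1234, lsl 32
     movk x0, #0x5678, lsl 48  */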
13970 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13971 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13972 && REG_P (XEXP (SET_DEST (prev_set), 0))
13973 && REG_P (XEXP (SET_DEST (curr_set), 0))
13974 && REGNO (XEXP (SET_DEST (prev_set), 0))
13975 == REGNO (XEXP (SET_DEST (curr_set), 0))
13976 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13977 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13978 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13979 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13980 && CONST_INT_P (SET_SRC (prev_set))
13981 && CONST_INT_P (SET_SRC (curr_set)))
13982 return true;
13985 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13987 /* We're trying to match:
13988 prev (adrp) == (set (reg r0)
13989 (high (symbol_ref ("SYM"))))
13990 curr (ldr) == (set (reg r1)
13991 (mem (lo_sum (reg r0)
13992 (symbol_ref ("SYM")))))
13994 curr (ldr) == (set (reg r1)
13995 (zero_extend (mem
13996 (lo_sum (reg r0)
13997 (symbol_ref ("SYM")))))) */
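/* In assembly terms, illustratively (registers chosen arbitrarily):
     adrp x0, sym
     ldr  x1, [x0, #:lo12:sym]  */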
13998 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13999 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14001 rtx curr_src = SET_SRC (curr_set);
14003 if (GET_CODE (curr_src) == ZERO_EXTEND)
14004 curr_src = XEXP (curr_src, 0);
14006 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14007 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14008 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14009 == REGNO (SET_DEST (prev_set))
14010 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14011 XEXP (SET_SRC (prev_set), 0)))
14012 return true;
14016 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14017 && aarch_crypto_can_dual_issue (prev, curr))
14018 return true;
14020 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14021 && any_condjump_p (curr))
14023 enum attr_type prev_type = get_attr_type (prev);
14025 /* FIXME: this misses some instructions that are considered simple
14026 arithmetic instructions for ThunderX.  Simple shifts are missed here. */
14027 if (prev_type == TYPE_ALUS_SREG
14028 || prev_type == TYPE_ALUS_IMM
14029 || prev_type == TYPE_LOGICS_REG
14030 || prev_type == TYPE_LOGICS_IMM)
14031 return true;
14034 return false;
14037 /* Return true iff the instruction fusion described by OP is enabled. */
14039 bool
14040 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14042 return (aarch64_tune_params.fusible_ops & op) != 0;
14045 /* If MEM is in the form of [base+offset], extract the two parts of the
14046 address and store them in BASE and OFFSET; otherwise return false
14047 after clearing BASE and OFFSET. */
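/* For example (illustrative): given (mem (plus (reg x1) (const_int 16)))
   this sets *BASE to the x1 register rtx and *OFFSET to (const_int 16);
   for a bare (mem (reg x1)) it sets *OFFSET to const0_rtx.  */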
14049 bool
14050 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14052 rtx addr;
14054 gcc_assert (MEM_P (mem));
14056 addr = XEXP (mem, 0);
14058 if (REG_P (addr))
14060 *base = addr;
14061 *offset = const0_rtx;
14062 return true;
14065 if (GET_CODE (addr) == PLUS
14066 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14068 *base = XEXP (addr, 0);
14069 *offset = XEXP (addr, 1);
14070 return true;
14073 *base = NULL_RTX;
14074 *offset = NULL_RTX;
14076 return false;
14079 /* Types for scheduling fusion. */
14080 enum sched_fusion_type
14082 SCHED_FUSION_NONE = 0,
14083 SCHED_FUSION_LD_SIGN_EXTEND,
14084 SCHED_FUSION_LD_ZERO_EXTEND,
14085 SCHED_FUSION_LD,
14086 SCHED_FUSION_ST,
14087 SCHED_FUSION_NUM
14090 /* If INSN is a load or store with an address in the form of [base+offset],
14091 extract the two parts and store them in BASE and OFFSET.  Return the
14092 scheduling fusion type of this INSN. */
14094 static enum sched_fusion_type
14095 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14097 rtx x, dest, src;
14098 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14100 gcc_assert (INSN_P (insn));
14101 x = PATTERN (insn);
14102 if (GET_CODE (x) != SET)
14103 return SCHED_FUSION_NONE;
14105 src = SET_SRC (x);
14106 dest = SET_DEST (x);
14108 machine_mode dest_mode = GET_MODE (dest);
14110 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14111 return SCHED_FUSION_NONE;
14113 if (GET_CODE (src) == SIGN_EXTEND)
14115 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14116 src = XEXP (src, 0);
14117 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14118 return SCHED_FUSION_NONE;
14120 else if (GET_CODE (src) == ZERO_EXTEND)
14122 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14123 src = XEXP (src, 0);
14124 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14125 return SCHED_FUSION_NONE;
14128 if (GET_CODE (src) == MEM && REG_P (dest))
14129 extract_base_offset_in_addr (src, base, offset);
14130 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14132 fusion = SCHED_FUSION_ST;
14133 extract_base_offset_in_addr (dest, base, offset);
14135 else
14136 return SCHED_FUSION_NONE;
14138 if (*base == NULL_RTX || *offset == NULL_RTX)
14139 fusion = SCHED_FUSION_NONE;
14141 return fusion;
14144 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14146 Currently we only support fusing ldr or str instructions, so FUSION_PRI
14147 and PRI are only calculated for these instructions.  For other instructions,
14148 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, fusion of
14149 other instruction types can be added by returning different priorities.
14151 It's important that irrelevant instructions get the largest FUSION_PRI. */
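/* For example (illustrative): two ldr insns using the same base register
   receive the same FUSION_PRI, and the one with the smaller offset gets
   the larger PRI, so it is scheduled first.  */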
14153 static void
14154 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14155 int *fusion_pri, int *pri)
14157 int tmp, off_val;
14158 rtx base, offset;
14159 enum sched_fusion_type fusion;
14161 gcc_assert (INSN_P (insn));
14163 tmp = max_pri - 1;
14164 fusion = fusion_load_store (insn, &base, &offset);
14165 if (fusion == SCHED_FUSION_NONE)
14167 *pri = tmp;
14168 *fusion_pri = tmp;
14169 return;
14172 /* Set FUSION_PRI according to fusion type and base register. */
14173 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14175 /* Calculate PRI. */
14176 tmp /= 2;
14178 /* INSN with smaller offset goes first. */
14179 off_val = (int)(INTVAL (offset));
14180 if (off_val >= 0)
14181 tmp -= (off_val & 0xfffff);
14182 else
14183 tmp += ((- off_val) & 0xfffff);
14185 *pri = tmp;
14186 return;
14189 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14190 Adjust priority of sha1h instructions so they are scheduled before
14191 other SHA1 instructions. */
14193 static int
14194 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14196 rtx x = PATTERN (insn);
14198 if (GET_CODE (x) == SET)
14200 x = SET_SRC (x);
14202 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14203 return priority + 10;
14206 return priority;
14209 /* Given OPERANDS of consecutive load/store, check if we can merge
14210 them into ldp/stp. LOAD is true if they are load instructions.
14211 MODE is the mode of memory operands. */
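/* For example (illustrative), a pair such as
     ldr w0, [x2, 8]
     ldr w1, [x2, 12]
   can be merged into
     ldp w0, w1, [x2, 8]  */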
14213 bool
14214 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14215 enum machine_mode mode)
14217 HOST_WIDE_INT offval_1, offval_2, msize;
14218 enum reg_class rclass_1, rclass_2;
14219 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14221 if (load)
14223 mem_1 = operands[1];
14224 mem_2 = operands[3];
14225 reg_1 = operands[0];
14226 reg_2 = operands[2];
14227 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14228 if (REGNO (reg_1) == REGNO (reg_2))
14229 return false;
14231 else
14233 mem_1 = operands[0];
14234 mem_2 = operands[2];
14235 reg_1 = operands[1];
14236 reg_2 = operands[3];
14239 /* The mems cannot be volatile. */
14240 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14241 return false;
14243 /* If we have SImode and slow unaligned ldp,
14244 check that the alignment is at least 8 bytes. */
14245 if (mode == SImode
14246 && (aarch64_tune_params.extra_tuning_flags
14247 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14248 && !optimize_size
14249 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14250 return false;
14252 /* Check if the addresses are in the form of [base+offset]. */
14253 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14254 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14255 return false;
14256 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14257 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14258 return false;
14260 /* Check if the bases are the same. */
14261 if (!rtx_equal_p (base_1, base_2))
14262 return false;
14264 offval_1 = INTVAL (offset_1);
14265 offval_2 = INTVAL (offset_2);
14266 msize = GET_MODE_SIZE (mode);
14267 /* Check if the offsets are consecutive. */
14268 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14269 return false;
14271 /* Check if the addresses are clobbered by load. */
14272 if (load)
14274 if (reg_mentioned_p (reg_1, mem_1))
14275 return false;
14277 /* In increasing order, the last load can clobber the address. */
14278 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14279 return false;
14282 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14283 rclass_1 = FP_REGS;
14284 else
14285 rclass_1 = GENERAL_REGS;
14287 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14288 rclass_2 = FP_REGS;
14289 else
14290 rclass_2 = GENERAL_REGS;
14292 /* Check if the registers are of the same class. */
14293 if (rclass_1 != rclass_2)
14294 return false;
14296 return true;
14299 /* Given OPERANDS of consecutive load/store, check if we can merge
14300 them into ldp/stp by adjusting the offset. LOAD is true if they
14301 are load instructions. MODE is the mode of memory operands.
14303 Given the following consecutive stores:
14305 str w1, [xb, 0x100]
14306 str w1, [xb, 0x104]
14307 str w1, [xb, 0x108]
14308 str w1, [xb, 0x10c]
14310 Though the offsets are out of the range supported by stp, we can
14311 still pair them after adjusting the offset, like:
14313 add scratch, xb, 0x100
14314 stp w1, w1, [scratch]
14315 stp w1, w1, [scratch, 0x8]
14317 The peephole patterns detecting this opportunity should guarantee
14318 the scratch register is available. */
14320 bool
14321 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14322 enum machine_mode mode)
14324 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14325 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14326 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14327 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14329 if (load)
14331 reg_1 = operands[0];
14332 mem_1 = operands[1];
14333 reg_2 = operands[2];
14334 mem_2 = operands[3];
14335 reg_3 = operands[4];
14336 mem_3 = operands[5];
14337 reg_4 = operands[6];
14338 mem_4 = operands[7];
14339 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14340 && REG_P (reg_3) && REG_P (reg_4));
14341 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14342 return false;
14344 else
14346 mem_1 = operands[0];
14347 reg_1 = operands[1];
14348 mem_2 = operands[2];
14349 reg_2 = operands[3];
14350 mem_3 = operands[4];
14351 reg_3 = operands[5];
14352 mem_4 = operands[6];
14353 reg_4 = operands[7];
14355 /* Skip if the memory operand is by itself valid for ldp/stp. */
14356 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14357 return false;
14359 /* The mems cannot be volatile. */
14360 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14361 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14362 return false;
14364 /* Check if the addresses are in the form of [base+offset]. */
14365 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14366 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14367 return false;
14368 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14369 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14370 return false;
14371 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14372 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14373 return false;
14374 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14375 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14376 return false;
14378 /* Check if the bases are the same. */
14379 if (!rtx_equal_p (base_1, base_2)
14380 || !rtx_equal_p (base_2, base_3)
14381 || !rtx_equal_p (base_3, base_4))
14382 return false;
14384 offval_1 = INTVAL (offset_1);
14385 offval_2 = INTVAL (offset_2);
14386 offval_3 = INTVAL (offset_3);
14387 offval_4 = INTVAL (offset_4);
14388 msize = GET_MODE_SIZE (mode);
14389 /* Check if the offsets are consecutive. */
14390 if ((offval_1 != (offval_2 + msize)
14391 || offval_1 != (offval_3 + msize * 2)
14392 || offval_1 != (offval_4 + msize * 3))
14393 && (offval_4 != (offval_3 + msize)
14394 || offval_4 != (offval_2 + msize * 2)
14395 || offval_4 != (offval_1 + msize * 3)))
14396 return false;
14398 /* Check if the addresses are clobbered by load. */
14399 if (load)
14401 if (reg_mentioned_p (reg_1, mem_1)
14402 || reg_mentioned_p (reg_2, mem_2)
14403 || reg_mentioned_p (reg_3, mem_3))
14404 return false;
14406 /* In increasing order, the last load can clobber the address. */
14407 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14408 return false;
14411 /* If we have SImode and slow unaligned ldp,
14412 check that the alignment is at least 8 bytes. */
14413 if (mode == SImode
14414 && (aarch64_tune_params.extra_tuning_flags
14415 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14416 && !optimize_size
14417 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14418 return false;
14420 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14421 rclass_1 = FP_REGS;
14422 else
14423 rclass_1 = GENERAL_REGS;
14425 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14426 rclass_2 = FP_REGS;
14427 else
14428 rclass_2 = GENERAL_REGS;
14430 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14431 rclass_3 = FP_REGS;
14432 else
14433 rclass_3 = GENERAL_REGS;
14435 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14436 rclass_4 = FP_REGS;
14437 else
14438 rclass_4 = GENERAL_REGS;
14440 /* Check if the registers are of the same class. */
14441 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14442 return false;
14444 return true;
14447 /* Given OPERANDS of consecutive load/store, this function pairs them
14448 into ldp/stp after adjusting the offset. It depends on the fact
14449 that addresses of load/store instructions are in increasing order.
14450 MODE is the mode of memory operands. CODE is the rtl operator
14451 which should be applied to all memory operands, it's SIGN_EXTEND,
14452 ZERO_EXTEND or UNKNOWN. */
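/* Illustrative walk-through of the offset arithmetic below for the SImode
   stores shown above: msize is 4, so stp_off_limit is 0x100; an offset of
   0x100 therefore splits into adj_off 0x100 and new_off 0, producing the
   add/stp/stp sequence in that example.  */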
14454 bool
14455 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14456 enum machine_mode mode, RTX_CODE code)
14458 rtx base, offset, t1, t2;
14459 rtx mem_1, mem_2, mem_3, mem_4;
14460 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14462 if (load)
14464 mem_1 = operands[1];
14465 mem_2 = operands[3];
14466 mem_3 = operands[5];
14467 mem_4 = operands[7];
14469 else
14471 mem_1 = operands[0];
14472 mem_2 = operands[2];
14473 mem_3 = operands[4];
14474 mem_4 = operands[6];
14475 gcc_assert (code == UNKNOWN);
14478 extract_base_offset_in_addr (mem_1, &base, &offset);
14479 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14481 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14482 msize = GET_MODE_SIZE (mode);
14483 stp_off_limit = msize * 0x40;
14484 off_val = INTVAL (offset);
14485 abs_off = (off_val < 0) ? -off_val : off_val;
14486 new_off = abs_off % stp_off_limit;
14487 adj_off = abs_off - new_off;
14489 /* Further adjust to make sure all offsets are OK. */
14490 if ((new_off + msize * 2) >= stp_off_limit)
14492 adj_off += stp_off_limit;
14493 new_off -= stp_off_limit;
14496 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14497 if (adj_off >= 0x1000)
14498 return false;
14500 if (off_val < 0)
14502 adj_off = -adj_off;
14503 new_off = -new_off;
14506 /* Create new memory references. */
14507 mem_1 = change_address (mem_1, VOIDmode,
14508 plus_constant (DImode, operands[8], new_off));
14510 /* Check if the adjusted address is OK for ldp/stp. */
14511 if (!aarch64_mem_pair_operand (mem_1, mode))
14512 return false;
14514 msize = GET_MODE_SIZE (mode);
14515 mem_2 = change_address (mem_2, VOIDmode,
14516 plus_constant (DImode,
14517 operands[8],
14518 new_off + msize));
14519 mem_3 = change_address (mem_3, VOIDmode,
14520 plus_constant (DImode,
14521 operands[8],
14522 new_off + msize * 2));
14523 mem_4 = change_address (mem_4, VOIDmode,
14524 plus_constant (DImode,
14525 operands[8],
14526 new_off + msize * 3));
14528 if (code == ZERO_EXTEND)
14530 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14531 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14532 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14533 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14535 else if (code == SIGN_EXTEND)
14537 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14538 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14539 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14540 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14543 if (load)
14545 operands[1] = mem_1;
14546 operands[3] = mem_2;
14547 operands[5] = mem_3;
14548 operands[7] = mem_4;
14550 else
14552 operands[0] = mem_1;
14553 operands[2] = mem_2;
14554 operands[4] = mem_3;
14555 operands[6] = mem_4;
14558 /* Emit adjusting instruction. */
14559 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14560 /* Emit ldp/stp instructions. */
14561 t1 = gen_rtx_SET (operands[0], operands[1]);
14562 t2 = gen_rtx_SET (operands[2], operands[3]);
14563 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14564 t1 = gen_rtx_SET (operands[4], operands[5]);
14565 t2 = gen_rtx_SET (operands[6], operands[7]);
14566 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14567 return true;
14570 /* Return true if a pseudo register should be created and used to hold
14571 the GOT address for PIC code. */
14573 bool
14574 aarch64_use_pseudo_pic_reg (void)
14576 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14579 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14581 static int
14582 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14584 switch (XINT (x, 1))
14586 case UNSPEC_GOTSMALLPIC:
14587 case UNSPEC_GOTSMALLPIC28K:
14588 case UNSPEC_GOTTINYPIC:
14589 return 0;
14590 default:
14591 break;
14594 return default_unspec_may_trap_p (x, flags);
14598 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
14599 return the log2 of that value.  Otherwise return -1. */
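/* For example (illustrative): 8.0 yields 3 and 1.0 yields 0, while 0.75,
   -2.0, infinities and NaNs all yield -1.  */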
14602 aarch64_fpconst_pow_of_2 (rtx x)
14604 const REAL_VALUE_TYPE *r;
14606 if (!CONST_DOUBLE_P (x))
14607 return -1;
14609 r = CONST_DOUBLE_REAL_VALUE (x);
14611 if (REAL_VALUE_NEGATIVE (*r)
14612 || REAL_VALUE_ISNAN (*r)
14613 || REAL_VALUE_ISINF (*r)
14614 || !real_isinteger (r, DFmode))
14615 return -1;
14617 return exact_log2 (real_to_integer (r));
14620 /* If X is a vector of equal CONST_DOUBLE values and that value is
14621 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14624 aarch64_vec_fpconst_pow_of_2 (rtx x)
14626 if (GET_CODE (x) != CONST_VECTOR)
14627 return -1;
14629 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14630 return -1;
14632 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14633 if (firstval <= 0)
14634 return -1;
14636 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14637 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14638 return -1;
14640 return firstval;
14643 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14644 to float.
14646 __fp16 always promotes through this hook.
14647 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14648 through the generic excess precision logic rather than here. */
14650 static tree
14651 aarch64_promoted_type (const_tree t)
14653 if (SCALAR_FLOAT_TYPE_P (t)
14654 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14655 return float_type_node;
14657 return NULL_TREE;
14660 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14662 static bool
14663 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14664 optimization_type opt_type)
14666 switch (op)
14668 case rsqrt_optab:
14669 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14671 default:
14672 return true;
14676 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
14677 if MODE is HFmode, and punt to the generic implementation otherwise. */
14679 static bool
14680 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14682 return (mode == HFmode
14683 ? true
14684 : default_libgcc_floating_mode_supported_p (mode));
14687 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14688 if MODE is HFmode, and punt to the generic implementation otherwise. */
14690 static bool
14691 aarch64_scalar_mode_supported_p (machine_mode mode)
14693 return (mode == HFmode
14694 ? true
14695 : default_scalar_mode_supported_p (mode));
14698 /* Set the value of FLT_EVAL_METHOD.
14699 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14701 0: evaluate all operations and constants, whose semantic type has at
14702 most the range and precision of type float, to the range and
14703 precision of float; evaluate all other operations and constants to
14704 the range and precision of the semantic type;
14706 N, where _FloatN is a supported interchange floating type
14707 evaluate all operations and constants, whose semantic type has at
14708 most the range and precision of _FloatN type, to the range and
14709 precision of the _FloatN type; evaluate all other operations and
14710 constants to the range and precision of the semantic type;
14712 If we have the ARMv8.2-A extensions then we support _Float16 in native
14713 precision, so we should set this to 16. Otherwise, we support the type,
14714 but want to evaluate expressions in float precision, so set this to
14715 0. */
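/* Illustrative example of the difference: given
     _Float16 a, b, c;  c = a + b;
   with TARGET_FP_F16INST the addition can be evaluated directly in
   _Float16, whereas with FLT_EVAL_METHOD 0 the operands are promoted to
   float, added, and the result converted back to _Float16.  */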
14717 static enum flt_eval_method
14718 aarch64_excess_precision (enum excess_precision_type type)
14720 switch (type)
14722 case EXCESS_PRECISION_TYPE_FAST:
14723 case EXCESS_PRECISION_TYPE_STANDARD:
14724 /* We can calculate either in 16-bit range and precision or
14725 32-bit range and precision. Make that decision based on whether
14726 we have native support for the ARMv8.2-A 16-bit floating-point
14727 instructions or not. */
14728 return (TARGET_FP_F16INST
14729 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14730 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14731 case EXCESS_PRECISION_TYPE_IMPLICIT:
14732 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14733 default:
14734 gcc_unreachable ();
14736 return FLT_EVAL_METHOD_UNPREDICTABLE;
14739 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
14740 scheduled for speculative execution. Reject the long-running division
14741 and square-root instructions. */
14743 static bool
14744 aarch64_sched_can_speculate_insn (rtx_insn *insn)
14746 switch (get_attr_type (insn))
14748 case TYPE_SDIV:
14749 case TYPE_UDIV:
14750 case TYPE_FDIVS:
14751 case TYPE_FDIVD:
14752 case TYPE_FSQRTS:
14753 case TYPE_FSQRTD:
14754 case TYPE_NEON_FP_SQRT_S:
14755 case TYPE_NEON_FP_SQRT_D:
14756 case TYPE_NEON_FP_SQRT_S_Q:
14757 case TYPE_NEON_FP_SQRT_D_Q:
14758 case TYPE_NEON_FP_DIV_S:
14759 case TYPE_NEON_FP_DIV_D:
14760 case TYPE_NEON_FP_DIV_S_Q:
14761 case TYPE_NEON_FP_DIV_D_Q:
14762 return false;
14763 default:
14764 return true;
14768 /* Target-specific selftests. */
14770 #if CHECKING_P
14772 namespace selftest {
14774 /* Selftest for the RTL loader.
14775 Verify that the RTL loader copes with a dump from
14776 print_rtx_function. This is essentially just a test that class
14777 function_reader can handle a real dump, but it also verifies
14778 that lookup_reg_by_dump_name correctly handles hard regs.
14779 The presence of hard reg names in the dump means that the test is
14780 target-specific, hence it is in this file. */
14782 static void
14783 aarch64_test_loading_full_dump ()
14785 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14787 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14789 rtx_insn *insn_1 = get_insn_by_uid (1);
14790 ASSERT_EQ (NOTE, GET_CODE (insn_1));
14792 rtx_insn *insn_15 = get_insn_by_uid (15);
14793 ASSERT_EQ (INSN, GET_CODE (insn_15));
14794 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14796 /* Verify crtl->return_rtx. */
14797 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14798 ASSERT_EQ (0, REGNO (crtl->return_rtx));
14799 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14802 /* Run all target-specific selftests. */
14804 static void
14805 aarch64_run_selftests (void)
14807 aarch64_test_loading_full_dump ();
14810 } // namespace selftest
14812 #endif /* #if CHECKING_P */
14814 #undef TARGET_ADDRESS_COST
14815 #define TARGET_ADDRESS_COST aarch64_address_cost
14817 /* This hook determines whether unnamed bitfields affect the alignment
14818 of the containing structure. The hook returns true if the structure
14819 should inherit the alignment requirements of an unnamed bitfield's
14820 type. */
14821 #undef TARGET_ALIGN_ANON_BITFIELD
14822 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14824 #undef TARGET_ASM_ALIGNED_DI_OP
14825 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14827 #undef TARGET_ASM_ALIGNED_HI_OP
14828 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14830 #undef TARGET_ASM_ALIGNED_SI_OP
14831 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14833 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14834 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14835 hook_bool_const_tree_hwi_hwi_const_tree_true
14837 #undef TARGET_ASM_FILE_START
14838 #define TARGET_ASM_FILE_START aarch64_start_file
14840 #undef TARGET_ASM_OUTPUT_MI_THUNK
14841 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14843 #undef TARGET_ASM_SELECT_RTX_SECTION
14844 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14846 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14847 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14849 #undef TARGET_BUILD_BUILTIN_VA_LIST
14850 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14852 #undef TARGET_CALLEE_COPIES
14853 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14855 #undef TARGET_CAN_ELIMINATE
14856 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14858 #undef TARGET_CAN_INLINE_P
14859 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14861 #undef TARGET_CANNOT_FORCE_CONST_MEM
14862 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14864 #undef TARGET_CASE_VALUES_THRESHOLD
14865 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14867 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14868 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14870 /* Only the least significant bit is used for initialization guard
14871 variables. */
14872 #undef TARGET_CXX_GUARD_MASK_BIT
14873 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14875 #undef TARGET_C_MODE_FOR_SUFFIX
14876 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14878 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14879 #undef TARGET_DEFAULT_TARGET_FLAGS
14880 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14881 #endif
14883 #undef TARGET_CLASS_MAX_NREGS
14884 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14886 #undef TARGET_BUILTIN_DECL
14887 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14889 #undef TARGET_BUILTIN_RECIPROCAL
14890 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14892 #undef TARGET_C_EXCESS_PRECISION
14893 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14895 #undef TARGET_EXPAND_BUILTIN
14896 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14898 #undef TARGET_EXPAND_BUILTIN_VA_START
14899 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14901 #undef TARGET_FOLD_BUILTIN
14902 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14904 #undef TARGET_FUNCTION_ARG
14905 #define TARGET_FUNCTION_ARG aarch64_function_arg
14907 #undef TARGET_FUNCTION_ARG_ADVANCE
14908 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14910 #undef TARGET_FUNCTION_ARG_BOUNDARY
14911 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14913 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14914 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14916 #undef TARGET_FUNCTION_VALUE
14917 #define TARGET_FUNCTION_VALUE aarch64_function_value
14919 #undef TARGET_FUNCTION_VALUE_REGNO_P
14920 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14922 #undef TARGET_FRAME_POINTER_REQUIRED
14923 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14925 #undef TARGET_GIMPLE_FOLD_BUILTIN
14926 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14928 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14929 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14931 #undef TARGET_INIT_BUILTINS
14932 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14934 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14935 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14936 aarch64_ira_change_pseudo_allocno_class
14938 #undef TARGET_LEGITIMATE_ADDRESS_P
14939 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14941 #undef TARGET_LEGITIMATE_CONSTANT_P
14942 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14944 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14945 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14946 aarch64_legitimize_address_displacement
14948 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14949 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14951 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14952 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14953 aarch64_libgcc_floating_mode_supported_p
14955 #undef TARGET_MANGLE_TYPE
14956 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14958 #undef TARGET_MEMORY_MOVE_COST
14959 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14961 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14962 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14964 #undef TARGET_MUST_PASS_IN_STACK
14965 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14967 /* This target hook should return true if accesses to volatile bitfields
14968 should use the narrowest mode possible. It should return false if these
14969 accesses should use the bitfield container type. */
14970 #undef TARGET_NARROW_VOLATILE_BITFIELD
14971 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14973 #undef TARGET_OPTION_OVERRIDE
14974 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14976 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14977 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14978 aarch64_override_options_after_change
14980 #undef TARGET_OPTION_SAVE
14981 #define TARGET_OPTION_SAVE aarch64_option_save
14983 #undef TARGET_OPTION_RESTORE
14984 #define TARGET_OPTION_RESTORE aarch64_option_restore
14986 #undef TARGET_OPTION_PRINT
14987 #define TARGET_OPTION_PRINT aarch64_option_print
14989 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14990 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14992 #undef TARGET_SET_CURRENT_FUNCTION
14993 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14995 #undef TARGET_PASS_BY_REFERENCE
14996 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14998 #undef TARGET_PREFERRED_RELOAD_CLASS
14999 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15001 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15002 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15004 #undef TARGET_PROMOTED_TYPE
15005 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15007 #undef TARGET_SECONDARY_RELOAD
15008 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15010 #undef TARGET_SHIFT_TRUNCATION_MASK
15011 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15013 #undef TARGET_SETUP_INCOMING_VARARGS
15014 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15016 #undef TARGET_STRUCT_VALUE_RTX
15017 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15019 #undef TARGET_REGISTER_MOVE_COST
15020 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15022 #undef TARGET_RETURN_IN_MEMORY
15023 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15025 #undef TARGET_RETURN_IN_MSB
15026 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15028 #undef TARGET_RTX_COSTS
15029 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15031 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15032 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15034 #undef TARGET_SCHED_ISSUE_RATE
15035 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15037 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15038 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15039 aarch64_sched_first_cycle_multipass_dfa_lookahead
15041 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15042 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15043 aarch64_first_cycle_multipass_dfa_lookahead_guard
15045 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15046 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15047 aarch64_get_separate_components
15049 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15050 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15051 aarch64_components_for_bb
15053 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15054 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15055 aarch64_disqualify_components
15057 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15058 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15059 aarch64_emit_prologue_components
15061 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15062 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15063 aarch64_emit_epilogue_components
15065 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15066 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15067 aarch64_set_handled_components
15069 #undef TARGET_TRAMPOLINE_INIT
15070 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15072 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15073 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15075 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15076 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15078 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15079 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15080 aarch64_builtin_support_vector_misalignment
15082 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15083 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15085 #undef TARGET_VECTORIZE_ADD_STMT_COST
15086 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15088 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15089 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15090 aarch64_builtin_vectorization_cost
15092 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15093 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15095 #undef TARGET_VECTORIZE_BUILTINS
15096 #define TARGET_VECTORIZE_BUILTINS
15098 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15099 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15100 aarch64_builtin_vectorized_function
15102 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15103 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15104 aarch64_autovectorize_vector_sizes
15106 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15107 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15108 aarch64_atomic_assign_expand_fenv
15110 /* Section anchor support. */
15112 #undef TARGET_MIN_ANCHOR_OFFSET
15113 #define TARGET_MIN_ANCHOR_OFFSET -256
15115 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15116 byte offset; we can do much more for larger data types, but have no way
15117 to determine the size of the access. We assume accesses are aligned. */
15118 #undef TARGET_MAX_ANCHOR_OFFSET
15119 #define TARGET_MAX_ANCHOR_OFFSET 4095
15121 #undef TARGET_VECTOR_ALIGNMENT
15122 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15124 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15125 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15126 aarch64_simd_vector_alignment_reachable
15128 /* vec_perm support. */
15130 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15131 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15132 aarch64_vectorize_vec_perm_const_ok
15134 #undef TARGET_INIT_LIBFUNCS
15135 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15137 #undef TARGET_FIXED_CONDITION_CODE_REGS
15138 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15140 #undef TARGET_FLAGS_REGNUM
15141 #define TARGET_FLAGS_REGNUM CC_REGNUM
15143 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15144 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15146 #undef TARGET_ASAN_SHADOW_OFFSET
15147 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15149 #undef TARGET_LEGITIMIZE_ADDRESS
15150 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15152 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15153 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15154 aarch64_use_by_pieces_infrastructure_p
15156 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15157 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15159 #undef TARGET_CAN_USE_DOLOOP_P
15160 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15162 #undef TARGET_SCHED_ADJUST_PRIORITY
15163 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15165 #undef TARGET_SCHED_MACRO_FUSION_P
15166 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15168 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15169 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15171 #undef TARGET_SCHED_FUSION_PRIORITY
15172 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15174 #undef TARGET_UNSPEC_MAY_TRAP_P
15175 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15177 #undef TARGET_USE_PSEUDO_PIC_REG
15178 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15180 #undef TARGET_PRINT_OPERAND
15181 #define TARGET_PRINT_OPERAND aarch64_print_operand
15183 #undef TARGET_PRINT_OPERAND_ADDRESS
15184 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15186 #undef TARGET_OPTAB_SUPPORTED_P
15187 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15189 #undef TARGET_OMIT_STRUCT_RETURN_REG
15190 #define TARGET_OMIT_STRUCT_RETURN_REG true
15192 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15193 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15194 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15196 #if CHECKING_P
15197 #undef TARGET_RUN_TARGET_SELFTESTS
15198 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15199 #endif /* #if CHECKING_P */
15201 struct gcc_target targetm = TARGET_INITIALIZER;
15203 #include "gt-aarch64.h"