[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67 #include "selftest.h"
68 #include "selftest-rtl.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
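/* For example (illustrative), this evaluates to 8 under LP64 and to 4
   under ILP32. */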
76 /* Classifies an address.
78 ADDRESS_REG_IMM
79 A simple base register plus immediate offset.
81 ADDRESS_REG_WB
82 A base register indexed by immediate offset with writeback.
84 ADDRESS_REG_REG
85 A base register indexed by (optionally scaled) register.
87 ADDRESS_REG_UXTW
88 A base register indexed by (optionally scaled) zero-extended register.
90 ADDRESS_REG_SXTW
91 A base register indexed by (optionally scaled) sign-extended register.
93 ADDRESS_LO_SUM
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
96 ADDRESS_SYMBOLIC
97 A constant symbolic address, in the pc-relative literal pool. */
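/* For illustration only, typical assembly forms for these classes are:
   ADDRESS_REG_IMM   ldr x0, [x1, 16]
   ADDRESS_REG_WB    ldr x0, [x1, 16]!  or  ldr x0, [x1], 16
   ADDRESS_REG_REG   ldr x0, [x1, x2, lsl 3]
   ADDRESS_REG_UXTW  ldr x0, [x1, w2, uxtw 3]
   ADDRESS_REG_SXTW  ldr x0, [x1, w2, sxtw 3]
   ADDRESS_LO_SUM    ldr x0, [x1, #:lo12:sym]
   ADDRESS_SYMBOLIC  ldr x0, .Lpool_entry  */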
99 enum aarch64_address_type {
100 ADDRESS_REG_IMM,
101 ADDRESS_REG_WB,
102 ADDRESS_REG_REG,
103 ADDRESS_REG_UXTW,
104 ADDRESS_REG_SXTW,
105 ADDRESS_LO_SUM,
106 ADDRESS_SYMBOLIC
109 struct aarch64_address_info {
110 enum aarch64_address_type type;
111 rtx base;
112 rtx offset;
113 int shift;
114 enum aarch64_symbol_type symbol_type;
117 struct simd_immediate_info
119 rtx value;
120 int shift;
121 int element_width;
122 bool mvn;
123 bool msl;
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel;
129 #ifdef HAVE_AS_TLS
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
132 #endif
134 static bool aarch64_composite_type_p (const_tree, machine_mode);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
136 const_tree,
137 machine_mode *, int *,
138 bool *);
139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
147 const_tree type,
148 int misalignment,
149 bool is_packed);
151 /* Major revision number of the ARM Architecture implemented by the target. */
152 unsigned aarch64_architecture_version;
154 /* The processor for which instructions should be scheduled. */
155 enum aarch64_processor aarch64_tune = cortexa53;
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
160 /* Global flag for PC relative loads. */
161 bool aarch64_pcrelative_literal_loads;
163 /* Support for command line parsing of boolean flags in the tuning
164 structures. */
165 struct aarch64_flag_desc
167 const char* name;
168 unsigned int flag;
171 #define AARCH64_FUSION_PAIR(name, internal_name) \
172 { name, AARCH64_FUSE_##internal_name },
173 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
175 { "none", AARCH64_FUSE_NOTHING },
176 #include "aarch64-fusion-pairs.def"
177 { "all", AARCH64_FUSE_ALL },
178 { NULL, AARCH64_FUSE_NOTHING }
181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
182 { name, AARCH64_EXTRA_TUNE_##internal_name },
183 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
185 { "none", AARCH64_EXTRA_TUNE_NONE },
186 #include "aarch64-tuning-flags.def"
187 { "all", AARCH64_EXTRA_TUNE_ALL },
188 { NULL, AARCH64_EXTRA_TUNE_NONE }
191 /* Tuning parameters. */
193 static const struct cpu_addrcost_table generic_addrcost_table =
196 0, /* hi */
197 0, /* si */
198 0, /* di */
199 0, /* ti */
201 0, /* pre_modify */
202 0, /* post_modify */
203 0, /* register_offset */
204 0, /* register_sextend */
205 0, /* register_zextend */
206 0 /* imm_offset */
209 static const struct cpu_addrcost_table cortexa57_addrcost_table =
212 1, /* hi */
213 0, /* si */
214 0, /* di */
215 1, /* ti */
217 0, /* pre_modify */
218 0, /* post_modify */
219 0, /* register_offset */
220 0, /* register_sextend */
221 0, /* register_zextend */
222 0, /* imm_offset */
225 static const struct cpu_addrcost_table exynosm1_addrcost_table =
228 0, /* hi */
229 0, /* si */
230 0, /* di */
231 2, /* ti */
233 0, /* pre_modify */
234 0, /* post_modify */
235 1, /* register_offset */
236 1, /* register_sextend */
237 2, /* register_zextend */
238 0, /* imm_offset */
241 static const struct cpu_addrcost_table xgene1_addrcost_table =
244 1, /* hi */
245 0, /* si */
246 0, /* di */
247 1, /* ti */
249 1, /* pre_modify */
250 0, /* post_modify */
251 0, /* register_offset */
252 1, /* register_sextend */
253 1, /* register_zextend */
254 0, /* imm_offset */
257 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
281 0, /* pre_modify */
282 0, /* post_modify */
283 2, /* register_offset */
284 3, /* register_sextend */
285 3, /* register_zextend */
286 0, /* imm_offset */
289 static const struct cpu_regmove_cost generic_regmove_cost =
291 1, /* GP2GP */
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost. */
294 5, /* GP2FP */
295 5, /* FP2GP */
296 2 /* FP2FP */
299 static const struct cpu_regmove_cost cortexa57_regmove_cost =
301 1, /* GP2GP */
302 /* Avoid the use of slow int<->fp moves for spilling by setting
303 their cost higher than memmov_cost. */
304 5, /* GP2FP */
305 5, /* FP2GP */
306 2 /* FP2FP */
309 static const struct cpu_regmove_cost cortexa53_regmove_cost =
311 1, /* GP2GP */
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
314 5, /* GP2FP */
315 5, /* FP2GP */
316 2 /* FP2FP */
319 static const struct cpu_regmove_cost exynosm1_regmove_cost =
321 1, /* GP2GP */
322 /* Avoid the use of slow int<->fp moves for spilling by setting
323 their cost higher than memmov_cost (the actual costs are 4 and 9). */
324 9, /* GP2FP */
325 9, /* FP2GP */
326 1 /* FP2FP */
329 static const struct cpu_regmove_cost thunderx_regmove_cost =
331 2, /* GP2GP */
332 2, /* GP2FP */
333 6, /* FP2GP */
334 4 /* FP2FP */
337 static const struct cpu_regmove_cost xgene1_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 8, /* GP2FP */
343 8, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
349 2, /* GP2GP */
350 /* Avoid the use of int<->fp moves for spilling. */
351 6, /* GP2FP */
352 6, /* FP2GP */
353 4 /* FP2FP */
356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
358 1, /* GP2GP */
359 /* Avoid the use of int<->fp moves for spilling. */
360 8, /* GP2FP */
361 8, /* FP2GP */
362 4 /* FP2FP */
365 /* Generic costs for vector insn classes. */
366 static const struct cpu_vector_cost generic_vector_cost =
368 1, /* scalar_stmt_cost */
369 1, /* scalar_load_cost */
370 1, /* scalar_store_cost */
371 1, /* vec_stmt_cost */
372 2, /* vec_permute_cost */
373 1, /* vec_to_scalar_cost */
374 1, /* scalar_to_vec_cost */
375 1, /* vec_align_load_cost */
376 1, /* vec_unalign_load_cost */
377 1, /* vec_unalign_store_cost */
378 1, /* vec_store_cost */
379 3, /* cond_taken_branch_cost */
380 1 /* cond_not_taken_branch_cost */
383 /* ThunderX costs for vector insn classes. */
384 static const struct cpu_vector_cost thunderx_vector_cost =
386 1, /* scalar_stmt_cost */
387 3, /* scalar_load_cost */
388 1, /* scalar_store_cost */
389 4, /* vec_stmt_cost */
390 4, /* vec_permute_cost */
391 2, /* vec_to_scalar_cost */
392 2, /* scalar_to_vec_cost */
393 3, /* vec_align_load_cost */
394 10, /* vec_unalign_load_cost */
395 10, /* vec_unalign_store_cost */
396 1, /* vec_store_cost */
397 3, /* cond_taken_branch_cost */
398 3 /* cond_not_taken_branch_cost */
401 /* Cortex-A57 costs for vector insn classes. */
402 static const struct cpu_vector_cost cortexa57_vector_cost =
404 1, /* scalar_stmt_cost */
405 4, /* scalar_load_cost */
406 1, /* scalar_store_cost */
407 2, /* vec_stmt_cost */
408 3, /* vec_permute_cost */
409 8, /* vec_to_scalar_cost */
410 8, /* scalar_to_vec_cost */
411 4, /* vec_align_load_cost */
412 4, /* vec_unalign_load_cost */
413 1, /* vec_unalign_store_cost */
414 1, /* vec_store_cost */
415 1, /* cond_taken_branch_cost */
416 1 /* cond_not_taken_branch_cost */
419 static const struct cpu_vector_cost exynosm1_vector_cost =
421 1, /* scalar_stmt_cost */
422 5, /* scalar_load_cost */
423 1, /* scalar_store_cost */
424 3, /* vec_stmt_cost */
425 3, /* vec_permute_cost */
426 3, /* vec_to_scalar_cost */
427 3, /* scalar_to_vec_cost */
428 5, /* vec_align_load_cost */
429 5, /* vec_unalign_load_cost */
430 1, /* vec_unalign_store_cost */
431 1, /* vec_store_cost */
432 1, /* cond_taken_branch_cost */
433 1 /* cond_not_taken_branch_cost */
436 /* X-Gene 1 costs for vector insn classes. */
437 static const struct cpu_vector_cost xgene1_vector_cost =
439 1, /* scalar_stmt_cost */
440 5, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 2, /* vec_stmt_cost */
443 2, /* vec_permute_cost */
444 4, /* vec_to_scalar_cost */
445 4, /* scalar_to_vec_cost */
446 10, /* vec_align_load_cost */
447 10, /* vec_unalign_load_cost */
448 2, /* vec_unalign_store_cost */
449 2, /* vec_store_cost */
450 2, /* cond_taken_branch_cost */
451 1 /* cond_not_taken_branch_cost */
454 /* Costs for vector insn classes for Vulcan. */
455 static const struct cpu_vector_cost thunderx2t99_vector_cost =
457 6, /* scalar_stmt_cost */
458 4, /* scalar_load_cost */
459 1, /* scalar_store_cost */
460 6, /* vec_stmt_cost */
461 3, /* vec_permute_cost */
462 6, /* vec_to_scalar_cost */
463 5, /* scalar_to_vec_cost */
464 8, /* vec_align_load_cost */
465 8, /* vec_unalign_load_cost */
466 4, /* vec_unalign_store_cost */
467 4, /* vec_store_cost */
468 2, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* Generic costs for branch instructions. */
473 static const struct cpu_branch_cost generic_branch_cost =
475 2, /* Predictable. */
476 2 /* Unpredictable. */
479 /* Branch costs for Cortex-A57. */
480 static const struct cpu_branch_cost cortexa57_branch_cost =
482 1, /* Predictable. */
483 3 /* Unpredictable. */
486 /* Branch costs for Vulcan. */
487 static const struct cpu_branch_cost thunderx2t99_branch_cost =
489 1, /* Predictable. */
490 3 /* Unpredictable. */
493 /* Generic approximation modes. */
494 static const cpu_approx_modes generic_approx_modes =
496 AARCH64_APPROX_NONE, /* division */
497 AARCH64_APPROX_NONE, /* sqrt */
498 AARCH64_APPROX_NONE /* recip_sqrt */
501 /* Approximation modes for Exynos M1. */
502 static const cpu_approx_modes exynosm1_approx_modes =
504 AARCH64_APPROX_NONE, /* division */
505 AARCH64_APPROX_ALL, /* sqrt */
506 AARCH64_APPROX_ALL /* recip_sqrt */
509 /* Approximation modes for X-Gene 1. */
510 static const cpu_approx_modes xgene1_approx_modes =
512 AARCH64_APPROX_NONE, /* division */
513 AARCH64_APPROX_NONE, /* sqrt */
514 AARCH64_APPROX_ALL /* recip_sqrt */
517 static const struct tune_params generic_tunings =
519 &cortexa57_extra_costs,
520 &generic_addrcost_table,
521 &generic_regmove_cost,
522 &generic_vector_cost,
523 &generic_branch_cost,
524 &generic_approx_modes,
525 4, /* memmov_cost */
526 2, /* issue_rate */
527 AARCH64_FUSE_NOTHING, /* fusible_ops */
528 8, /* function_align. */
529 8, /* jump_align. */
530 4, /* loop_align. */
531 2, /* int_reassoc_width. */
532 4, /* fp_reassoc_width. */
533 1, /* vec_reassoc_width. */
534 2, /* min_div_recip_mul_sf. */
535 2, /* min_div_recip_mul_df. */
536 0, /* max_case_values. */
537 0, /* cache_line_size. */
538 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
539 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
542 static const struct tune_params cortexa35_tunings =
544 &cortexa53_extra_costs,
545 &generic_addrcost_table,
546 &cortexa53_regmove_cost,
547 &generic_vector_cost,
548 &cortexa57_branch_cost,
549 &generic_approx_modes,
550 4, /* memmov_cost */
551 1, /* issue_rate */
552 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
553 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
554 16, /* function_align. */
555 8, /* jump_align. */
556 8, /* loop_align. */
557 2, /* int_reassoc_width. */
558 4, /* fp_reassoc_width. */
559 1, /* vec_reassoc_width. */
560 2, /* min_div_recip_mul_sf. */
561 2, /* min_div_recip_mul_df. */
562 0, /* max_case_values. */
563 0, /* cache_line_size. */
564 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
565 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
568 static const struct tune_params cortexa53_tunings =
570 &cortexa53_extra_costs,
571 &generic_addrcost_table,
572 &cortexa53_regmove_cost,
573 &generic_vector_cost,
574 &cortexa57_branch_cost,
575 &generic_approx_modes,
576 4, /* memmov_cost */
577 2, /* issue_rate */
578 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
579 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
580 16, /* function_align. */
581 8, /* jump_align. */
582 8, /* loop_align. */
583 2, /* int_reassoc_width. */
584 4, /* fp_reassoc_width. */
585 1, /* vec_reassoc_width. */
586 2, /* min_div_recip_mul_sf. */
587 2, /* min_div_recip_mul_df. */
588 0, /* max_case_values. */
589 0, /* cache_line_size. */
590 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
591 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
594 static const struct tune_params cortexa57_tunings =
596 &cortexa57_extra_costs,
597 &cortexa57_addrcost_table,
598 &cortexa57_regmove_cost,
599 &cortexa57_vector_cost,
600 &cortexa57_branch_cost,
601 &generic_approx_modes,
602 4, /* memmov_cost */
603 3, /* issue_rate */
604 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
605 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
606 16, /* function_align. */
607 8, /* jump_align. */
608 8, /* loop_align. */
609 2, /* int_reassoc_width. */
610 4, /* fp_reassoc_width. */
611 1, /* vec_reassoc_width. */
612 2, /* min_div_recip_mul_sf. */
613 2, /* min_div_recip_mul_df. */
614 0, /* max_case_values. */
615 0, /* cache_line_size. */
616 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
617 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
620 static const struct tune_params cortexa72_tunings =
622 &cortexa57_extra_costs,
623 &cortexa57_addrcost_table,
624 &cortexa57_regmove_cost,
625 &cortexa57_vector_cost,
626 &cortexa57_branch_cost,
627 &generic_approx_modes,
628 4, /* memmov_cost */
629 3, /* issue_rate */
630 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
631 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
632 16, /* function_align. */
633 8, /* jump_align. */
634 8, /* loop_align. */
635 2, /* int_reassoc_width. */
636 4, /* fp_reassoc_width. */
637 1, /* vec_reassoc_width. */
638 2, /* min_div_recip_mul_sf. */
639 2, /* min_div_recip_mul_df. */
640 0, /* max_case_values. */
641 0, /* cache_line_size. */
642 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
643 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
646 static const struct tune_params cortexa73_tunings =
648 &cortexa57_extra_costs,
649 &cortexa57_addrcost_table,
650 &cortexa57_regmove_cost,
651 &cortexa57_vector_cost,
652 &cortexa57_branch_cost,
653 &generic_approx_modes,
654 4, /* memmov_cost. */
655 2, /* issue_rate. */
656 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
657 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
658 16, /* function_align. */
659 8, /* jump_align. */
660 8, /* loop_align. */
661 2, /* int_reassoc_width. */
662 4, /* fp_reassoc_width. */
663 1, /* vec_reassoc_width. */
664 2, /* min_div_recip_mul_sf. */
665 2, /* min_div_recip_mul_df. */
666 0, /* max_case_values. */
667 0, /* cache_line_size. */
668 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
669 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
672 static const struct tune_params exynosm1_tunings =
674 &exynosm1_extra_costs,
675 &exynosm1_addrcost_table,
676 &exynosm1_regmove_cost,
677 &exynosm1_vector_cost,
678 &generic_branch_cost,
679 &exynosm1_approx_modes,
680 4, /* memmov_cost */
681 3, /* issue_rate */
682 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
683 4, /* function_align. */
684 4, /* jump_align. */
685 4, /* loop_align. */
686 2, /* int_reassoc_width. */
687 4, /* fp_reassoc_width. */
688 1, /* vec_reassoc_width. */
689 2, /* min_div_recip_mul_sf. */
690 2, /* min_div_recip_mul_df. */
691 48, /* max_case_values. */
692 64, /* cache_line_size. */
693 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
694 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
697 static const struct tune_params thunderx_tunings =
699 &thunderx_extra_costs,
700 &generic_addrcost_table,
701 &thunderx_regmove_cost,
702 &thunderx_vector_cost,
703 &generic_branch_cost,
704 &generic_approx_modes,
705 6, /* memmov_cost */
706 2, /* issue_rate */
707 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
708 8, /* function_align. */
709 8, /* jump_align. */
710 8, /* loop_align. */
711 2, /* int_reassoc_width. */
712 4, /* fp_reassoc_width. */
713 1, /* vec_reassoc_width. */
714 2, /* min_div_recip_mul_sf. */
715 2, /* min_div_recip_mul_df. */
716 0, /* max_case_values. */
717 0, /* cache_line_size. */
718 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
722 static const struct tune_params xgene1_tunings =
724 &xgene1_extra_costs,
725 &xgene1_addrcost_table,
726 &xgene1_regmove_cost,
727 &xgene1_vector_cost,
728 &generic_branch_cost,
729 &xgene1_approx_modes,
730 6, /* memmov_cost */
731 4, /* issue_rate */
732 AARCH64_FUSE_NOTHING, /* fusible_ops */
733 16, /* function_align. */
734 8, /* jump_align. */
735 16, /* loop_align. */
736 2, /* int_reassoc_width. */
737 4, /* fp_reassoc_width. */
738 1, /* vec_reassoc_width. */
739 2, /* min_div_recip_mul_sf. */
740 2, /* min_div_recip_mul_df. */
741 0, /* max_case_values. */
742 0, /* cache_line_size. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
747 static const struct tune_params qdf24xx_tunings =
749 &qdf24xx_extra_costs,
750 &qdf24xx_addrcost_table,
751 &qdf24xx_regmove_cost,
752 &generic_vector_cost,
753 &generic_branch_cost,
754 &generic_approx_modes,
755 4, /* memmov_cost */
756 4, /* issue_rate */
757 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
758 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
759 16, /* function_align. */
760 8, /* jump_align. */
761 16, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 64, /* cache_line_size. */
769 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
770 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
773 static const struct tune_params thunderx2t99_tunings =
775 &thunderx2t99_extra_costs,
776 &thunderx2t99_addrcost_table,
777 &thunderx2t99_regmove_cost,
778 &thunderx2t99_vector_cost,
779 &thunderx2t99_branch_cost,
780 &generic_approx_modes,
781 4, /* memmov_cost. */
782 4, /* issue_rate. */
783 AARCH64_FUSE_NOTHING, /* fusible_ops. */
784 16, /* function_align. */
785 8, /* jump_align. */
786 16, /* loop_align. */
787 3, /* int_reassoc_width. */
788 2, /* fp_reassoc_width. */
789 2, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 64, /* cache_line_size. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
798 /* Support for fine-grained override of the tuning structures. */
799 struct aarch64_tuning_override_function
801 const char* name;
802 void (*parse_override)(const char*, struct tune_params*);
805 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
806 static void aarch64_parse_tune_string (const char*, struct tune_params*);
808 static const struct aarch64_tuning_override_function
809 aarch64_tuning_override_functions[] =
811 { "fuse", aarch64_parse_fuse_string },
812 { "tune", aarch64_parse_tune_string },
813 { NULL, NULL }
816 /* A processor implementing AArch64. */
817 struct processor
819 const char *const name;
820 enum aarch64_processor ident;
821 enum aarch64_processor sched_core;
822 enum aarch64_arch arch;
823 unsigned architecture_version;
824 const unsigned long flags;
825 const struct tune_params *const tune;
828 /* Architectures implementing AArch64. */
829 static const struct processor all_architectures[] =
831 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
832 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
833 #include "aarch64-arches.def"
834 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
837 /* Processor cores implementing AArch64. */
838 static const struct processor all_cores[] =
840 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
841 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
842 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
843 FLAGS, &COSTS##_tunings},
844 #include "aarch64-cores.def"
845 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
846 AARCH64_FL_FOR_ARCH8, &generic_tunings},
847 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
851 /* Target specification. These are populated by the -march, -mtune, -mcpu
852 handling code or by target attributes. */
853 static const struct processor *selected_arch;
854 static const struct processor *selected_cpu;
855 static const struct processor *selected_tune;
857 /* The current tuning set. */
858 struct tune_params aarch64_tune_params = generic_tunings;
860 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
862 /* An ISA extension in the co-processor and main instruction set space. */
863 struct aarch64_option_extension
865 const char *const name;
866 const unsigned long flags_on;
867 const unsigned long flags_off;
870 typedef enum aarch64_cond_code
872 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
873 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
874 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
876 aarch64_cc;
878 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
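/* For example, AARCH64_EQ (0) maps to AARCH64_NE (1) and AARCH64_GE (10)
   maps to AARCH64_LT (11); flipping the low bit inverts the condition. */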
880 /* The condition codes of the processor, and the inverse function. */
881 static const char * const aarch64_condition_codes[] =
883 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
884 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
887 /* Generate code to enable conditional branches in functions over 1 MiB. */
888 const char *
889 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
890 const char * branch_format)
892 rtx_code_label * tmp_label = gen_label_rtx ();
893 char label_buf[256];
894 char buffer[128];
895 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
896 CODE_LABEL_NUMBER (tmp_label));
897 const char *label_ptr = targetm.strip_name_encoding (label_buf);
898 rtx dest_label = operands[pos_label];
899 operands[pos_label] = tmp_label;
901 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
902 output_asm_insn (buffer, operands);
904 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
905 operands[pos_label] = dest_label;
906 output_asm_insn (buffer, operands);
907 return "";
910 void
911 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
913 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
914 if (TARGET_GENERAL_REGS_ONLY)
915 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
916 else
917 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
920 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
921 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
922 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
923 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
924 cost (in this case the best class is the lowest cost one). Using ALL_REGS
925 irrespective of its cost results in bad allocations with many redundant
926 int<->FP moves which are expensive on various cores.
927 To avoid this we don't allow ALL_REGS as the allocno class, but force a
928 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
929 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
930 Otherwise set the allocno class depending on the mode.
931 The result of this is that it is no longer inefficient to have a higher
932 memory move cost than the register move cost.
935 static reg_class_t
936 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
937 reg_class_t best_class)
939 enum machine_mode mode;
941 if (allocno_class != ALL_REGS)
942 return allocno_class;
944 if (best_class != ALL_REGS)
945 return best_class;
947 mode = PSEUDO_REGNO_MODE (regno);
948 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
951 static unsigned int
952 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
954 if (GET_MODE_UNIT_SIZE (mode) == 4)
955 return aarch64_tune_params.min_div_recip_mul_sf;
956 return aarch64_tune_params.min_div_recip_mul_df;
959 static int
960 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
961 enum machine_mode mode)
963 if (VECTOR_MODE_P (mode))
964 return aarch64_tune_params.vec_reassoc_width;
965 if (INTEGRAL_MODE_P (mode))
966 return aarch64_tune_params.int_reassoc_width;
967 if (FLOAT_MODE_P (mode))
968 return aarch64_tune_params.fp_reassoc_width;
969 return 1;
972 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
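/* For instance (illustrative), x5 maps to DWARF register 5, the stack
   pointer to 31, and v3 to 67 (AARCH64_DWARF_V0 + 3), matching the
   AArch64 DWARF register numbering. */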
973 unsigned
974 aarch64_dbx_register_number (unsigned regno)
976 if (GP_REGNUM_P (regno))
977 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
978 else if (regno == SP_REGNUM)
979 return AARCH64_DWARF_SP;
980 else if (FP_REGNUM_P (regno))
981 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
983 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
984 equivalent DWARF register. */
985 return DWARF_FRAME_REGISTERS;
988 /* Return TRUE if MODE is any of the large INT modes. */
989 static bool
990 aarch64_vect_struct_mode_p (machine_mode mode)
992 return mode == OImode || mode == CImode || mode == XImode;
995 /* Return TRUE if MODE is any of the vector modes. */
996 static bool
997 aarch64_vector_mode_p (machine_mode mode)
999 return aarch64_vector_mode_supported_p (mode)
1000 || aarch64_vect_struct_mode_p (mode);
1003 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1004 static bool
1005 aarch64_array_mode_supported_p (machine_mode mode,
1006 unsigned HOST_WIDE_INT nelems)
1008 if (TARGET_SIMD
1009 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1010 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1011 && (nelems >= 2 && nelems <= 4))
1012 return true;
1014 return false;
1017 /* Implement HARD_REGNO_NREGS. */
1020 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1022 switch (aarch64_regno_regclass (regno))
1024 case FP_REGS:
1025 case FP_LO_REGS:
1026 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1027 default:
1028 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1030 gcc_unreachable ();
1033 /* Implement HARD_REGNO_MODE_OK. */
1036 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1038 if (GET_MODE_CLASS (mode) == MODE_CC)
1039 return regno == CC_REGNUM;
1041 if (regno == SP_REGNUM)
1042 /* The purpose of comparing with ptr_mode is to support the
1043 global register variable associated with the stack pointer
1044 register via the syntax of asm ("wsp") in ILP32. */
1045 return mode == Pmode || mode == ptr_mode;
1047 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1048 return mode == Pmode;
1050 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1051 return 1;
1053 if (FP_REGNUM_P (regno))
1055 if (aarch64_vect_struct_mode_p (mode))
1056 return
1057 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1058 else
1059 return 1;
1062 return 0;
1065 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1066 machine_mode
1067 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1068 machine_mode mode)
1070 /* Handle modes that fit within single registers. */
1071 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1073 if (GET_MODE_SIZE (mode) >= 4)
1074 return mode;
1075 else
1076 return SImode;
1078 /* Fall back to generic for multi-reg and very large modes. */
1079 else
1080 return choose_hard_reg_mode (regno, nregs, false);
1083 /* Return true if calls to DECL should be treated as
1084 long-calls (i.e. called via a register). */
1085 static bool
1086 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1088 return false;
1091 /* Return true if calls to symbol-ref SYM should be treated as
1092 long-calls (i.e. called via a register). */
1093 bool
1094 aarch64_is_long_call_p (rtx sym)
1096 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1099 /* Return true if calls to symbol-ref SYM should not go through
1100 plt stubs. */
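/* For instance (illustrative), a call to a function marked with
   __attribute__ ((noplt)) under -fPIC is emitted as a GOT load of the
   function address followed by an indirect branch, bypassing the PLT. */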
1102 bool
1103 aarch64_is_noplt_call_p (rtx sym)
1105 const_tree decl = SYMBOL_REF_DECL (sym);
1107 if (flag_pic
1108 && decl
1109 && (!flag_plt
1110 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1111 && !targetm.binds_local_p (decl))
1112 return true;
1114 return false;
1117 /* Return true if the offsets to a zero/sign-extract operation
1118 represent an expression that matches an extend operation. The
1119 operands represent the parameters from
1121 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
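/* For example (illustrative), with MODE == DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34, the extract keeps the low 34 bits of (reg * 4),
   which is the same as (extend:DI (reg:SI)) scaled by 4, i.e. an
   extended-register operand with a shift of 2. */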
1122 bool
1123 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1124 rtx extract_imm)
1126 HOST_WIDE_INT mult_val, extract_val;
1128 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1129 return false;
1131 mult_val = INTVAL (mult_imm);
1132 extract_val = INTVAL (extract_imm);
1134 if (extract_val > 8
1135 && extract_val < GET_MODE_BITSIZE (mode)
1136 && exact_log2 (extract_val & ~7) > 0
1137 && (extract_val & 7) <= 4
1138 && mult_val == (1 << (extract_val & 7)))
1139 return true;
1141 return false;
1144 /* Emit an insn that's a simple single-set. Both the operands must be
1145 known to be valid. */
1146 inline static rtx_insn *
1147 emit_set_insn (rtx x, rtx y)
1149 return emit_insn (gen_rtx_SET (x, y));
1152 /* X and Y are two things to compare using CODE. Emit the compare insn and
1153 return the rtx for register 0 in the proper mode. */
1155 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1157 machine_mode mode = SELECT_CC_MODE (code, x, y);
1158 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1160 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1161 return cc_reg;
1164 /* Build the SYMBOL_REF for __tls_get_addr. */
1166 static GTY(()) rtx tls_get_addr_libfunc;
1169 aarch64_tls_get_addr (void)
1171 if (!tls_get_addr_libfunc)
1172 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1173 return tls_get_addr_libfunc;
1176 /* Return the TLS model to use for ADDR. */
1178 static enum tls_model
1179 tls_symbolic_operand_type (rtx addr)
1181 enum tls_model tls_kind = TLS_MODEL_NONE;
1182 rtx sym, addend;
1184 if (GET_CODE (addr) == CONST)
1186 split_const (addr, &sym, &addend);
1187 if (GET_CODE (sym) == SYMBOL_REF)
1188 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1190 else if (GET_CODE (addr) == SYMBOL_REF)
1191 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1193 return tls_kind;
1196 /* We'll allow lo_sums in our legitimate addresses
1197 so that combine can take care of combining addresses where
1198 necessary, but for generation purposes, we'll generate the address
1199 as:
1200 RTL Absolute
1201 tmp = hi (symbol_ref); adrp x1, foo
1202 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1205 PIC TLS
1206 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1207 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1208 bl __tls_get_addr
1211 Load TLS symbol, depending on TLS mechanism and TLS access model.
1213 Global Dynamic - Traditional TLS:
1214 adrp tmp, :tlsgd:imm
1215 add dest, tmp, #:tlsgd_lo12:imm
1216 bl __tls_get_addr
1218 Global Dynamic - TLS Descriptors:
1219 adrp dest, :tlsdesc:imm
1220 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1221 add dest, dest, #:tlsdesc_lo12:imm
1222 blr tmp
1223 mrs tp, tpidr_el0
1224 add dest, dest, tp
1226 Initial Exec:
1227 mrs tp, tpidr_el0
1228 adrp tmp, :gottprel:imm
1229 ldr dest, [tmp, #:gottprel_lo12:imm]
1230 add dest, dest, tp
1232 Local Exec:
1233 mrs tp, tpidr_el0
1234 add t0, tp, #:tprel_hi12:imm, lsl #12
1235 add t0, t0, #:tprel_lo12_nc:imm
1238 static void
1239 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1240 enum aarch64_symbol_type type)
1242 switch (type)
1244 case SYMBOL_SMALL_ABSOLUTE:
1246 /* In ILP32, the mode of dest can be either SImode or DImode. */
1247 rtx tmp_reg = dest;
1248 machine_mode mode = GET_MODE (dest);
1250 gcc_assert (mode == Pmode || mode == ptr_mode);
1252 if (can_create_pseudo_p ())
1253 tmp_reg = gen_reg_rtx (mode);
1255 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1256 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1257 return;
1260 case SYMBOL_TINY_ABSOLUTE:
1261 emit_insn (gen_rtx_SET (dest, imm));
1262 return;
1264 case SYMBOL_SMALL_GOT_28K:
1266 machine_mode mode = GET_MODE (dest);
1267 rtx gp_rtx = pic_offset_table_rtx;
1268 rtx insn;
1269 rtx mem;
1271 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1272 here before rtl expansion. Tree IVOPTs will generate rtl patterns to
1273 compute rtx costs, in which case pic_offset_table_rtx is not
1274 initialized. In that case there is no need to generate the first adrp
1275 instruction, as the final cost for global variable access is
1276 one instruction. */
1277 if (gp_rtx != NULL)
1279 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1280 use the page base as the GOT base, the first page may be wasted;
1281 in the worst case only 28K of space is left for the GOT).
1283 The instruction sequence generated for accessing a global variable is:
1286 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1288 Only one instruction is needed. But we must initialize
1289 pic_offset_table_rtx properly. We generate an initialization insn for
1290 every global access, and allow CSE to remove all redundant ones.
1292 The final instruction sequence will look like the following
1293 for multiple global variable accesses.
1295 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1297 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1298 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1299 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1300 ... */
1302 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1303 crtl->uses_pic_offset_table = 1;
1304 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1306 if (mode != GET_MODE (gp_rtx))
1307 gp_rtx = gen_lowpart (mode, gp_rtx);
1311 if (mode == ptr_mode)
1313 if (mode == DImode)
1314 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1315 else
1316 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1318 mem = XVECEXP (SET_SRC (insn), 0, 0);
1320 else
1322 gcc_assert (mode == Pmode);
1324 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1325 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1328 /* The operand is expected to be a MEM. Whenever the related insn
1329 pattern changes, the code above which calculates MEM should be
1330 updated. */
1331 gcc_assert (GET_CODE (mem) == MEM);
1332 MEM_READONLY_P (mem) = 1;
1333 MEM_NOTRAP_P (mem) = 1;
1334 emit_insn (insn);
1335 return;
1338 case SYMBOL_SMALL_GOT_4G:
1340 /* In ILP32, the mode of dest can be either SImode or DImode,
1341 while the got entry is always of SImode size. The mode of
1342 dest depends on how dest is used: if dest is assigned to a
1343 pointer (e.g. stored in memory), it has SImode; it may have
1344 DImode if dest is dereferenced to access the memory.
1345 This is why we have to handle three different ldr_got_small
1346 patterns here (two patterns for ILP32). */
1348 rtx insn;
1349 rtx mem;
1350 rtx tmp_reg = dest;
1351 machine_mode mode = GET_MODE (dest);
1353 if (can_create_pseudo_p ())
1354 tmp_reg = gen_reg_rtx (mode);
1356 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1357 if (mode == ptr_mode)
1359 if (mode == DImode)
1360 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1361 else
1362 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1364 mem = XVECEXP (SET_SRC (insn), 0, 0);
1366 else
1368 gcc_assert (mode == Pmode);
1370 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1371 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1374 gcc_assert (GET_CODE (mem) == MEM);
1375 MEM_READONLY_P (mem) = 1;
1376 MEM_NOTRAP_P (mem) = 1;
1377 emit_insn (insn);
1378 return;
1381 case SYMBOL_SMALL_TLSGD:
1383 rtx_insn *insns;
1384 machine_mode mode = GET_MODE (dest);
1385 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1387 start_sequence ();
1388 if (TARGET_ILP32)
1389 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1390 else
1391 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1392 insns = get_insns ();
1393 end_sequence ();
1395 RTL_CONST_CALL_P (insns) = 1;
1396 emit_libcall_block (insns, dest, result, imm);
1397 return;
1400 case SYMBOL_SMALL_TLSDESC:
1402 machine_mode mode = GET_MODE (dest);
1403 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1404 rtx tp;
1406 gcc_assert (mode == Pmode || mode == ptr_mode);
1408 /* In ILP32, the got entry is always of SImode size. Unlike
1409 small GOT, the dest is fixed at reg 0. */
1410 if (TARGET_ILP32)
1411 emit_insn (gen_tlsdesc_small_si (imm));
1412 else
1413 emit_insn (gen_tlsdesc_small_di (imm));
1414 tp = aarch64_load_tp (NULL);
1416 if (mode != Pmode)
1417 tp = gen_lowpart (mode, tp);
1419 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1420 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1421 return;
1424 case SYMBOL_SMALL_TLSIE:
1426 /* In ILP32, the mode of dest can be either SImode or DImode,
1427 while the got entry is always of SImode size. The mode of
1428 dest depends on how dest is used: if dest is assigned to a
1429 pointer (e.g. stored in memory), it has SImode; it may have
1430 DImode if dest is dereferenced to access the memory.
1431 This is why we have to handle three different tlsie_small
1432 patterns here (two patterns for ILP32). */
1433 machine_mode mode = GET_MODE (dest);
1434 rtx tmp_reg = gen_reg_rtx (mode);
1435 rtx tp = aarch64_load_tp (NULL);
1437 if (mode == ptr_mode)
1439 if (mode == DImode)
1440 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1441 else
1443 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1444 tp = gen_lowpart (mode, tp);
1447 else
1449 gcc_assert (mode == Pmode);
1450 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1453 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1454 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1455 return;
1458 case SYMBOL_TLSLE12:
1459 case SYMBOL_TLSLE24:
1460 case SYMBOL_TLSLE32:
1461 case SYMBOL_TLSLE48:
1463 machine_mode mode = GET_MODE (dest);
1464 rtx tp = aarch64_load_tp (NULL);
1466 if (mode != Pmode)
1467 tp = gen_lowpart (mode, tp);
1469 switch (type)
1471 case SYMBOL_TLSLE12:
1472 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1473 (dest, tp, imm));
1474 break;
1475 case SYMBOL_TLSLE24:
1476 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1477 (dest, tp, imm));
1478 break;
1479 case SYMBOL_TLSLE32:
1480 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1481 (dest, imm));
1482 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1483 (dest, dest, tp));
1484 break;
1485 case SYMBOL_TLSLE48:
1486 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1487 (dest, imm));
1488 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1489 (dest, dest, tp));
1490 break;
1491 default:
1492 gcc_unreachable ();
1495 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1496 return;
1499 case SYMBOL_TINY_GOT:
1500 emit_insn (gen_ldr_got_tiny (dest, imm));
1501 return;
1503 case SYMBOL_TINY_TLSIE:
1505 machine_mode mode = GET_MODE (dest);
1506 rtx tp = aarch64_load_tp (NULL);
1508 if (mode == ptr_mode)
1510 if (mode == DImode)
1511 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1512 else
1514 tp = gen_lowpart (mode, tp);
1515 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1518 else
1520 gcc_assert (mode == Pmode);
1521 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1524 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1525 return;
1528 default:
1529 gcc_unreachable ();
1533 /* Emit a move from SRC to DEST. Assume that the move expanders can
1534 handle all moves if !can_create_pseudo_p (). The distinction is
1535 important because, unlike emit_move_insn, the move expanders know
1536 how to force Pmode objects into the constant pool even when the
1537 constant pool address is not itself legitimate. */
1538 static rtx
1539 aarch64_emit_move (rtx dest, rtx src)
1541 return (can_create_pseudo_p ()
1542 ? emit_move_insn (dest, src)
1543 : emit_move_insn_1 (dest, src));
1546 /* Split a 128-bit move operation into two 64-bit move operations,
1547 taking care to handle partial overlap of register to register
1548 copies. Special cases are needed when moving between GP regs and
1549 FP regs. SRC can be a register, constant or memory; DST a register
1550 or memory. If either operand is memory it must not have any side
1551 effects. */
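/* For instance (illustrative), copying a TImode value from the pair
   {x0, x1} into {x1, x2} overlaps in x1, so the high halves must be
   moved before the low halves; the overlap check below handles this. */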
1552 void
1553 aarch64_split_128bit_move (rtx dst, rtx src)
1555 rtx dst_lo, dst_hi;
1556 rtx src_lo, src_hi;
1558 machine_mode mode = GET_MODE (dst);
1560 gcc_assert (mode == TImode || mode == TFmode);
1561 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1562 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1564 if (REG_P (dst) && REG_P (src))
1566 int src_regno = REGNO (src);
1567 int dst_regno = REGNO (dst);
1569 /* Handle FP <-> GP regs. */
1570 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1572 src_lo = gen_lowpart (word_mode, src);
1573 src_hi = gen_highpart (word_mode, src);
1575 if (mode == TImode)
1577 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1578 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1580 else
1582 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1583 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1585 return;
1587 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1589 dst_lo = gen_lowpart (word_mode, dst);
1590 dst_hi = gen_highpart (word_mode, dst);
1592 if (mode == TImode)
1594 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1595 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1597 else
1599 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1600 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1602 return;
1606 dst_lo = gen_lowpart (word_mode, dst);
1607 dst_hi = gen_highpart (word_mode, dst);
1608 src_lo = gen_lowpart (word_mode, src);
1609 src_hi = gen_highpart_mode (word_mode, mode, src);
1611 /* At most one pairing may overlap. */
1612 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1614 aarch64_emit_move (dst_hi, src_hi);
1615 aarch64_emit_move (dst_lo, src_lo);
1617 else
1619 aarch64_emit_move (dst_lo, src_lo);
1620 aarch64_emit_move (dst_hi, src_hi);
1624 bool
1625 aarch64_split_128bit_move_p (rtx dst, rtx src)
1627 return (! REG_P (src)
1628 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1631 /* Split a complex SIMD combine. */
1633 void
1634 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1636 machine_mode src_mode = GET_MODE (src1);
1637 machine_mode dst_mode = GET_MODE (dst);
1639 gcc_assert (VECTOR_MODE_P (dst_mode));
1641 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1643 rtx (*gen) (rtx, rtx, rtx);
1645 switch (src_mode)
1647 case V8QImode:
1648 gen = gen_aarch64_simd_combinev8qi;
1649 break;
1650 case V4HImode:
1651 gen = gen_aarch64_simd_combinev4hi;
1652 break;
1653 case V2SImode:
1654 gen = gen_aarch64_simd_combinev2si;
1655 break;
1656 case V4HFmode:
1657 gen = gen_aarch64_simd_combinev4hf;
1658 break;
1659 case V2SFmode:
1660 gen = gen_aarch64_simd_combinev2sf;
1661 break;
1662 case DImode:
1663 gen = gen_aarch64_simd_combinedi;
1664 break;
1665 case DFmode:
1666 gen = gen_aarch64_simd_combinedf;
1667 break;
1668 default:
1669 gcc_unreachable ();
1672 emit_insn (gen (dst, src1, src2));
1673 return;
1677 /* Split a complex SIMD move. */
1679 void
1680 aarch64_split_simd_move (rtx dst, rtx src)
1682 machine_mode src_mode = GET_MODE (src);
1683 machine_mode dst_mode = GET_MODE (dst);
1685 gcc_assert (VECTOR_MODE_P (dst_mode));
1687 if (REG_P (dst) && REG_P (src))
1689 rtx (*gen) (rtx, rtx);
1691 gcc_assert (VECTOR_MODE_P (src_mode));
1693 switch (src_mode)
1695 case V16QImode:
1696 gen = gen_aarch64_split_simd_movv16qi;
1697 break;
1698 case V8HImode:
1699 gen = gen_aarch64_split_simd_movv8hi;
1700 break;
1701 case V4SImode:
1702 gen = gen_aarch64_split_simd_movv4si;
1703 break;
1704 case V2DImode:
1705 gen = gen_aarch64_split_simd_movv2di;
1706 break;
1707 case V8HFmode:
1708 gen = gen_aarch64_split_simd_movv8hf;
1709 break;
1710 case V4SFmode:
1711 gen = gen_aarch64_split_simd_movv4sf;
1712 break;
1713 case V2DFmode:
1714 gen = gen_aarch64_split_simd_movv2df;
1715 break;
1716 default:
1717 gcc_unreachable ();
1720 emit_insn (gen (dst, src));
1721 return;
1725 bool
1726 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1727 machine_mode ymode, rtx y)
1729 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1730 gcc_assert (r != NULL);
1731 return rtx_equal_p (x, r);
1735 static rtx
1736 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1738 if (can_create_pseudo_p ())
1739 return force_reg (mode, value);
1740 else
1742 x = aarch64_emit_move (x, value);
1743 return x;
1748 static rtx
1749 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1751 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1753 rtx high;
1754 /* Load the full offset into a register. This
1755 might be improvable in the future. */
1756 high = GEN_INT (offset);
1757 offset = 0;
1758 high = aarch64_force_temporary (mode, temp, high);
1759 reg = aarch64_force_temporary (mode, temp,
1760 gen_rtx_PLUS (mode, high, reg));
1762 return plus_constant (mode, reg, offset);
1765 static int
1766 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1767 machine_mode mode)
1769 int i;
1770 unsigned HOST_WIDE_INT val, val2, mask;
1771 int one_match, zero_match;
1772 int num_insns;
1774 val = INTVAL (imm);
1776 if (aarch64_move_imm (val, mode))
1778 if (generate)
1779 emit_insn (gen_rtx_SET (dest, imm));
1780 return 1;
1783 if ((val >> 32) == 0 || mode == SImode)
1785 if (generate)
1787 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1788 if (mode == SImode)
1789 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1790 GEN_INT ((val >> 16) & 0xffff)));
1791 else
1792 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1793 GEN_INT ((val >> 16) & 0xffff)));
1795 return 2;
1798 /* Remaining cases are all for DImode. */
1800 mask = 0xffff;
1801 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1802 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1803 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1804 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1806 if (zero_match != 2 && one_match != 2)
1808 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1809 For a 64-bit bitmask try whether changing 16 bits to all ones or
1810 zeroes creates a valid bitmask. To check any repeated bitmask,
1811 try using 16 bits from the other 32-bit half of val. */
1813 for (i = 0; i < 64; i += 16, mask <<= 16)
1815 val2 = val & ~mask;
1816 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1817 break;
1818 val2 = val | mask;
1819 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1820 break;
1821 val2 = val2 & ~mask;
1822 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1823 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1824 break;
1826 if (i != 64)
1828 if (generate)
1830 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1831 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1832 GEN_INT ((val >> i) & 0xffff)));
1834 return 2;
1838 /* Generate 2-4 instructions, skipping 16-bit chunks that are all zeroes or
1839 all ones, since those are covered by the initial mov. If one_match >
1840 zero_match, skip the all-ones chunks, otherwise skip the all-zero chunks. */
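  /* For example (illustrative), 0x0000cafe0000beef has two all-zero 16-bit
     chunks, so it is synthesized as
	mov	dest, 0xbeef
	movk	dest, 0xcafe, lsl 32
     skipping the zero chunks entirely. */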
1842 num_insns = 1;
1843 mask = 0xffff;
1844 val2 = one_match > zero_match ? ~val : val;
1845 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1847 if (generate)
1848 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1849 ? (val | ~(mask << i))
1850 : (val & (mask << i)))));
1851 for (i += 16; i < 64; i += 16)
1853 if ((val2 & (mask << i)) == 0)
1854 continue;
1855 if (generate)
1856 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1857 GEN_INT ((val >> i) & 0xffff)));
1858 num_insns ++;
1861 return num_insns;
1865 void
1866 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1868 machine_mode mode = GET_MODE (dest);
1870 gcc_assert (mode == SImode || mode == DImode);
1872 /* Check on what type of symbol it is. */
1873 if (GET_CODE (imm) == SYMBOL_REF
1874 || GET_CODE (imm) == LABEL_REF
1875 || GET_CODE (imm) == CONST)
1877 rtx mem, base, offset;
1878 enum aarch64_symbol_type sty;
1880 /* If we have (const (plus symbol offset)), separate out the offset
1881 before we start classifying the symbol. */
1882 split_const (imm, &base, &offset);
1884 sty = aarch64_classify_symbol (base, offset);
1885 switch (sty)
1887 case SYMBOL_FORCE_TO_MEM:
1888 if (offset != const0_rtx
1889 && targetm.cannot_force_const_mem (mode, imm))
1891 gcc_assert (can_create_pseudo_p ());
1892 base = aarch64_force_temporary (mode, dest, base);
1893 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1894 aarch64_emit_move (dest, base);
1895 return;
1898 mem = force_const_mem (ptr_mode, imm);
1899 gcc_assert (mem);
1901 /* If we aren't generating PC relative literals, then
1902 we need to expand the literal pool access carefully.
1903 This is something that needs to be done in a number
1904 of places, so could well live as a separate function. */
1905 if (!aarch64_pcrelative_literal_loads)
1907 gcc_assert (can_create_pseudo_p ());
1908 base = gen_reg_rtx (ptr_mode);
1909 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1910 mem = gen_rtx_MEM (ptr_mode, base);
1913 if (mode != ptr_mode)
1914 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1916 emit_insn (gen_rtx_SET (dest, mem));
1918 return;
1920 case SYMBOL_SMALL_TLSGD:
1921 case SYMBOL_SMALL_TLSDESC:
1922 case SYMBOL_SMALL_TLSIE:
1923 case SYMBOL_SMALL_GOT_28K:
1924 case SYMBOL_SMALL_GOT_4G:
1925 case SYMBOL_TINY_GOT:
1926 case SYMBOL_TINY_TLSIE:
1927 if (offset != const0_rtx)
1929 gcc_assert(can_create_pseudo_p ());
1930 base = aarch64_force_temporary (mode, dest, base);
1931 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1932 aarch64_emit_move (dest, base);
1933 return;
1935 /* FALLTHRU */
1937 case SYMBOL_SMALL_ABSOLUTE:
1938 case SYMBOL_TINY_ABSOLUTE:
1939 case SYMBOL_TLSLE12:
1940 case SYMBOL_TLSLE24:
1941 case SYMBOL_TLSLE32:
1942 case SYMBOL_TLSLE48:
1943 aarch64_load_symref_appropriately (dest, imm, sty);
1944 return;
1946 default:
1947 gcc_unreachable ();
1951 if (!CONST_INT_P (imm))
1953 if (GET_CODE (imm) == HIGH)
1954 emit_insn (gen_rtx_SET (dest, imm));
1955 else
1957 rtx mem = force_const_mem (mode, imm);
1958 gcc_assert (mem);
1959 emit_insn (gen_rtx_SET (dest, mem));
1962 return;
1965 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1968 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1969 temporary value if necessary. FRAME_RELATED_P should be true if
1970 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1971 to the generated instructions. If SCRATCHREG is known to hold
1972 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1973 immediate again.
1975 Since this function may be used to adjust the stack pointer, we must
1976 ensure that it cannot cause transient stack deallocation (for example
1977 by first incrementing SP and then decrementing when adjusting by a
1978 large immediate). */
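/* As an illustrative example, adjusting SP by -0x123456 is emitted as
	sub	sp, sp, #0x456
	sub	sp, sp, #0x123, lsl #12
   i.e. two decrements, so no intermediate step deallocates stack. */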
1980 static void
1981 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1982 HOST_WIDE_INT delta, bool frame_related_p,
1983 bool emit_move_imm)
1985 HOST_WIDE_INT mdelta = abs_hwi (delta);
1986 rtx this_rtx = gen_rtx_REG (mode, regnum);
1987 rtx_insn *insn;
1989 if (!mdelta)
1990 return;
1992 /* Single instruction adjustment. */
1993 if (aarch64_uimm12_shift (mdelta))
1995 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1996 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1997 return;
2000 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2001 Only do this if mdelta is not a 16-bit move immediate, as adjusting
2002 using a move is better in that case. */
2003 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2005 HOST_WIDE_INT low_off = mdelta & 0xfff;
2007 low_off = delta < 0 ? -low_off : low_off;
2008 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2009 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2010 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2011 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2012 return;
2015 /* Emit a move immediate if required and an addition/subtraction. */
2016 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2017 if (emit_move_imm)
2018 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2019 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2020 : gen_add2_insn (this_rtx, scratch_rtx));
2021 if (frame_related_p)
2023 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2024 rtx adj = plus_constant (mode, this_rtx, delta);
2025 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2029 static inline void
2030 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2031 HOST_WIDE_INT delta)
2033 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2036 static inline void
2037 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2039 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2040 true, emit_move_imm);
2043 static inline void
2044 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2046 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2047 frame_related_p, true);
2050 static bool
2051 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2052 tree exp ATTRIBUTE_UNUSED)
2054 /* Currently, always true. */
2055 return true;
2058 /* Implement TARGET_PASS_BY_REFERENCE. */
2060 static bool
2061 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2062 machine_mode mode,
2063 const_tree type,
2064 bool named ATTRIBUTE_UNUSED)
2066 HOST_WIDE_INT size;
2067 machine_mode dummymode;
2068 int nregs;
2070 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2071 size = (mode == BLKmode && type)
2072 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2074 /* Aggregates are passed by reference based on their size. */
2075 if (type && AGGREGATE_TYPE_P (type))
2077 size = int_size_in_bytes (type);
2080   /* Variable sized arguments are always passed by reference.  */
2081 if (size < 0)
2082 return true;
2084 /* Can this be a candidate to be passed in fp/simd register(s)? */
2085 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2086 &dummymode, &nregs,
2087 NULL))
2088 return false;
2090   /* Arguments that are variable sized or larger than 2 registers are
2091      passed by reference unless they are a homogeneous floating-point
2092      aggregate.  */
2093 return size > 2 * UNITS_PER_WORD;
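/* Two illustrative (hypothetical) aggregate types showing how the rule
   above applies; they are not used elsewhere in this file:  */
struct aarch64_example_large_agg { long long x[4]; };  /* 32 bytes, not an HFA:
                                                           passed by reference.  */
struct aarch64_example_hfa { double d[4]; };            /* HFA of four doubles:
                                                           passed by value in
                                                           fp/simd registers.  */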
2096 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2097 static bool
2098 aarch64_return_in_msb (const_tree valtype)
2100 machine_mode dummy_mode;
2101 int dummy_int;
2103 /* Never happens in little-endian mode. */
2104 if (!BYTES_BIG_ENDIAN)
2105 return false;
2107   /* Only composite types no larger than 16 bytes can potentially
2108      be returned in registers.  */
2109 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2110 || int_size_in_bytes (valtype) <= 0
2111 || int_size_in_bytes (valtype) > 16)
2112 return false;
2114 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2115 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2116 is always passed/returned in the least significant bits of fp/simd
2117 register(s). */
2118 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2119 &dummy_mode, &dummy_int, NULL))
2120 return false;
2122 return true;
2125 /* Implement TARGET_FUNCTION_VALUE.
2126 Define how to find the value returned by a function. */
2128 static rtx
2129 aarch64_function_value (const_tree type, const_tree func,
2130 bool outgoing ATTRIBUTE_UNUSED)
2132 machine_mode mode;
2133 int unsignedp;
2134 int count;
2135 machine_mode ag_mode;
2137 mode = TYPE_MODE (type);
2138 if (INTEGRAL_TYPE_P (type))
2139 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2141 if (aarch64_return_in_msb (type))
2143 HOST_WIDE_INT size = int_size_in_bytes (type);
2145 if (size % UNITS_PER_WORD != 0)
2147 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2148 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2152 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2153 &ag_mode, &count, NULL))
2155 if (!aarch64_composite_type_p (type, mode))
2157 gcc_assert (count == 1 && mode == ag_mode);
2158 return gen_rtx_REG (mode, V0_REGNUM);
2160 else
2162 int i;
2163 rtx par;
2165 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2166 for (i = 0; i < count; i++)
2168 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2169 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2170 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2171 XVECEXP (par, 0, i) = tmp;
2173 return par;
2176 else
2177 return gen_rtx_REG (mode, R0_REGNUM);
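/* As an illustration of the PARALLEL case above (hypothetical type):
   returning "struct { float a, b; }" gives an HFA of two floats, so the
   value comes back in s0 and s1, expressed as a two-element PARALLEL
   whose EXPR_LIST offsets are 0 and 4.  */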
2180 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2181 Return true if REGNO is the number of a hard register in which the values
2182    of a called function may come back.  */
2184 static bool
2185 aarch64_function_value_regno_p (const unsigned int regno)
2187   /* A maximum of 16 bytes can be returned in the general registers.  Examples
2188      of 16-byte return values are 128-bit integers and 16-byte small
2189      structures (excluding homogeneous floating-point aggregates).  */
2190 if (regno == R0_REGNUM || regno == R1_REGNUM)
2191 return true;
2193 /* Up to four fp/simd registers can return a function value, e.g. a
2194 homogeneous floating-point aggregate having four members. */
2195 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2196 return TARGET_FLOAT;
2198 return false;
2201 /* Implement TARGET_RETURN_IN_MEMORY.
2203 If the type T of the result of a function is such that
2204 void func (T arg)
2205 would require that arg be passed as a value in a register (or set of
2206 registers) according to the parameter passing rules, then the result
2207 is returned in the same registers as would be used for such an
2208 argument. */
2210 static bool
2211 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2213 HOST_WIDE_INT size;
2214 machine_mode ag_mode;
2215 int count;
2217 if (!AGGREGATE_TYPE_P (type)
2218 && TREE_CODE (type) != COMPLEX_TYPE
2219 && TREE_CODE (type) != VECTOR_TYPE)
2220     /* Simple scalar types are always returned in registers.  */
2221 return false;
2223 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2224 type,
2225 &ag_mode,
2226 &count,
2227 NULL))
2228 return false;
2230   /* Types larger than 2 registers are returned in memory.  */
2231 size = int_size_in_bytes (type);
2232 return (size < 0 || size > 2 * UNITS_PER_WORD);
2235 static bool
2236 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2237 const_tree type, int *nregs)
2239 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2240 return aarch64_vfp_is_call_or_return_candidate (mode,
2241 type,
2242 &pcum->aapcs_vfp_rmode,
2243 nregs,
2244 NULL);
2247 /* Given MODE and TYPE of a function argument, return the alignment in
2248 bits. The idea is to suppress any stronger alignment requested by
2249 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2250 This is a helper function for local use only. */
2252 static unsigned int
2253 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2255 if (!type)
2256 return GET_MODE_ALIGNMENT (mode);
2257 if (integer_zerop (TYPE_SIZE (type)))
2258 return 0;
2260 gcc_assert (TYPE_MODE (type) == mode);
2262 if (!AGGREGATE_TYPE_P (type))
2263 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2265 if (TREE_CODE (type) == ARRAY_TYPE)
2266 return TYPE_ALIGN (TREE_TYPE (type));
2268 unsigned int alignment = 0;
2270 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2271 alignment = std::max (alignment, DECL_ALIGN (field));
2273 return alignment;
2276 /* Layout a function argument according to the AAPCS64 rules. The rule
2277 numbers refer to the rule numbers in the AAPCS64. */
2279 static void
2280 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2281 const_tree type,
2282 bool named ATTRIBUTE_UNUSED)
2284 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2285 int ncrn, nvrn, nregs;
2286 bool allocate_ncrn, allocate_nvrn;
2287 HOST_WIDE_INT size;
2289 /* We need to do this once per argument. */
2290 if (pcum->aapcs_arg_processed)
2291 return;
2293 pcum->aapcs_arg_processed = true;
2295 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2296 size
2297 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2298 UNITS_PER_WORD);
2300 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2301 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2302 mode,
2303 type,
2304 &nregs);
2306   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2307 The following code thus handles passing by SIMD/FP registers first. */
2309 nvrn = pcum->aapcs_nvrn;
2311   /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
2312      and homogeneous short-vector aggregates (HVA).  */
2313 if (allocate_nvrn)
2315 if (!TARGET_FLOAT)
2316 aarch64_err_no_fpadvsimd (mode, "argument");
2318 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2320 pcum->aapcs_nextnvrn = nvrn + nregs;
2321 if (!aarch64_composite_type_p (type, mode))
2323 gcc_assert (nregs == 1);
2324 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2326 else
2328 rtx par;
2329 int i;
2330 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2331 for (i = 0; i < nregs; i++)
2333 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2334 V0_REGNUM + nvrn + i);
2335 tmp = gen_rtx_EXPR_LIST
2336 (VOIDmode, tmp,
2337 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2338 XVECEXP (par, 0, i) = tmp;
2340 pcum->aapcs_reg = par;
2342 return;
2344 else
2346 /* C.3 NSRN is set to 8. */
2347 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2348 goto on_stack;
2352 ncrn = pcum->aapcs_ncrn;
2353 nregs = size / UNITS_PER_WORD;
2355   /* C.6 - C.9, though the sign and zero extension semantics are
2356      handled elsewhere.  This is the case where the argument fits
2357      entirely in general registers.  */
2358 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2360 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2362 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2364       /* C.8: if the argument has an alignment of 16 then the NGRN is
2365 rounded up to the next even number. */
2366 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2368 ++ncrn;
2369 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2371 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2372 A reg is still generated for it, but the caller should be smart
2373 enough not to use it. */
2374 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2376 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2378 else
2380 rtx par;
2381 int i;
2383 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2384 for (i = 0; i < nregs; i++)
2386 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2387 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2388 GEN_INT (i * UNITS_PER_WORD));
2389 XVECEXP (par, 0, i) = tmp;
2391 pcum->aapcs_reg = par;
2394 pcum->aapcs_nextncrn = ncrn + nregs;
2395 return;
2398 /* C.11 */
2399 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2401 /* The argument is passed on stack; record the needed number of words for
2402 this argument and align the total size if necessary. */
2403 on_stack:
2404 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2405 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2406 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2407 16 / UNITS_PER_WORD);
2408 return;
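/* A concrete illustration of rule C.8 above (hypothetical prototype,
   shown only in this comment):

     void f (int a, __int128 b);

   A is allocated w0, but B has 16-byte alignment, so the NGRN is
   rounded up from 1 to 2 and B is passed in the pair x2/x3, leaving
   x1 unused.  */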
2411 /* Implement TARGET_FUNCTION_ARG. */
2413 static rtx
2414 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2415 const_tree type, bool named)
2417 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2418 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2420 if (mode == VOIDmode)
2421 return NULL_RTX;
2423 aarch64_layout_arg (pcum_v, mode, type, named);
2424 return pcum->aapcs_reg;
2427 void
2428 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2429 const_tree fntype ATTRIBUTE_UNUSED,
2430 rtx libname ATTRIBUTE_UNUSED,
2431 const_tree fndecl ATTRIBUTE_UNUSED,
2432 unsigned n_named ATTRIBUTE_UNUSED)
2434 pcum->aapcs_ncrn = 0;
2435 pcum->aapcs_nvrn = 0;
2436 pcum->aapcs_nextncrn = 0;
2437 pcum->aapcs_nextnvrn = 0;
2438 pcum->pcs_variant = ARM_PCS_AAPCS64;
2439 pcum->aapcs_reg = NULL_RTX;
2440 pcum->aapcs_arg_processed = false;
2441 pcum->aapcs_stack_words = 0;
2442 pcum->aapcs_stack_size = 0;
2444 if (!TARGET_FLOAT
2445 && fndecl && TREE_PUBLIC (fndecl)
2446 && fntype && fntype != error_mark_node)
2448 const_tree type = TREE_TYPE (fntype);
2449 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2450 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2451 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2452 &mode, &nregs, NULL))
2453 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2455 return;
2458 static void
2459 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2460 machine_mode mode,
2461 const_tree type,
2462 bool named)
2464 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2465 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2467 aarch64_layout_arg (pcum_v, mode, type, named);
2468 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2469 != (pcum->aapcs_stack_words != 0));
2470 pcum->aapcs_arg_processed = false;
2471 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2472 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2473 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2474 pcum->aapcs_stack_words = 0;
2475 pcum->aapcs_reg = NULL_RTX;
2479 bool
2480 aarch64_function_arg_regno_p (unsigned regno)
2482 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2483 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2486 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2487 PARM_BOUNDARY bits of alignment, but will be given anything up
2488 to STACK_BOUNDARY bits if the type requires it. This makes sure
2489 that both before and after the layout of each argument, the Next
2490 Stacked Argument Address (NSAA) will have a minimum alignment of
2491 8 bytes. */
2493 static unsigned int
2494 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2496 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2498 if (alignment < PARM_BOUNDARY)
2499 alignment = PARM_BOUNDARY;
2500 if (alignment > STACK_BOUNDARY)
2501 alignment = STACK_BOUNDARY;
2502 return alignment;
2505 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2507 Return true if an argument passed on the stack should be padded upwards,
2508 i.e. if the least-significant byte of the stack slot has useful data.
2510 Small aggregate types are placed in the lowest memory address.
2512 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2514 bool
2515 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2517 /* On little-endian targets, the least significant byte of every stack
2518 argument is passed at the lowest byte address of the stack slot. */
2519 if (!BYTES_BIG_ENDIAN)
2520 return true;
2522 /* Otherwise, integral, floating-point and pointer types are padded downward:
2523 the least significant byte of a stack argument is passed at the highest
2524 byte address of the stack slot. */
2525 if (type
2526 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2527 || POINTER_TYPE_P (type))
2528 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2529 return false;
2531 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2532 return true;
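/* For example (illustrative only): on a big-endian target a lone
   "char" argument that ends up on the stack is an integral type and is
   therefore padded downward, so the byte itself sits at the highest
   address of its 8-byte slot; a 3-byte structure in the same position
   is padded upward and starts at the lowest address of the slot.  */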
2535 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2537    It specifies padding for the last (possibly the only) element of
2538    a block move between registers and memory.  Assuming the block
2539    is in memory, padding upward means that the last element is
2540    padded after its most significant byte, while with downward
2541    padding the last element is padded at its least significant
2542    byte side.
2544 Small aggregates and small complex types are always padded
2545 upwards.
2547 We don't need to worry about homogeneous floating-point or
2548 short-vector aggregates; their move is not affected by the
2549 padding direction determined here. Regardless of endianness,
2550 each element of such an aggregate is put in the least
2551 significant bits of a fp/simd register.
2553 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2554 register has useful data, and return the opposite if the most
2555 significant byte does. */
2557 bool
2558 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2559 bool first ATTRIBUTE_UNUSED)
2562 /* Small composite types are always padded upward. */
2563 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2565 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2566 : GET_MODE_SIZE (mode));
2567 if (size < 2 * UNITS_PER_WORD)
2568 return true;
2571 /* Otherwise, use the default padding. */
2572 return !BYTES_BIG_ENDIAN;
2575 static machine_mode
2576 aarch64_libgcc_cmp_return_mode (void)
2578 return SImode;
2581 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2583 /* We use the 12-bit shifted immediate arithmetic instructions so values
2584    must be a multiple of (1 << 12), i.e. 4096.  */
2585 #define ARITH_FACTOR 4096
2587 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2588 #error Cannot use simple address calculation for stack probing
2589 #endif
2591 /* The pair of scratch registers used for stack probing. */
2592 #define PROBE_STACK_FIRST_REG 9
2593 #define PROBE_STACK_SECOND_REG 10
2595 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2596 inclusive. These are offsets from the current stack pointer. */
2598 static void
2599 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2601 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2603 /* See the same assertion on PROBE_INTERVAL above. */
2604 gcc_assert ((first % ARITH_FACTOR) == 0);
2606 /* See if we have a constant small number of probes to generate. If so,
2607 that's the easy case. */
2608 if (size <= PROBE_INTERVAL)
2610 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2612 emit_set_insn (reg1,
2613 plus_constant (ptr_mode,
2614 stack_pointer_rtx, -(first + base)));
2615 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2618   /* The run-time loop is made up of 8 insns in the generic case while the
2619      compile-time (unrolled) sequence is made up of 4+2*(n-2) insns for n intervals.  */
2620 else if (size <= 4 * PROBE_INTERVAL)
2622 HOST_WIDE_INT i, rem;
2624 emit_set_insn (reg1,
2625 plus_constant (ptr_mode,
2626 stack_pointer_rtx,
2627 -(first + PROBE_INTERVAL)));
2628 emit_stack_probe (reg1);
2630 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2631 it exceeds SIZE. If only two probes are needed, this will not
2632 generate any code. Then probe at FIRST + SIZE. */
2633 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2635 emit_set_insn (reg1,
2636 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2637 emit_stack_probe (reg1);
2640 rem = size - (i - PROBE_INTERVAL);
2641 if (rem > 256)
2643 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2645 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2646 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2648 else
2649 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2652 /* Otherwise, do the same as above, but in a loop. Note that we must be
2653 extra careful with variables wrapping around because we might be at
2654 the very top (or the very bottom) of the address space and we have
2655 to be able to handle this case properly; in particular, we use an
2656 equality test for the loop condition. */
2657 else
2659 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2661 /* Step 1: round SIZE to the previous multiple of the interval. */
2663 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2666 /* Step 2: compute initial and final value of the loop counter. */
2668 /* TEST_ADDR = SP + FIRST. */
2669 emit_set_insn (reg1,
2670 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2672 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2673 emit_set_insn (reg2,
2674 plus_constant (ptr_mode, stack_pointer_rtx,
2675 -(first + rounded_size)));
2678 /* Step 3: the loop
2682 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2683 probe at TEST_ADDR
2685 while (TEST_ADDR != LAST_ADDR)
2687 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2688 until it is equal to ROUNDED_SIZE. */
2690 if (ptr_mode == DImode)
2691 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2692 else
2693 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2696 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2697 that SIZE is equal to ROUNDED_SIZE. */
2699 if (size != rounded_size)
2701 HOST_WIDE_INT rem = size - rounded_size;
2703 if (rem > 256)
2705 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2707 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2708 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2710 else
2711 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2715 /* Make sure nothing is scheduled before we are done. */
2716 emit_insn (gen_blockage ());
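/* A rough sketch of the first case above, assuming the default 4 KiB
   PROBE_INTERVAL and the hypothetical values FIRST = 4096 and
   SIZE = 2048: BASE is rounded up to 4096, so the emitted code is
   approximately

     sub  x9, sp, #8192
     str  xzr, [x9, #2048]

   i.e. a single probe at SP - (FIRST + SIZE).  */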
2719 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2720 absolute addresses. */
2722 const char *
2723 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2725 static int labelno = 0;
2726 char loop_lab[32];
2727 rtx xops[2];
2729 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2731 /* Loop. */
2732 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2734 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2735 xops[0] = reg1;
2736 xops[1] = GEN_INT (PROBE_INTERVAL);
2737 output_asm_insn ("sub\t%0, %0, %1", xops);
2739 /* Probe at TEST_ADDR. */
2740 output_asm_insn ("str\txzr, [%0]", xops);
2742 /* Test if TEST_ADDR == LAST_ADDR. */
2743 xops[1] = reg2;
2744 output_asm_insn ("cmp\t%0, %1", xops);
2746 /* Branch. */
2747 fputs ("\tb.ne\t", asm_out_file);
2748 assemble_name_raw (asm_out_file, loop_lab);
2749 fputc ('\n', asm_out_file);
2751 return "";
2754 static bool
2755 aarch64_frame_pointer_required (void)
2757 /* In aarch64_override_options_after_change
2758 flag_omit_leaf_frame_pointer turns off the frame pointer by
2759 default. Turn it back on now if we've not got a leaf
2760 function. */
2761 if (flag_omit_leaf_frame_pointer
2762 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2763 return true;
2765 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2766 if (crtl->calls_eh_return)
2767 return true;
2769 return false;
2772 /* Mark the registers that need to be saved by the callee and calculate
2773 the size of the callee-saved registers area and frame record (both FP
2774 and LR may be omitted). */
2775 static void
2776 aarch64_layout_frame (void)
2778 HOST_WIDE_INT offset = 0;
2779 int regno, last_fp_reg = INVALID_REGNUM;
2781 if (reload_completed && cfun->machine->frame.laid_out)
2782 return;
2784 #define SLOT_NOT_REQUIRED (-2)
2785 #define SLOT_REQUIRED (-1)
2787 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2788 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2790 /* First mark all the registers that really need to be saved... */
2791 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2792 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2794 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2795 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2797 /* ... that includes the eh data registers (if needed)... */
2798 if (crtl->calls_eh_return)
2799 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2800 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2801 = SLOT_REQUIRED;
2803 /* ... and any callee saved register that dataflow says is live. */
2804 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2805 if (df_regs_ever_live_p (regno)
2806 && (regno == R30_REGNUM
2807 || !call_used_regs[regno]))
2808 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2810 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2811 if (df_regs_ever_live_p (regno)
2812 && !call_used_regs[regno])
2814 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2815 last_fp_reg = regno;
2818 if (frame_pointer_needed)
2820 /* FP and LR are placed in the linkage record. */
2821 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2822 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2823 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2824 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2825 offset += 2 * UNITS_PER_WORD;
2828 /* Now assign stack slots for them. */
2829 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2830 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2832 cfun->machine->frame.reg_offset[regno] = offset;
2833 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2834 cfun->machine->frame.wb_candidate1 = regno;
2835 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2836 cfun->machine->frame.wb_candidate2 = regno;
2837 offset += UNITS_PER_WORD;
2840 HOST_WIDE_INT max_int_offset = offset;
2841 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2842 bool has_align_gap = offset != max_int_offset;
2844 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2845 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2847 /* If there is an alignment gap between integer and fp callee-saves,
2848 allocate the last fp register to it if possible. */
2849 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2851 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2852 break;
2855 cfun->machine->frame.reg_offset[regno] = offset;
2856 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2857 cfun->machine->frame.wb_candidate1 = regno;
2858 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2859 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2860 cfun->machine->frame.wb_candidate2 = regno;
2861 offset += UNITS_PER_WORD;
2864 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2866 cfun->machine->frame.saved_regs_size = offset;
2868 HOST_WIDE_INT varargs_and_saved_regs_size
2869 = offset + cfun->machine->frame.saved_varargs_size;
2871 cfun->machine->frame.hard_fp_offset
2872 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2873 STACK_BOUNDARY / BITS_PER_UNIT);
2875 cfun->machine->frame.frame_size
2876 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2877 + crtl->outgoing_args_size,
2878 STACK_BOUNDARY / BITS_PER_UNIT);
2880 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2882 cfun->machine->frame.initial_adjust = 0;
2883 cfun->machine->frame.final_adjust = 0;
2884 cfun->machine->frame.callee_adjust = 0;
2885 cfun->machine->frame.callee_offset = 0;
2887 HOST_WIDE_INT max_push_offset = 0;
2888 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2889 max_push_offset = 512;
2890 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2891 max_push_offset = 256;
2893 if (cfun->machine->frame.frame_size < max_push_offset
2894 && crtl->outgoing_args_size == 0)
2896 /* Simple, small frame with no outgoing arguments:
2897 stp reg1, reg2, [sp, -frame_size]!
2898 stp reg3, reg4, [sp, 16] */
2899 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2901 else if ((crtl->outgoing_args_size
2902 + cfun->machine->frame.saved_regs_size < 512)
2903 && !(cfun->calls_alloca
2904 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2906 /* Frame with small outgoing arguments:
2907 sub sp, sp, frame_size
2908 stp reg1, reg2, [sp, outgoing_args_size]
2909 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2910 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2911 cfun->machine->frame.callee_offset
2912 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2914 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2916 /* Frame with large outgoing arguments but a small local area:
2917 stp reg1, reg2, [sp, -hard_fp_offset]!
2918 stp reg3, reg4, [sp, 16]
2919 sub sp, sp, outgoing_args_size */
2920 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2921 cfun->machine->frame.final_adjust
2922 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2924 else if (!frame_pointer_needed
2925 && varargs_and_saved_regs_size < max_push_offset)
2927 /* Frame with large local area and outgoing arguments (this pushes the
2928 callee-saves first, followed by the locals and outgoing area):
2929 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2930 stp reg3, reg4, [sp, 16]
2931 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2932 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2933 cfun->machine->frame.final_adjust
2934 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2935 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2936 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2938 else
2940 /* Frame with large local area and outgoing arguments using frame pointer:
2941 sub sp, sp, hard_fp_offset
2942 stp x29, x30, [sp, 0]
2943 add x29, sp, 0
2944 stp reg3, reg4, [sp, 16]
2945 sub sp, sp, outgoing_args_size */
2946 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2947 cfun->machine->frame.final_adjust
2948 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2951 cfun->machine->frame.laid_out = true;
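/* A worked example of the selection above (illustrative numbers): with
   the frame pointer in use, 16 bytes of locals, no other callee-saves
   and no outgoing arguments, frame_size is 32, which is below
   max_push_offset, so the first case applies and the whole frame is
   allocated by the write-back push

     stp  x29, x30, [sp, #-32]!  */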
2954 /* Return true if the register REGNO is saved on entry to
2955 the current function. */
2957 static bool
2958 aarch64_register_saved_on_entry (int regno)
2960 return cfun->machine->frame.reg_offset[regno] >= 0;
2963 /* Return the next register number at or above REGNO, up to LIMIT, that
2964    the callee needs to save.  */
2966 static unsigned
2967 aarch64_next_callee_save (unsigned regno, unsigned limit)
2969 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2970 regno ++;
2971 return regno;
2974 /* Push the register number REGNO of mode MODE to the stack with write-back
2975 adjusting the stack by ADJUSTMENT. */
2977 static void
2978 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2979 HOST_WIDE_INT adjustment)
2981 rtx base_rtx = stack_pointer_rtx;
2982 rtx insn, reg, mem;
2984 reg = gen_rtx_REG (mode, regno);
2985 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2986 plus_constant (Pmode, base_rtx, -adjustment));
2987 mem = gen_rtx_MEM (mode, mem);
2989 insn = emit_move_insn (mem, reg);
2990 RTX_FRAME_RELATED_P (insn) = 1;
2993 /* Generate and return an instruction to store the pair of registers
2994 REG and REG2 of mode MODE to location BASE with write-back adjusting
2995 the stack location BASE by ADJUSTMENT. */
2997 static rtx
2998 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2999 HOST_WIDE_INT adjustment)
3001 switch (mode)
3003 case DImode:
3004 return gen_storewb_pairdi_di (base, base, reg, reg2,
3005 GEN_INT (-adjustment),
3006 GEN_INT (UNITS_PER_WORD - adjustment));
3007 case DFmode:
3008 return gen_storewb_pairdf_di (base, base, reg, reg2,
3009 GEN_INT (-adjustment),
3010 GEN_INT (UNITS_PER_WORD - adjustment));
3011 default:
3012 gcc_unreachable ();
3016 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3017 stack pointer by ADJUSTMENT. */
3019 static void
3020 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3022 rtx_insn *insn;
3023 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3025 if (regno2 == INVALID_REGNUM)
3026 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3028 rtx reg1 = gen_rtx_REG (mode, regno1);
3029 rtx reg2 = gen_rtx_REG (mode, regno2);
3031 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3032 reg2, adjustment));
3033 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3034 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3035 RTX_FRAME_RELATED_P (insn) = 1;
3038 /* Load the pair of registers REG and REG2 of mode MODE from stack location
3039    BASE, adjusting BASE by ADJUSTMENT afterwards.  */
3041 static rtx
3042 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3043 HOST_WIDE_INT adjustment)
3045 switch (mode)
3047 case DImode:
3048 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3049 GEN_INT (UNITS_PER_WORD));
3050 case DFmode:
3051 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3052 GEN_INT (UNITS_PER_WORD));
3053 default:
3054 gcc_unreachable ();
3058 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3059 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3060 into CFI_OPS. */
3062 static void
3063 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3064 rtx *cfi_ops)
3066 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3067 rtx reg1 = gen_rtx_REG (mode, regno1);
3069 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3071 if (regno2 == INVALID_REGNUM)
3073 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3074 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3075 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3077 else
3079 rtx reg2 = gen_rtx_REG (mode, regno2);
3080 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3081 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3082 reg2, adjustment));
3086 /* Generate and return a store pair instruction of mode MODE to store
3087 register REG1 to MEM1 and register REG2 to MEM2. */
3089 static rtx
3090 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3091 rtx reg2)
3093 switch (mode)
3095 case DImode:
3096 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3098 case DFmode:
3099 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3101 default:
3102 gcc_unreachable ();
3106 /* Generate and return a load pair instruction of mode MODE to load register
3107 REG1 from MEM1 and register REG2 from MEM2. */
3109 static rtx
3110 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3111 rtx mem2)
3113 switch (mode)
3115 case DImode:
3116 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3118 case DFmode:
3119 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3121 default:
3122 gcc_unreachable ();
3126 /* Emit code to save the callee-saved registers from register number START
3127 to LIMIT to the stack at the location starting at offset START_OFFSET,
3128 skipping any write-back candidates if SKIP_WB is true. */
3130 static void
3131 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3132 unsigned start, unsigned limit, bool skip_wb)
3134 rtx_insn *insn;
3135 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3136 ? gen_frame_mem : gen_rtx_MEM);
3137 unsigned regno;
3138 unsigned regno2;
3140 for (regno = aarch64_next_callee_save (start, limit);
3141 regno <= limit;
3142 regno = aarch64_next_callee_save (regno + 1, limit))
3144 rtx reg, mem;
3145 HOST_WIDE_INT offset;
3147 if (skip_wb
3148 && (regno == cfun->machine->frame.wb_candidate1
3149 || regno == cfun->machine->frame.wb_candidate2))
3150 continue;
3152 if (cfun->machine->reg_is_wrapped_separately[regno])
3153 continue;
3155 reg = gen_rtx_REG (mode, regno);
3156 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3157 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3158 offset));
3160 regno2 = aarch64_next_callee_save (regno + 1, limit);
3162 if (regno2 <= limit
3163 && !cfun->machine->reg_is_wrapped_separately[regno2]
3164 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3165 == cfun->machine->frame.reg_offset[regno2]))
3168 rtx reg2 = gen_rtx_REG (mode, regno2);
3169 rtx mem2;
3171 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3172 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3173 offset));
3174 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3175 reg2));
3177 /* The first part of a frame-related parallel insn is
3178 always assumed to be relevant to the frame
3179         calculations; subsequent parts are only
3180 frame-related if explicitly marked. */
3181 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3182 regno = regno2;
3184 else
3185 insn = emit_move_insn (mem, reg);
3187 RTX_FRAME_RELATED_P (insn) = 1;
3191 /* Emit code to restore the callee-saved registers of mode MODE from register
3192 number START up to and including LIMIT. Restore from the stack offset
3193 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3194 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3196 static void
3197 aarch64_restore_callee_saves (machine_mode mode,
3198 HOST_WIDE_INT start_offset, unsigned start,
3199 unsigned limit, bool skip_wb, rtx *cfi_ops)
3201 rtx base_rtx = stack_pointer_rtx;
3202 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3203 ? gen_frame_mem : gen_rtx_MEM);
3204 unsigned regno;
3205 unsigned regno2;
3206 HOST_WIDE_INT offset;
3208 for (regno = aarch64_next_callee_save (start, limit);
3209 regno <= limit;
3210 regno = aarch64_next_callee_save (regno + 1, limit))
3212 if (cfun->machine->reg_is_wrapped_separately[regno])
3213 continue;
3215 rtx reg, mem;
3217 if (skip_wb
3218 && (regno == cfun->machine->frame.wb_candidate1
3219 || regno == cfun->machine->frame.wb_candidate2))
3220 continue;
3222 reg = gen_rtx_REG (mode, regno);
3223 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3224 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3226 regno2 = aarch64_next_callee_save (regno + 1, limit);
3228 if (regno2 <= limit
3229 && !cfun->machine->reg_is_wrapped_separately[regno2]
3230 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3231 == cfun->machine->frame.reg_offset[regno2]))
3233 rtx reg2 = gen_rtx_REG (mode, regno2);
3234 rtx mem2;
3236 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3237 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3238 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3240 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3241 regno = regno2;
3243 else
3244 emit_move_insn (reg, mem);
3245 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3249 static inline bool
3250 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3251 HOST_WIDE_INT offset)
3253 return offset >= -256 && offset < 256;
3256 static inline bool
3257 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3259 return (offset >= 0
3260 && offset < 4096 * GET_MODE_SIZE (mode)
3261 && offset % GET_MODE_SIZE (mode) == 0);
3264 bool
3265 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3267 return (offset >= -64 * GET_MODE_SIZE (mode)
3268 && offset < 64 * GET_MODE_SIZE (mode)
3269 && offset % GET_MODE_SIZE (mode) == 0);
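/* For instance, with MODE == DImode the 7-bit signed scaled form above
   accepts offsets in [-512, 504] that are multiples of 8 (the LDP/STP
   range), the 12-bit unsigned scaled form accepts [0, 32760] in
   multiples of 8, and the 9-bit unscaled form accepts any offset in
   [-256, 255].  */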
3272 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3274 static sbitmap
3275 aarch64_get_separate_components (void)
3277 aarch64_layout_frame ();
3279 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3280 bitmap_clear (components);
3282 /* The registers we need saved to the frame. */
3283 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3284 if (aarch64_register_saved_on_entry (regno))
3286 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3287 if (!frame_pointer_needed)
3288 offset += cfun->machine->frame.frame_size
3289 - cfun->machine->frame.hard_fp_offset;
3290 /* Check that we can access the stack slot of the register with one
3291 direct load with no adjustments needed. */
3292 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3293 bitmap_set_bit (components, regno);
3296 /* Don't mess with the hard frame pointer. */
3297 if (frame_pointer_needed)
3298 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3300 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3301 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3302 /* If aarch64_layout_frame has chosen registers to store/restore with
3303    writeback, don't interfere with them to avoid having to output explicit
3304 stack adjustment instructions. */
3305 if (reg2 != INVALID_REGNUM)
3306 bitmap_clear_bit (components, reg2);
3307 if (reg1 != INVALID_REGNUM)
3308 bitmap_clear_bit (components, reg1);
3310 bitmap_clear_bit (components, LR_REGNUM);
3311 bitmap_clear_bit (components, SP_REGNUM);
3313 return components;
3316 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3318 static sbitmap
3319 aarch64_components_for_bb (basic_block bb)
3321 bitmap in = DF_LIVE_IN (bb);
3322 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3323 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3325 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3326 bitmap_clear (components);
3328 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3329 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3330 if ((!call_used_regs[regno])
3331 && (bitmap_bit_p (in, regno)
3332 || bitmap_bit_p (gen, regno)
3333 || bitmap_bit_p (kill, regno)))
3334 bitmap_set_bit (components, regno);
3336 return components;
3339 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3340 Nothing to do for aarch64. */
3342 static void
3343 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3347 /* Return the next set bit in BMP from START onwards. Return the total number
3348 of bits in BMP if no set bit is found at or after START. */
3350 static unsigned int
3351 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3353 unsigned int nbits = SBITMAP_SIZE (bmp);
3354 if (start == nbits)
3355 return start;
3357 gcc_assert (start < nbits);
3358 for (unsigned int i = start; i < nbits; i++)
3359 if (bitmap_bit_p (bmp, i))
3360 return i;
3362 return nbits;
3365 /* Do the work for aarch64_emit_prologue_components and
3366 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3367 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3368 for these components or the epilogue sequence. That is, it determines
3369 whether we should emit stores or loads and what kind of CFA notes to attach
3370 to the insns. Otherwise the logic for the two sequences is very
3371 similar. */
3373 static void
3374 aarch64_process_components (sbitmap components, bool prologue_p)
3376 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3377 ? HARD_FRAME_POINTER_REGNUM
3378 : STACK_POINTER_REGNUM);
3380 unsigned last_regno = SBITMAP_SIZE (components);
3381 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3382 rtx_insn *insn = NULL;
3384 while (regno != last_regno)
3386       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
3387 so DFmode for the vector registers is enough. */
3388 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3389 rtx reg = gen_rtx_REG (mode, regno);
3390 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3391 if (!frame_pointer_needed)
3392 offset += cfun->machine->frame.frame_size
3393 - cfun->machine->frame.hard_fp_offset;
3394 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3395 rtx mem = gen_frame_mem (mode, addr);
3397 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3398 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3399 /* No more registers to handle after REGNO.
3400 Emit a single save/restore and exit. */
3401 if (regno2 == last_regno)
3403 insn = emit_insn (set);
3404 RTX_FRAME_RELATED_P (insn) = 1;
3405 if (prologue_p)
3406 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3407 else
3408 add_reg_note (insn, REG_CFA_RESTORE, reg);
3409 break;
3412 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3413 /* The next register is not of the same class or its offset is not
3414 mergeable with the current one into a pair. */
3415 if (!satisfies_constraint_Ump (mem)
3416 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3417 || (offset2 - cfun->machine->frame.reg_offset[regno])
3418 != GET_MODE_SIZE (mode))
3420 insn = emit_insn (set);
3421 RTX_FRAME_RELATED_P (insn) = 1;
3422 if (prologue_p)
3423 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3424 else
3425 add_reg_note (insn, REG_CFA_RESTORE, reg);
3427 regno = regno2;
3428 continue;
3431 /* REGNO2 can be saved/restored in a pair with REGNO. */
3432 rtx reg2 = gen_rtx_REG (mode, regno2);
3433 if (!frame_pointer_needed)
3434 offset2 += cfun->machine->frame.frame_size
3435 - cfun->machine->frame.hard_fp_offset;
3436 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3437 rtx mem2 = gen_frame_mem (mode, addr2);
3438 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3439 : gen_rtx_SET (reg2, mem2);
3441 if (prologue_p)
3442 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3443 else
3444 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3446 RTX_FRAME_RELATED_P (insn) = 1;
3447 if (prologue_p)
3449 add_reg_note (insn, REG_CFA_OFFSET, set);
3450 add_reg_note (insn, REG_CFA_OFFSET, set2);
3452 else
3454 add_reg_note (insn, REG_CFA_RESTORE, reg);
3455 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3458 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3462 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3464 static void
3465 aarch64_emit_prologue_components (sbitmap components)
3467 aarch64_process_components (components, true);
3470 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3472 static void
3473 aarch64_emit_epilogue_components (sbitmap components)
3475 aarch64_process_components (components, false);
3478 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3480 static void
3481 aarch64_set_handled_components (sbitmap components)
3483 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3484 if (bitmap_bit_p (components, regno))
3485 cfun->machine->reg_is_wrapped_separately[regno] = true;
3488 /* AArch64 stack frames generated by this compiler look like:
3490 +-------------------------------+
3492 | incoming stack arguments |
3494 +-------------------------------+
3495 | | <-- incoming stack pointer (aligned)
3496 | callee-allocated save area |
3497 | for register varargs |
3499 +-------------------------------+
3500 | local variables | <-- frame_pointer_rtx
3502 +-------------------------------+
3503 | padding0 | \
3504 +-------------------------------+ |
3505 | callee-saved registers | | frame.saved_regs_size
3506 +-------------------------------+ |
3507 | LR' | |
3508 +-------------------------------+ |
3509 | FP' | / <- hard_frame_pointer_rtx (aligned)
3510 +-------------------------------+
3511 | dynamic allocation |
3512 +-------------------------------+
3513 | padding |
3514 +-------------------------------+
3515 | outgoing stack arguments | <-- arg_pointer
3517 +-------------------------------+
3518 | | <-- stack_pointer_rtx (aligned)
3520 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3521 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3522 unchanged. */
3524 /* Generate the prologue instructions for entry into a function.
3525 Establish the stack frame by decreasing the stack pointer with a
3526 properly calculated size and, if necessary, create a frame record
3527 filled with the values of LR and previous frame pointer. The
3528 current FP is also set up if it is in use. */
3530 void
3531 aarch64_expand_prologue (void)
3533 aarch64_layout_frame ();
3535 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3536 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3537 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3538 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3539 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3540 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3541 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3542 rtx_insn *insn;
3544 if (flag_stack_usage_info)
3545 current_function_static_stack_size = frame_size;
3547 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3549 if (crtl->is_leaf && !cfun->calls_alloca)
3551 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3552 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3553 frame_size - STACK_CHECK_PROTECT);
3555 else if (frame_size > 0)
3556 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3559 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3561 if (callee_adjust != 0)
3562 aarch64_push_regs (reg1, reg2, callee_adjust);
3564 if (frame_pointer_needed)
3566 if (callee_adjust == 0)
3567 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3568 R30_REGNUM, false);
3569 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3570 stack_pointer_rtx,
3571 GEN_INT (callee_offset)));
3572 RTX_FRAME_RELATED_P (insn) = 1;
3573 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3576 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3577 callee_adjust != 0 || frame_pointer_needed);
3578 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3579 callee_adjust != 0 || frame_pointer_needed);
3580 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
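/* For illustration (hypothetical frame, not actual compiler output):
   with initial_adjust = 0, callee_adjust = 48, a frame pointer and one
   extra callee-saved pair, the code emitted above is roughly

     stp  x29, x30, [sp, #-48]!
     add  x29, sp, #0
     stp  x19, x20, [sp, #16]

   plus a trailing "sub sp, sp, #final_adjust" when there is an
   outgoing argument area.  */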
3583 /* Return TRUE if we can use a simple_return insn.
3585    This function checks whether the callee-saved stack is empty, which
3586    means no restore actions are needed.  The pro_and_epilogue pass will
3587    use this to check whether the shrink-wrapping optimization is feasible.  */
3589 bool
3590 aarch64_use_return_insn_p (void)
3592 if (!reload_completed)
3593 return false;
3595 if (crtl->profile)
3596 return false;
3598 aarch64_layout_frame ();
3600 return cfun->machine->frame.frame_size == 0;
3603 /* Generate the epilogue instructions for returning from a function.
3604    This is almost exactly the reverse of the prologue sequence, except
3605 that we need to insert barriers to avoid scheduling loads that read
3606 from a deallocated stack, and we optimize the unwind records by
3607 emitting them all together if possible. */
3608 void
3609 aarch64_expand_epilogue (bool for_sibcall)
3611 aarch64_layout_frame ();
3613 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3614 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3615 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3616 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3617 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3618 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3619 rtx cfi_ops = NULL;
3620 rtx_insn *insn;
3622   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
3623 bool need_barrier_p = (get_frame_size ()
3624 + cfun->machine->frame.saved_varargs_size) != 0;
3626 /* Emit a barrier to prevent loads from a deallocated stack. */
3627 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3628 || crtl->calls_eh_return)
3630 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3631 need_barrier_p = false;
3634   /* Restore the stack pointer from the frame pointer if the two might
3635      not be the same.  */
3636 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3638 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3639 hard_frame_pointer_rtx,
3640 GEN_INT (-callee_offset)));
3641 /* If writeback is used when restoring callee-saves, the CFA
3642 is restored on the instruction doing the writeback. */
3643 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3645 else
3646 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3648 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3649 callee_adjust != 0, &cfi_ops);
3650 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3651 callee_adjust != 0, &cfi_ops);
3653 if (need_barrier_p)
3654 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3656 if (callee_adjust != 0)
3657 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3659 if (callee_adjust != 0 || initial_adjust > 65536)
3661 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3662 insn = get_last_insn ();
3663 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3664 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3665 RTX_FRAME_RELATED_P (insn) = 1;
3666 cfi_ops = NULL;
3669 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3671 if (cfi_ops)
3673 /* Emit delayed restores and reset the CFA to be SP. */
3674 insn = get_last_insn ();
3675 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3676 REG_NOTES (insn) = cfi_ops;
3677 RTX_FRAME_RELATED_P (insn) = 1;
3680 /* Stack adjustment for exception handler. */
3681 if (crtl->calls_eh_return)
3683 /* We need to unwind the stack by the offset computed by
3684 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3685 to be SP; letting the CFA move during this adjustment
3686 is just as correct as retaining the CFA from the body
3687 of the function. Therefore, do nothing special. */
3688 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3691 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3692 if (!for_sibcall)
3693 emit_jump_insn (ret_rtx);
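/* Continuing the hypothetical prologue example above, the matching
   epilogue is roughly

     ldp  x19, x20, [sp, #16]
     ldp  x29, x30, [sp], #48
     ret

   with the delayed REG_CFA_RESTORE notes and the new CFA attached to
   the write-back load.  */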
3696 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3697 normally or return to a previous frame after unwinding.
3699 An EH return uses a single shared return sequence. The epilogue is
3700 exactly like a normal epilogue except that it has an extra input
3701 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3702 that must be applied after the frame has been destroyed. An extra label
3703 is inserted before the epilogue which initializes this register to zero,
3704 and this is the entry point for a normal return.
3706 An actual EH return updates the return address, initializes the stack
3707 adjustment and jumps directly into the epilogue (bypassing the zeroing
3708 of the adjustment). Since the return address is typically saved on the
3709 stack when a function makes a call, the saved LR must be updated outside
3710 the epilogue.
3712 This poses problems as the store is generated well before the epilogue,
3713 so the offset of LR is not known yet. Also optimizations will remove the
3714 store as it appears dead, even after the epilogue is generated (as the
3715 base or offset for loading LR is different in many cases).
3717 To avoid these problems this implementation forces the frame pointer
3718 in eh_return functions so that the location of LR is fixed and known early.
3719 It also marks the store volatile, so no optimization is permitted to
3720 remove the store. */
3722 rtx aarch64_eh_return_handler_rtx (void)
3724 rtx tmp = gen_frame_mem (Pmode,
3725 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3727 /* Mark the store volatile, so no optimization is permitted to remove it. */
3728 MEM_VOLATILE_P (tmp) = true;
3729 return tmp;
3732 /* Output code to add DELTA to the first argument, and then jump
3733 to FUNCTION. Used for C++ multiple inheritance. */
3734 static void
3735 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3736 HOST_WIDE_INT delta,
3737 HOST_WIDE_INT vcall_offset,
3738 tree function)
3740 /* The this pointer is always in x0. Note that this differs from
3741      Arm where the this pointer may be bumped to r1 if r0 is required
3742 to return a pointer to an aggregate. On AArch64 a result value
3743 pointer will be in x8. */
3744 int this_regno = R0_REGNUM;
3745 rtx this_rtx, temp0, temp1, addr, funexp;
3746 rtx_insn *insn;
3748 reload_completed = 1;
3749 emit_note (NOTE_INSN_PROLOGUE_END);
3751 if (vcall_offset == 0)
3752 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3753 else
3755 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3757 this_rtx = gen_rtx_REG (Pmode, this_regno);
3758 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3759 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3761 addr = this_rtx;
3762 if (delta != 0)
3764 if (delta >= -256 && delta < 256)
3765 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3766 plus_constant (Pmode, this_rtx, delta));
3767 else
3768 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3771 if (Pmode == ptr_mode)
3772 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3773 else
3774 aarch64_emit_move (temp0,
3775 gen_rtx_ZERO_EXTEND (Pmode,
3776 gen_rtx_MEM (ptr_mode, addr)));
3778 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3779 addr = plus_constant (Pmode, temp0, vcall_offset);
3780 else
3782 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3783 Pmode);
3784 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3787 if (Pmode == ptr_mode)
3788         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3789 else
3790 aarch64_emit_move (temp1,
3791 gen_rtx_SIGN_EXTEND (Pmode,
3792 gen_rtx_MEM (ptr_mode, addr)));
3794 emit_insn (gen_add2_insn (this_rtx, temp1));
3797 /* Generate a tail call to the target function. */
3798 if (!TREE_USED (function))
3800 assemble_external (function);
3801 TREE_USED (function) = 1;
3803 funexp = XEXP (DECL_RTL (function), 0);
3804 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3805 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3806 SIBLING_CALL_P (insn) = 1;
3808 insn = get_insns ();
3809 shorten_branches (insn);
3810 final_start_function (insn, file, 1);
3811 final (insn, file, 1);
3812 final_end_function ();
3814 /* Stop pretending to be a post-reload pass. */
3815 reload_completed = 0;
3818 static bool
3819 aarch64_tls_referenced_p (rtx x)
3821 if (!TARGET_HAVE_TLS)
3822 return false;
3823 subrtx_iterator::array_type array;
3824 FOR_EACH_SUBRTX (iter, array, x, ALL)
3826 const_rtx x = *iter;
3827 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3828 return true;
3829 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3830 TLS offsets, not real symbol references. */
3831 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3832 iter.skip_subrtxes ();
3834 return false;
3838 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3839 a left shift of 0 or 12 bits. */
3840 bool
3841 aarch64_uimm12_shift (HOST_WIDE_INT val)
3843 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3844 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
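/* For illustration (example values only, not taken from any particular
   caller): 0xabc and 0xabc000 are accepted since their set bits fit in
   bits [0:11] or [12:23] respectively, while 0xabc00 is rejected because
   its set bits span both fields.  */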
3849 /* Return true if val is an immediate that can be loaded into a
3850 register by a MOVZ instruction. */
3851 static bool
3852 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3854 if (GET_MODE_SIZE (mode) > 4)
3856 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3857 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3858 return 1;
3860 else
3862 /* Ignore sign extension. */
3863 val &= (HOST_WIDE_INT) 0xffffffff;
3865 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3866 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
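/* For illustration (example values only): in DImode 0xffff, 0xffff0000,
   0xffff00000000 and 0xffff000000000000 are all MOVZ-loadable, since the
   set bits fit in a single 16-bit field at shift 0, 16, 32 or 48, while
   0x12345 is not because it straddles two halfwords.  Values whose
   complement has this form can still be loaded in one instruction via
   MOVN; that case is handled in aarch64_move_imm below.  */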
3869 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3871 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3873 0x0000000100000001ull,
3874 0x0001000100010001ull,
3875 0x0101010101010101ull,
3876 0x1111111111111111ull,
3877 0x5555555555555555ull,
3881 /* Return true if val is a valid bitmask immediate. */
3883 bool
3884 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3886 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3887 int bits;
3889 /* Check for a single sequence of one bits and return quickly if so.
3890 The special cases of all ones and all zeroes return false. */
3891 val = (unsigned HOST_WIDE_INT) val_in;
3892 tmp = val + (val & -val);
3894 if (tmp == (tmp & -tmp))
3895 return (val + 1) > 1;
3897 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3898 if (mode == SImode)
3899 val = (val << 32) | (val & 0xffffffff);
3901 /* Invert if the immediate doesn't start with a zero bit - this means we
3902 only need to search for sequences of one bits. */
3903 if (val & 1)
3904 val = ~val;
3906 /* Find the first set bit and set tmp to val with the first sequence of one
3907 bits removed. Return success if there is a single sequence of ones. */
3908 first_one = val & -val;
3909 tmp = val & (val + first_one);
3911 if (tmp == 0)
3912 return true;
3914 /* Find the next set bit and compute the difference in bit position. */
3915 next_one = tmp & -tmp;
3916 bits = clz_hwi (first_one) - clz_hwi (next_one);
3917 mask = val ^ tmp;
3919 /* Check the bit position difference is a power of 2, and that the first
3920 sequence of one bits fits within 'bits' bits. */
3921 if ((mask >> bits) != 0 || bits != (bits & -bits))
3922 return false;
3924 /* Check the sequence of one bits is repeated 64/bits times. */
3925 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
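/* For illustration (example values only): 0x5555555555555555 (the two-bit
   pattern 01 repeated), 0x00ff00ff00ff00ff (a run of 8 ones repeated every
   16 bits) and 0x3ffffffc (a single contiguous run of ones) are all valid
   bitmask immediates, whereas 0, ~0 and 0x12345 (scattered bits that do
   not form a repeating pattern) are not.  */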
3928 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
3929 Assumed precondition: VAL_IN is not zero. */
3931 unsigned HOST_WIDE_INT
3932 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
3934 int lowest_bit_set = ctz_hwi (val_in);
3935 int highest_bit_set = floor_log2 (val_in);
3936 gcc_assert (val_in != 0);
3938 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
3939 (HOST_WIDE_INT_1U << lowest_bit_set));
3942 /* Create a constant where the bits outside the range from the lowest set
3943 bit to the highest set bit of VAL_IN are set to 1. */
3945 unsigned HOST_WIDE_INT
3946 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
3948 return val_in | ~aarch64_and_split_imm1 (val_in);
3951 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
3953 bool
3954 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
3956 if (aarch64_bitmask_imm (val_in, mode))
3957 return false;
3959 if (aarch64_move_imm (val_in, mode))
3960 return false;
3962 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
3964 return aarch64_bitmask_imm (imm2, mode);
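/* Worked example (illustrative only): VAL_IN == 0x00ff00fe is neither a
   bitmask immediate nor a MOV immediate.  aarch64_and_split_imm1 gives
   0x00fffffe (ones from the lowest to the highest set bit) and
   aarch64_and_split_imm2 gives 0xffffffffffff00ff, which is a rotated
   contiguous run of ones and therefore a valid bitmask immediate.  Since
   imm1 & imm2 == VAL_IN, the single AND can be split into two AND
   instructions using these immediates.  */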
3967 /* Return true if val is an immediate that can be loaded into a
3968 register in a single instruction. */
3969 bool
3970 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3972 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3973 return 1;
3974 return aarch64_bitmask_imm (val, mode);
3977 static bool
3978 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3980 rtx base, offset;
3982 if (GET_CODE (x) == HIGH)
3983 return true;
3985 split_const (x, &base, &offset);
3986 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3988 if (aarch64_classify_symbol (base, offset)
3989 != SYMBOL_FORCE_TO_MEM)
3990 return true;
3991 else
3992 /* Avoid generating a 64-bit relocation in ILP32; leave it
3993 to aarch64_expand_mov_immediate to handle properly. */
3994 return mode != ptr_mode;
3997 return aarch64_tls_referenced_p (x);
4000 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4001 The expansion for a table switch is quite expensive due to the number
4002 of instructions, the table lookup and the hard-to-predict indirect jump.
4003 When optimizing for speed with -O3 enabled, use the per-core tuning if
4004 set, otherwise use tables for > 16 cases as a tradeoff between size and
4005 performance. When optimizing for size, use the default setting. */
4007 static unsigned int
4008 aarch64_case_values_threshold (void)
4010 /* Use the specified limit for the number of cases before using jump
4011 tables at higher optimization levels. */
4012 if (optimize > 2
4013 && selected_cpu->tune->max_case_values != 0)
4014 return selected_cpu->tune->max_case_values;
4015 else
4016 return optimize_size ? default_case_values_threshold () : 17;
4019 /* Return true if register REGNO is a valid index register.
4020 STRICT_P is true if REG_OK_STRICT is in effect. */
4022 bool
4023 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4025 if (!HARD_REGISTER_NUM_P (regno))
4027 if (!strict_p)
4028 return true;
4030 if (!reg_renumber)
4031 return false;
4033 regno = reg_renumber[regno];
4035 return GP_REGNUM_P (regno);
4038 /* Return true if register REGNO is a valid base register for mode MODE.
4039 STRICT_P is true if REG_OK_STRICT is in effect. */
4041 bool
4042 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4044 if (!HARD_REGISTER_NUM_P (regno))
4046 if (!strict_p)
4047 return true;
4049 if (!reg_renumber)
4050 return false;
4052 regno = reg_renumber[regno];
4055 /* The fake registers will be eliminated to either the stack or
4056 hard frame pointer, both of which are usually valid base registers.
4057 Reload deals with the cases where the eliminated form isn't valid. */
4058 return (GP_REGNUM_P (regno)
4059 || regno == SP_REGNUM
4060 || regno == FRAME_POINTER_REGNUM
4061 || regno == ARG_POINTER_REGNUM);
4064 /* Return true if X is a valid base register for mode MODE.
4065 STRICT_P is true if REG_OK_STRICT is in effect. */
4067 static bool
4068 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4070 if (!strict_p && GET_CODE (x) == SUBREG)
4071 x = SUBREG_REG (x);
4073 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4076 /* Return true if address offset is a valid index. If it is, fill in INFO
4077 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4079 static bool
4080 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4081 machine_mode mode, bool strict_p)
4083 enum aarch64_address_type type;
4084 rtx index;
4085 int shift;
4087 /* (reg:P) */
4088 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4089 && GET_MODE (x) == Pmode)
4091 type = ADDRESS_REG_REG;
4092 index = x;
4093 shift = 0;
4095 /* (sign_extend:DI (reg:SI)) */
4096 else if ((GET_CODE (x) == SIGN_EXTEND
4097 || GET_CODE (x) == ZERO_EXTEND)
4098 && GET_MODE (x) == DImode
4099 && GET_MODE (XEXP (x, 0)) == SImode)
4101 type = (GET_CODE (x) == SIGN_EXTEND)
4102 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4103 index = XEXP (x, 0);
4104 shift = 0;
4106 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4107 else if (GET_CODE (x) == MULT
4108 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4109 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4110 && GET_MODE (XEXP (x, 0)) == DImode
4111 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4112 && CONST_INT_P (XEXP (x, 1)))
4114 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4115 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4116 index = XEXP (XEXP (x, 0), 0);
4117 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4119 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4120 else if (GET_CODE (x) == ASHIFT
4121 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4122 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4123 && GET_MODE (XEXP (x, 0)) == DImode
4124 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4125 && CONST_INT_P (XEXP (x, 1)))
4127 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4128 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4129 index = XEXP (XEXP (x, 0), 0);
4130 shift = INTVAL (XEXP (x, 1));
4132 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4133 else if ((GET_CODE (x) == SIGN_EXTRACT
4134 || GET_CODE (x) == ZERO_EXTRACT)
4135 && GET_MODE (x) == DImode
4136 && GET_CODE (XEXP (x, 0)) == MULT
4137 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4138 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4140 type = (GET_CODE (x) == SIGN_EXTRACT)
4141 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4142 index = XEXP (XEXP (x, 0), 0);
4143 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4144 if (INTVAL (XEXP (x, 1)) != 32 + shift
4145 || INTVAL (XEXP (x, 2)) != 0)
4146 shift = -1;
4148 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4149 (const_int 0xffffffff<<shift)) */
4150 else if (GET_CODE (x) == AND
4151 && GET_MODE (x) == DImode
4152 && GET_CODE (XEXP (x, 0)) == MULT
4153 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4154 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4155 && CONST_INT_P (XEXP (x, 1)))
4157 type = ADDRESS_REG_UXTW;
4158 index = XEXP (XEXP (x, 0), 0);
4159 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4160 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4161 shift = -1;
4163 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4164 else if ((GET_CODE (x) == SIGN_EXTRACT
4165 || GET_CODE (x) == ZERO_EXTRACT)
4166 && GET_MODE (x) == DImode
4167 && GET_CODE (XEXP (x, 0)) == ASHIFT
4168 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4169 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4171 type = (GET_CODE (x) == SIGN_EXTRACT)
4172 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4173 index = XEXP (XEXP (x, 0), 0);
4174 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4175 if (INTVAL (XEXP (x, 1)) != 32 + shift
4176 || INTVAL (XEXP (x, 2)) != 0)
4177 shift = -1;
4179 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4180 (const_int 0xffffffff<<shift)) */
4181 else if (GET_CODE (x) == AND
4182 && GET_MODE (x) == DImode
4183 && GET_CODE (XEXP (x, 0)) == ASHIFT
4184 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4185 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4186 && CONST_INT_P (XEXP (x, 1)))
4188 type = ADDRESS_REG_UXTW;
4189 index = XEXP (XEXP (x, 0), 0);
4190 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4191 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4192 shift = -1;
4194 /* (mult:P (reg:P) (const_int scale)) */
4195 else if (GET_CODE (x) == MULT
4196 && GET_MODE (x) == Pmode
4197 && GET_MODE (XEXP (x, 0)) == Pmode
4198 && CONST_INT_P (XEXP (x, 1)))
4200 type = ADDRESS_REG_REG;
4201 index = XEXP (x, 0);
4202 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4204 /* (ashift:P (reg:P) (const_int shift)) */
4205 else if (GET_CODE (x) == ASHIFT
4206 && GET_MODE (x) == Pmode
4207 && GET_MODE (XEXP (x, 0)) == Pmode
4208 && CONST_INT_P (XEXP (x, 1)))
4210 type = ADDRESS_REG_REG;
4211 index = XEXP (x, 0);
4212 shift = INTVAL (XEXP (x, 1));
4214 else
4215 return false;
4217 if (GET_CODE (index) == SUBREG)
4218 index = SUBREG_REG (index);
4220 if ((shift == 0 ||
4221 (shift > 0 && shift <= 3
4222 && (1 << shift) == GET_MODE_SIZE (mode)))
4223 && REG_P (index)
4224 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4226 info->type = type;
4227 info->offset = index;
4228 info->shift = shift;
4229 return true;
4232 return false;
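/* Examples of index forms recognised above (illustrative only, with
   arbitrary register numbers): a plain (reg:DI x1) gives [xN, x1];
   (ashift:DI (reg:DI x1) (const_int 3)) gives [xN, x1, lsl 3] and is only
   valid for 8-byte accesses; (mult:DI (sign_extend:DI (reg:SI w1))
   (const_int 4)) gives [xN, w1, sxtw 2] and is only valid for 4-byte
   accesses, since a nonzero scale must match the access size.  */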
4235 /* Return true if MODE is one of the modes for which we
4236 support LDP/STP operations. */
4238 static bool
4239 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4241 return mode == SImode || mode == DImode
4242 || mode == SFmode || mode == DFmode
4243 || (aarch64_vector_mode_supported_p (mode)
4244 && GET_MODE_SIZE (mode) == 8);
4247 /* Return true if REGNO is a virtual pointer register, or an eliminable
4248 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4249 include stack_pointer or hard_frame_pointer. */
4250 static bool
4251 virt_or_elim_regno_p (unsigned regno)
4253 return ((regno >= FIRST_VIRTUAL_REGISTER
4254 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4255 || regno == FRAME_POINTER_REGNUM
4256 || regno == ARG_POINTER_REGNUM);
4259 /* Return true if X is a valid address for machine mode MODE. If it is,
4260 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4261 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4263 static bool
4264 aarch64_classify_address (struct aarch64_address_info *info,
4265 rtx x, machine_mode mode,
4266 RTX_CODE outer_code, bool strict_p)
4268 enum rtx_code code = GET_CODE (x);
4269 rtx op0, op1;
4271 /* On BE, we use load/store pair for all large int mode load/stores.
4272 TI/TFmode may also use a load/store pair. */
4273 bool load_store_pair_p = (outer_code == PARALLEL
4274 || mode == TImode
4275 || mode == TFmode
4276 || (BYTES_BIG_ENDIAN
4277 && aarch64_vect_struct_mode_p (mode)));
4279 bool allow_reg_index_p =
4280 !load_store_pair_p
4281 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4282 && !aarch64_vect_struct_mode_p (mode);
4284 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4285 REG addressing. */
4286 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4287 && (code != POST_INC && code != REG))
4288 return false;
4290 switch (code)
4292 case REG:
4293 case SUBREG:
4294 info->type = ADDRESS_REG_IMM;
4295 info->base = x;
4296 info->offset = const0_rtx;
4297 return aarch64_base_register_rtx_p (x, strict_p);
4299 case PLUS:
4300 op0 = XEXP (x, 0);
4301 op1 = XEXP (x, 1);
4303 if (! strict_p
4304 && REG_P (op0)
4305 && virt_or_elim_regno_p (REGNO (op0))
4306 && CONST_INT_P (op1))
4308 info->type = ADDRESS_REG_IMM;
4309 info->base = op0;
4310 info->offset = op1;
4312 return true;
4315 if (GET_MODE_SIZE (mode) != 0
4316 && CONST_INT_P (op1)
4317 && aarch64_base_register_rtx_p (op0, strict_p))
4319 HOST_WIDE_INT offset = INTVAL (op1);
4321 info->type = ADDRESS_REG_IMM;
4322 info->base = op0;
4323 info->offset = op1;
4325 /* TImode and TFmode values are allowed in both pairs of X
4326 registers and individual Q registers. The available
4327 address modes are:
4328 X,X: 7-bit signed scaled offset
4329 Q: 9-bit signed offset
4330 We conservatively require an offset representable in either mode.
4331 When performing the check for pairs of X registers i.e. LDP/STP
4332 pass down DImode since that is the natural size of the LDP/STP
4333 instruction memory accesses. */
4334 if (mode == TImode || mode == TFmode)
4335 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4336 && (offset_9bit_signed_unscaled_p (mode, offset)
4337 || offset_12bit_unsigned_scaled_p (mode, offset)));
4339 /* A 7-bit offset check because OImode will emit an ldp/stp
4340 instruction (only big endian will get here).
4341 For ldp/stp instructions, the offset is scaled for the size of a
4342 single element of the pair. */
4343 if (mode == OImode)
4344 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4346 /* Three 9/12-bit offset checks because CImode will emit three
4347 ldr/str instructions (only big endian will get here). */
4348 if (mode == CImode)
4349 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4350 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4351 || offset_12bit_unsigned_scaled_p (V16QImode,
4352 offset + 32)));
4354 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4355 instructions (only big endian will get here). */
4356 if (mode == XImode)
4357 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4358 && aarch64_offset_7bit_signed_scaled_p (TImode,
4359 offset + 32));
4361 if (load_store_pair_p)
4362 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4363 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4364 else
4365 return (offset_9bit_signed_unscaled_p (mode, offset)
4366 || offset_12bit_unsigned_scaled_p (mode, offset));
4369 if (allow_reg_index_p)
4371 /* Look for base + (scaled/extended) index register. */
4372 if (aarch64_base_register_rtx_p (op0, strict_p)
4373 && aarch64_classify_index (info, op1, mode, strict_p))
4375 info->base = op0;
4376 return true;
4378 if (aarch64_base_register_rtx_p (op1, strict_p)
4379 && aarch64_classify_index (info, op0, mode, strict_p))
4381 info->base = op1;
4382 return true;
4386 return false;
4388 case POST_INC:
4389 case POST_DEC:
4390 case PRE_INC:
4391 case PRE_DEC:
4392 info->type = ADDRESS_REG_WB;
4393 info->base = XEXP (x, 0);
4394 info->offset = NULL_RTX;
4395 return aarch64_base_register_rtx_p (info->base, strict_p);
4397 case POST_MODIFY:
4398 case PRE_MODIFY:
4399 info->type = ADDRESS_REG_WB;
4400 info->base = XEXP (x, 0);
4401 if (GET_CODE (XEXP (x, 1)) == PLUS
4402 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4403 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4404 && aarch64_base_register_rtx_p (info->base, strict_p))
4406 HOST_WIDE_INT offset;
4407 info->offset = XEXP (XEXP (x, 1), 1);
4408 offset = INTVAL (info->offset);
4410 /* TImode and TFmode values are allowed in both pairs of X
4411 registers and individual Q registers. The available
4412 address modes are:
4413 X,X: 7-bit signed scaled offset
4414 Q: 9-bit signed offset
4415 We conservatively require an offset representable in either mode. */
4417 if (mode == TImode || mode == TFmode)
4418 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4419 && offset_9bit_signed_unscaled_p (mode, offset));
4421 if (load_store_pair_p)
4422 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4423 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4424 else
4425 return offset_9bit_signed_unscaled_p (mode, offset);
4427 return false;
4429 case CONST:
4430 case SYMBOL_REF:
4431 case LABEL_REF:
4432 /* load literal: pc-relative constant pool entry. Only supported
4433 for SI mode or larger. */
4434 info->type = ADDRESS_SYMBOLIC;
4436 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4438 rtx sym, addend;
4440 split_const (x, &sym, &addend);
4441 return ((GET_CODE (sym) == LABEL_REF
4442 || (GET_CODE (sym) == SYMBOL_REF
4443 && CONSTANT_POOL_ADDRESS_P (sym)
4444 && aarch64_pcrelative_literal_loads)));
4446 return false;
4448 case LO_SUM:
4449 info->type = ADDRESS_LO_SUM;
4450 info->base = XEXP (x, 0);
4451 info->offset = XEXP (x, 1);
4452 if (allow_reg_index_p
4453 && aarch64_base_register_rtx_p (info->base, strict_p))
4455 rtx sym, offs;
4456 split_const (info->offset, &sym, &offs);
4457 if (GET_CODE (sym) == SYMBOL_REF
4458 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4460 /* The symbol and offset must be aligned to the access size. */
4461 unsigned int align;
4462 unsigned int ref_size;
4464 if (CONSTANT_POOL_ADDRESS_P (sym))
4465 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4466 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4468 tree exp = SYMBOL_REF_DECL (sym);
4469 align = TYPE_ALIGN (TREE_TYPE (exp));
4470 align = CONSTANT_ALIGNMENT (exp, align);
4472 else if (SYMBOL_REF_DECL (sym))
4473 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4474 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4475 && SYMBOL_REF_BLOCK (sym) != NULL)
4476 align = SYMBOL_REF_BLOCK (sym)->alignment;
4477 else
4478 align = BITS_PER_UNIT;
4480 ref_size = GET_MODE_SIZE (mode);
4481 if (ref_size == 0)
4482 ref_size = GET_MODE_SIZE (DImode);
4484 return ((INTVAL (offs) & (ref_size - 1)) == 0
4485 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4488 return false;
4490 default:
4491 return false;
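/* Examples of addresses accepted above (illustrative only, "foo" is a
   hypothetical symbol): (reg:DI sp) classifies as ADDRESS_REG_IMM with a
   zero offset ("[sp]"); (plus:DI (reg:DI x0) (const_int 16)) for a DImode
   access classifies as ADDRESS_REG_IMM ("[x0, 16]"); (post_inc:DI
   (reg:DI x1)) classifies as ADDRESS_REG_WB; and (lo_sum:DI (reg:DI x0)
   (symbol_ref "foo")) classifies as ADDRESS_LO_SUM ("[x0, #:lo12:foo]"),
   subject to the symbol classification and alignment checks above.  */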
4495 bool
4496 aarch64_symbolic_address_p (rtx x)
4498 rtx offset;
4500 split_const (x, &x, &offset);
4501 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4504 /* Classify the base of symbolic expression X. */
4506 enum aarch64_symbol_type
4507 aarch64_classify_symbolic_expression (rtx x)
4509 rtx offset;
4511 split_const (x, &x, &offset);
4512 return aarch64_classify_symbol (x, offset);
4516 /* Return TRUE if X is a legitimate address for accessing memory in
4517 mode MODE. */
4518 static bool
4519 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4521 struct aarch64_address_info addr;
4523 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4526 /* Return TRUE if X is a legitimate address for accessing memory in
4527 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4528 pair operation. */
4529 bool
4530 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4531 RTX_CODE outer_code, bool strict_p)
4533 struct aarch64_address_info addr;
4535 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4538 /* Split an out-of-range address displacement into a base and offset.
4539 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4540 to increase opportunities for sharing the base address across accesses of different sizes.
4541 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4542 static bool
4543 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4545 HOST_WIDE_INT offset = INTVAL (*disp);
4546 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4548 if (mode == TImode || mode == TFmode
4549 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4550 base = (offset + 0x100) & ~0x1ff;
4552 *off = GEN_INT (base);
4553 *disp = GEN_INT (offset - base);
4554 return true;
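/* Worked example (illustrative only): for an SImode access with
   displacement 0x12344 the mask is 0x3ffc, so *off becomes 0x10000 and
   *disp becomes 0x2344, which fits the 12-bit scaled immediate range of a
   32-bit LDR/STR.  */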
4557 /* Return TRUE if rtx X is immediate constant 0.0 */
4558 bool
4559 aarch64_float_const_zero_rtx_p (rtx x)
4561 if (GET_MODE (x) == VOIDmode)
4562 return false;
4564 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4565 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4566 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4569 /* Return the fixed registers used for condition codes. */
4571 static bool
4572 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4574 *p1 = CC_REGNUM;
4575 *p2 = INVALID_REGNUM;
4576 return true;
4579 /* Emit call insn with PAT and do aarch64-specific handling. */
4581 void
4582 aarch64_emit_call_insn (rtx pat)
4584 rtx insn = emit_call_insn (pat);
4586 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4587 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4588 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4591 machine_mode
4592 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4594 /* All floating point compares return CCFP if it is an equality
4595 comparison, and CCFPE otherwise. */
4596 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4598 switch (code)
4600 case EQ:
4601 case NE:
4602 case UNORDERED:
4603 case ORDERED:
4604 case UNLT:
4605 case UNLE:
4606 case UNGT:
4607 case UNGE:
4608 case UNEQ:
4609 case LTGT:
4610 return CCFPmode;
4612 case LT:
4613 case LE:
4614 case GT:
4615 case GE:
4616 return CCFPEmode;
4618 default:
4619 gcc_unreachable ();
4623 /* Equality comparisons of short modes against zero can be performed
4624 using the TST instruction with the appropriate bitmask. */
4625 if (y == const0_rtx && REG_P (x)
4626 && (code == EQ || code == NE)
4627 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4628 return CC_NZmode;
4630 /* Similarly, comparisons of zero_extends from shorter modes can
4631 be performed using an ANDS with an immediate mask. */
4632 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4633 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4634 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4635 && (code == EQ || code == NE))
4636 return CC_NZmode;
4638 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4639 && y == const0_rtx
4640 && (code == EQ || code == NE || code == LT || code == GE)
4641 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4642 || GET_CODE (x) == NEG
4643 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4644 && CONST_INT_P (XEXP (x, 2)))))
4645 return CC_NZmode;
4647 /* A compare with a shifted operand. Because of canonicalization,
4648 the comparison will have to be swapped when we emit the assembly
4649 code. */
4650 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4651 && (REG_P (y) || GET_CODE (y) == SUBREG)
4652 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4653 || GET_CODE (x) == LSHIFTRT
4654 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4655 return CC_SWPmode;
4657 /* Similarly for a negated operand, but we can only do this for
4658 equalities. */
4659 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4660 && (REG_P (y) || GET_CODE (y) == SUBREG)
4661 && (code == EQ || code == NE)
4662 && GET_CODE (x) == NEG)
4663 return CC_Zmode;
4665 /* A test for unsigned overflow. */
4666 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4667 && code == NE
4668 && GET_CODE (x) == PLUS
4669 && GET_CODE (y) == ZERO_EXTEND)
4670 return CC_Cmode;
4672 /* For everything else, return CCmode. */
4673 return CCmode;
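/* Illustrative examples of the classification above: comparing
   (plus:DI x y) against zero with EQ, NE, LT or GE gives CC_NZmode so
   that a flag-setting ADDS can be used directly; comparing a shifted
   operand such as (ashift:DI x (const_int 2)) against a plain register
   gives CC_SWPmode because the operands are swapped when the assembly is
   emitted; and an ordered floating-point LT selects CCFPEmode rather than
   CCFPmode.  */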
4676 static int
4677 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4680 aarch64_get_condition_code (rtx x)
4682 machine_mode mode = GET_MODE (XEXP (x, 0));
4683 enum rtx_code comp_code = GET_CODE (x);
4685 if (GET_MODE_CLASS (mode) != MODE_CC)
4686 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4687 return aarch64_get_condition_code_1 (mode, comp_code);
4690 static int
4691 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4693 switch (mode)
4695 case CCFPmode:
4696 case CCFPEmode:
4697 switch (comp_code)
4699 case GE: return AARCH64_GE;
4700 case GT: return AARCH64_GT;
4701 case LE: return AARCH64_LS;
4702 case LT: return AARCH64_MI;
4703 case NE: return AARCH64_NE;
4704 case EQ: return AARCH64_EQ;
4705 case ORDERED: return AARCH64_VC;
4706 case UNORDERED: return AARCH64_VS;
4707 case UNLT: return AARCH64_LT;
4708 case UNLE: return AARCH64_LE;
4709 case UNGT: return AARCH64_HI;
4710 case UNGE: return AARCH64_PL;
4711 default: return -1;
4713 break;
4715 case CCmode:
4716 switch (comp_code)
4718 case NE: return AARCH64_NE;
4719 case EQ: return AARCH64_EQ;
4720 case GE: return AARCH64_GE;
4721 case GT: return AARCH64_GT;
4722 case LE: return AARCH64_LE;
4723 case LT: return AARCH64_LT;
4724 case GEU: return AARCH64_CS;
4725 case GTU: return AARCH64_HI;
4726 case LEU: return AARCH64_LS;
4727 case LTU: return AARCH64_CC;
4728 default: return -1;
4730 break;
4732 case CC_SWPmode:
4733 switch (comp_code)
4735 case NE: return AARCH64_NE;
4736 case EQ: return AARCH64_EQ;
4737 case GE: return AARCH64_LE;
4738 case GT: return AARCH64_LT;
4739 case LE: return AARCH64_GE;
4740 case LT: return AARCH64_GT;
4741 case GEU: return AARCH64_LS;
4742 case GTU: return AARCH64_CC;
4743 case LEU: return AARCH64_CS;
4744 case LTU: return AARCH64_HI;
4745 default: return -1;
4747 break;
4749 case CC_NZmode:
4750 switch (comp_code)
4752 case NE: return AARCH64_NE;
4753 case EQ: return AARCH64_EQ;
4754 case GE: return AARCH64_PL;
4755 case LT: return AARCH64_MI;
4756 default: return -1;
4758 break;
4760 case CC_Zmode:
4761 switch (comp_code)
4763 case NE: return AARCH64_NE;
4764 case EQ: return AARCH64_EQ;
4765 default: return -1;
4767 break;
4769 case CC_Cmode:
4770 switch (comp_code)
4772 case NE: return AARCH64_CS;
4773 case EQ: return AARCH64_CC;
4774 default: return -1;
4776 break;
4778 default:
4779 return -1;
4782 return -1;
4785 bool
4786 aarch64_const_vec_all_same_in_range_p (rtx x,
4787 HOST_WIDE_INT minval,
4788 HOST_WIDE_INT maxval)
4790 HOST_WIDE_INT firstval;
4791 int count, i;
4793 if (GET_CODE (x) != CONST_VECTOR
4794 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4795 return false;
4797 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4798 if (firstval < minval || firstval > maxval)
4799 return false;
4801 count = CONST_VECTOR_NUNITS (x);
4802 for (i = 1; i < count; i++)
4803 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4804 return false;
4806 return true;
4809 bool
4810 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4812 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4816 /* N Z C V. */
4817 #define AARCH64_CC_V 1
4818 #define AARCH64_CC_C (1 << 1)
4819 #define AARCH64_CC_Z (1 << 2)
4820 #define AARCH64_CC_N (1 << 3)
4822 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4823 static const int aarch64_nzcv_codes[] =
4825 0, /* EQ, Z == 1. */
4826 AARCH64_CC_Z, /* NE, Z == 0. */
4827 0, /* CS, C == 1. */
4828 AARCH64_CC_C, /* CC, C == 0. */
4829 0, /* MI, N == 1. */
4830 AARCH64_CC_N, /* PL, N == 0. */
4831 0, /* VS, V == 1. */
4832 AARCH64_CC_V, /* VC, V == 0. */
4833 0, /* HI, C == 1 && Z == 0. */
4834 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4835 AARCH64_CC_V, /* GE, N == V. */
4836 0, /* LT, N != V. */
4837 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4838 0, /* LE, !(Z == 0 && N == V). */
4839 0, /* AL, Any. */
4840 0 /* NV, Any. */
4843 static void
4844 aarch64_print_operand (FILE *f, rtx x, int code)
4846 switch (code)
4848 /* An integer or symbol address without a preceding # sign. */
4849 case 'c':
4850 switch (GET_CODE (x))
4852 case CONST_INT:
4853 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4854 break;
4856 case SYMBOL_REF:
4857 output_addr_const (f, x);
4858 break;
4860 case CONST:
4861 if (GET_CODE (XEXP (x, 0)) == PLUS
4862 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4864 output_addr_const (f, x);
4865 break;
4867 /* Fall through. */
4869 default:
4870 output_operand_lossage ("Unsupported operand for code '%c'", code);
4872 break;
4874 case 'e':
4875 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4877 int n;
4879 if (!CONST_INT_P (x)
4880 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4882 output_operand_lossage ("invalid operand for '%%%c'", code);
4883 return;
4886 switch (n)
4888 case 3:
4889 fputc ('b', f);
4890 break;
4891 case 4:
4892 fputc ('h', f);
4893 break;
4894 case 5:
4895 fputc ('w', f);
4896 break;
4897 default:
4898 output_operand_lossage ("invalid operand for '%%%c'", code);
4899 return;
4902 break;
4904 case 'p':
4906 int n;
4908 /* Print N such that 2^N == X. */
4909 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4911 output_operand_lossage ("invalid operand for '%%%c'", code);
4912 return;
4915 asm_fprintf (f, "%d", n);
4917 break;
4919 case 'P':
4920 /* Print the number of non-zero bits in X (a const_int). */
4921 if (!CONST_INT_P (x))
4923 output_operand_lossage ("invalid operand for '%%%c'", code);
4924 return;
4927 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4928 break;
4930 case 'H':
4931 /* Print the higher numbered register of a pair (TImode) of regs. */
4932 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4934 output_operand_lossage ("invalid operand for '%%%c'", code);
4935 return;
4938 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4939 break;
4941 case 'M':
4942 case 'm':
4944 int cond_code;
4945 /* Print a condition (eq, ne, etc) or its inverse. */
4947 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4948 if (x == const_true_rtx)
4950 if (code == 'M')
4951 fputs ("nv", f);
4952 return;
4955 if (!COMPARISON_P (x))
4957 output_operand_lossage ("invalid operand for '%%%c'", code);
4958 return;
4961 cond_code = aarch64_get_condition_code (x);
4962 gcc_assert (cond_code >= 0);
4963 if (code == 'M')
4964 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4965 fputs (aarch64_condition_codes[cond_code], f);
4967 break;
4969 case 'b':
4970 case 'h':
4971 case 's':
4972 case 'd':
4973 case 'q':
4974 /* Print a scalar FP/SIMD register name. */
4975 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4977 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4978 return;
4980 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4981 break;
4983 case 'S':
4984 case 'T':
4985 case 'U':
4986 case 'V':
4987 /* Print the first FP/SIMD register name in a list. */
4988 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4990 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4991 return;
4993 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4994 break;
4996 case 'R':
4997 /* Print a scalar FP/SIMD register name + 1. */
4998 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5000 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5001 return;
5003 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5004 break;
5006 case 'X':
5007 /* Print bottom 16 bits of integer constant in hex. */
5008 if (!CONST_INT_P (x))
5010 output_operand_lossage ("invalid operand for '%%%c'", code);
5011 return;
5013 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5014 break;
5016 case 'w':
5017 case 'x':
5018 /* Print a general register name or the zero register (32-bit or
5019 64-bit). */
5020 if (x == const0_rtx
5021 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5023 asm_fprintf (f, "%czr", code);
5024 break;
5027 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5029 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5030 break;
5033 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5035 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5036 break;
5039 /* Fall through */
5041 case 0:
5042 /* Print a normal operand, if it's a general register, then we
5043 assume DImode. */
5044 if (x == NULL)
5046 output_operand_lossage ("missing operand");
5047 return;
5050 switch (GET_CODE (x))
5052 case REG:
5053 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5054 break;
5056 case MEM:
5057 output_address (GET_MODE (x), XEXP (x, 0));
5058 break;
5060 case CONST:
5061 case LABEL_REF:
5062 case SYMBOL_REF:
5063 output_addr_const (asm_out_file, x);
5064 break;
5066 case CONST_INT:
5067 asm_fprintf (f, "%wd", INTVAL (x));
5068 break;
5070 case CONST_VECTOR:
5071 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5073 gcc_assert (
5074 aarch64_const_vec_all_same_in_range_p (x,
5075 HOST_WIDE_INT_MIN,
5076 HOST_WIDE_INT_MAX));
5077 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5079 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5081 fputc ('0', f);
5083 else
5084 gcc_unreachable ();
5085 break;
5087 case CONST_DOUBLE:
5088 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5089 be getting CONST_DOUBLEs holding integers. */
5090 gcc_assert (GET_MODE (x) != VOIDmode);
5091 if (aarch64_float_const_zero_rtx_p (x))
5093 fputc ('0', f);
5094 break;
5096 else if (aarch64_float_const_representable_p (x))
5098 #define buf_size 20
5099 char float_buf[buf_size] = {'\0'};
5100 real_to_decimal_for_mode (float_buf,
5101 CONST_DOUBLE_REAL_VALUE (x),
5102 buf_size, buf_size,
5103 1, GET_MODE (x));
5104 asm_fprintf (asm_out_file, "%s", float_buf);
5105 break;
5106 #undef buf_size
5108 output_operand_lossage ("invalid constant");
5109 return;
5110 default:
5111 output_operand_lossage ("invalid operand");
5112 return;
5114 break;
5116 case 'A':
5117 if (GET_CODE (x) == HIGH)
5118 x = XEXP (x, 0);
5120 switch (aarch64_classify_symbolic_expression (x))
5122 case SYMBOL_SMALL_GOT_4G:
5123 asm_fprintf (asm_out_file, ":got:");
5124 break;
5126 case SYMBOL_SMALL_TLSGD:
5127 asm_fprintf (asm_out_file, ":tlsgd:");
5128 break;
5130 case SYMBOL_SMALL_TLSDESC:
5131 asm_fprintf (asm_out_file, ":tlsdesc:");
5132 break;
5134 case SYMBOL_SMALL_TLSIE:
5135 asm_fprintf (asm_out_file, ":gottprel:");
5136 break;
5138 case SYMBOL_TLSLE24:
5139 asm_fprintf (asm_out_file, ":tprel:");
5140 break;
5142 case SYMBOL_TINY_GOT:
5143 gcc_unreachable ();
5144 break;
5146 default:
5147 break;
5149 output_addr_const (asm_out_file, x);
5150 break;
5152 case 'L':
5153 switch (aarch64_classify_symbolic_expression (x))
5155 case SYMBOL_SMALL_GOT_4G:
5156 asm_fprintf (asm_out_file, ":lo12:");
5157 break;
5159 case SYMBOL_SMALL_TLSGD:
5160 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5161 break;
5163 case SYMBOL_SMALL_TLSDESC:
5164 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5165 break;
5167 case SYMBOL_SMALL_TLSIE:
5168 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5169 break;
5171 case SYMBOL_TLSLE12:
5172 asm_fprintf (asm_out_file, ":tprel_lo12:");
5173 break;
5175 case SYMBOL_TLSLE24:
5176 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5177 break;
5179 case SYMBOL_TINY_GOT:
5180 asm_fprintf (asm_out_file, ":got:");
5181 break;
5183 case SYMBOL_TINY_TLSIE:
5184 asm_fprintf (asm_out_file, ":gottprel:");
5185 break;
5187 default:
5188 break;
5190 output_addr_const (asm_out_file, x);
5191 break;
5193 case 'G':
5195 switch (aarch64_classify_symbolic_expression (x))
5197 case SYMBOL_TLSLE24:
5198 asm_fprintf (asm_out_file, ":tprel_hi12:");
5199 break;
5200 default:
5201 break;
5203 output_addr_const (asm_out_file, x);
5204 break;
5206 case 'k':
5208 HOST_WIDE_INT cond_code;
5209 /* Print nzcv. */
5211 if (!CONST_INT_P (x))
5213 output_operand_lossage ("invalid operand for '%%%c'", code);
5214 return;
5217 cond_code = INTVAL (x);
5218 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5219 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5221 break;
5223 default:
5224 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5225 return;
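/* Illustrative uses of the modifiers above, as they might appear in insn
   templates (hypothetical operands): "%w0" prints w3 when operand 0 is
   register x3 and wzr when it is const0_rtx; "%x0" prints the x-register
   form; "%d1" and "%s1" print the scalar FP/SIMD names (d5, s5) of a
   V-register operand; "%H2" prints the higher register of a TImode pair;
   and "%m3"/"%M3" print a condition such as "eq" or its inverse "ne".  */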
5229 static void
5230 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5232 struct aarch64_address_info addr;
5234 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5235 switch (addr.type)
5237 case ADDRESS_REG_IMM:
5238 if (addr.offset == const0_rtx)
5239 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5240 else
5241 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5242 INTVAL (addr.offset));
5243 return;
5245 case ADDRESS_REG_REG:
5246 if (addr.shift == 0)
5247 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5248 reg_names [REGNO (addr.offset)]);
5249 else
5250 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5251 reg_names [REGNO (addr.offset)], addr.shift);
5252 return;
5254 case ADDRESS_REG_UXTW:
5255 if (addr.shift == 0)
5256 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5257 REGNO (addr.offset) - R0_REGNUM);
5258 else
5259 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5260 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5261 return;
5263 case ADDRESS_REG_SXTW:
5264 if (addr.shift == 0)
5265 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5266 REGNO (addr.offset) - R0_REGNUM);
5267 else
5268 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5269 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5270 return;
5272 case ADDRESS_REG_WB:
5273 switch (GET_CODE (x))
5275 case PRE_INC:
5276 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5277 GET_MODE_SIZE (mode));
5278 return;
5279 case POST_INC:
5280 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5281 GET_MODE_SIZE (mode));
5282 return;
5283 case PRE_DEC:
5284 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5285 GET_MODE_SIZE (mode));
5286 return;
5287 case POST_DEC:
5288 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5289 GET_MODE_SIZE (mode));
5290 return;
5291 case PRE_MODIFY:
5292 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5293 INTVAL (addr.offset));
5294 return;
5295 case POST_MODIFY:
5296 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5297 INTVAL (addr.offset));
5298 return;
5299 default:
5300 break;
5302 break;
5304 case ADDRESS_LO_SUM:
5305 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5306 output_addr_const (f, addr.offset);
5307 asm_fprintf (f, "]");
5308 return;
5310 case ADDRESS_SYMBOLIC:
5311 break;
5314 output_addr_const (f, x);
5317 bool
5318 aarch64_label_mentioned_p (rtx x)
5320 const char *fmt;
5321 int i;
5323 if (GET_CODE (x) == LABEL_REF)
5324 return true;
5326 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5327 referencing instruction, but they are constant offsets, not
5328 symbols. */
5329 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5330 return false;
5332 fmt = GET_RTX_FORMAT (GET_CODE (x));
5333 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5335 if (fmt[i] == 'E')
5337 int j;
5339 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5340 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5341 return 1;
5343 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5344 return 1;
5347 return 0;
5350 /* Implement REGNO_REG_CLASS. */
5352 enum reg_class
5353 aarch64_regno_regclass (unsigned regno)
5355 if (GP_REGNUM_P (regno))
5356 return GENERAL_REGS;
5358 if (regno == SP_REGNUM)
5359 return STACK_REG;
5361 if (regno == FRAME_POINTER_REGNUM
5362 || regno == ARG_POINTER_REGNUM)
5363 return POINTER_REGS;
5365 if (FP_REGNUM_P (regno))
5366 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5368 return NO_REGS;
5371 static rtx
5372 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5374 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5375 where mask is selected by alignment and size of the offset.
5376 We try to pick as large a range for the offset as possible to
5377 maximize the chance of a CSE. However, for aligned addresses
5378 we limit the range to 4k so that structures with different sized
5379 elements are likely to use the same base. We need to be careful
5380 not to split a CONST for some forms of address expression, otherwise
5381 it will generate sub-optimal code. */
5383 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5385 rtx base = XEXP (x, 0);
5386 rtx offset_rtx = XEXP (x, 1);
5387 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5389 if (GET_CODE (base) == PLUS)
5391 rtx op0 = XEXP (base, 0);
5392 rtx op1 = XEXP (base, 1);
5394 /* Force any scaling into a temp for CSE. */
5395 op0 = force_reg (Pmode, op0);
5396 op1 = force_reg (Pmode, op1);
5398 /* Let the pointer register be in op0. */
5399 if (REG_POINTER (op1))
5400 std::swap (op0, op1);
5402 /* If the pointer is virtual or frame related, then we know that
5403 virtual register instantiation or register elimination is going
5404 to apply a second constant. We want the two constants folded
5405 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5406 if (virt_or_elim_regno_p (REGNO (op0)))
5408 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5409 NULL_RTX, true, OPTAB_DIRECT);
5410 return gen_rtx_PLUS (Pmode, base, op1);
5413 /* Otherwise, in order to encourage CSE (and thence loop strength
5414 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
5415 base = expand_binop (Pmode, add_optab, op0, op1,
5416 NULL_RTX, true, OPTAB_DIRECT);
5417 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5420 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5421 HOST_WIDE_INT base_offset;
5422 if (GET_MODE_SIZE (mode) > 16)
5423 base_offset = (offset + 0x400) & ~0x7f0;
5424 /* For offsets that aren't a multiple of the access size, the limit is
5425 -256...255. */
5426 else if (offset & (GET_MODE_SIZE (mode) - 1))
5428 base_offset = (offset + 0x100) & ~0x1ff;
5430 /* BLKmode typically uses LDP of X-registers. */
5431 if (mode == BLKmode)
5432 base_offset = (offset + 512) & ~0x3ff;
5434 /* Small negative offsets are supported. */
5435 else if (IN_RANGE (offset, -256, 0))
5436 base_offset = 0;
5437 else if (mode == TImode || mode == TFmode)
5438 base_offset = (offset + 0x100) & ~0x1ff;
5439 /* Use a 12-bit offset scaled by the access size. */
5440 else
5441 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5443 if (base_offset != 0)
5445 base = plus_constant (Pmode, base, base_offset);
5446 base = force_operand (base, NULL_RTX);
5447 return plus_constant (Pmode, base, offset - base_offset);
5451 return x;
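/* Worked example (illustrative only): for a DImode access to x1 + 0x13008
   the split above produces base = x1 + 0x10000 and a residual offset of
   0x3008, which fits the 12-bit scaled range of an 8-byte LDR/STR, so the
   adjusted base is likely to be shared by neighbouring accesses.  */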
5454 /* Return the reload icode required for a constant pool in mode. */
5455 static enum insn_code
5456 aarch64_constant_pool_reload_icode (machine_mode mode)
5458 switch (mode)
5460 case SFmode:
5461 return CODE_FOR_aarch64_reload_movcpsfdi;
5463 case DFmode:
5464 return CODE_FOR_aarch64_reload_movcpdfdi;
5466 case TFmode:
5467 return CODE_FOR_aarch64_reload_movcptfdi;
5469 case V8QImode:
5470 return CODE_FOR_aarch64_reload_movcpv8qidi;
5472 case V16QImode:
5473 return CODE_FOR_aarch64_reload_movcpv16qidi;
5475 case V4HImode:
5476 return CODE_FOR_aarch64_reload_movcpv4hidi;
5478 case V8HImode:
5479 return CODE_FOR_aarch64_reload_movcpv8hidi;
5481 case V2SImode:
5482 return CODE_FOR_aarch64_reload_movcpv2sidi;
5484 case V4SImode:
5485 return CODE_FOR_aarch64_reload_movcpv4sidi;
5487 case V2DImode:
5488 return CODE_FOR_aarch64_reload_movcpv2didi;
5490 case V2DFmode:
5491 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5493 default:
5494 gcc_unreachable ();
5497 gcc_unreachable ();
5499 static reg_class_t
5500 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5501 reg_class_t rclass,
5502 machine_mode mode,
5503 secondary_reload_info *sri)
5506 /* If we have to disable direct literal pool loads and stores because the
5507 function is too big, then we need a scratch register. */
5508 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5509 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5510 || targetm.vector_mode_supported_p (GET_MODE (x)))
5511 && !aarch64_pcrelative_literal_loads)
5513 sri->icode = aarch64_constant_pool_reload_icode (mode);
5514 return NO_REGS;
5517 /* Without the TARGET_SIMD instructions we cannot move a Q register
5518 to a Q register directly. We need a scratch. */
5519 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5520 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5521 && reg_class_subset_p (rclass, FP_REGS))
5523 if (mode == TFmode)
5524 sri->icode = CODE_FOR_aarch64_reload_movtf;
5525 else if (mode == TImode)
5526 sri->icode = CODE_FOR_aarch64_reload_movti;
5527 return NO_REGS;
5530 /* A TFmode or TImode memory access should be handled via FP_REGS
5531 because AArch64 has richer addressing modes for LDR/STR instructions
5532 than for LDP/STP instructions. */
5533 if (TARGET_FLOAT && rclass == GENERAL_REGS
5534 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5535 return FP_REGS;
5537 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5538 return GENERAL_REGS;
5540 return NO_REGS;
5543 static bool
5544 aarch64_can_eliminate (const int from, const int to)
5546 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5547 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5549 if (frame_pointer_needed)
5551 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5552 return true;
5553 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5554 return false;
5555 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5556 && !cfun->calls_alloca)
5557 return true;
5558 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5559 return true;
5561 return false;
5563 else
5565 /* If we decided that we didn't need a leaf frame pointer but then used
5566 LR in the function, then we'll want a frame pointer after all, so
5567 prevent this elimination to ensure a frame pointer is used. */
5568 if (to == STACK_POINTER_REGNUM
5569 && flag_omit_leaf_frame_pointer
5570 && df_regs_ever_live_p (LR_REGNUM))
5571 return false;
5574 return true;
5577 HOST_WIDE_INT
5578 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5580 aarch64_layout_frame ();
5582 if (to == HARD_FRAME_POINTER_REGNUM)
5584 if (from == ARG_POINTER_REGNUM)
5585 return cfun->machine->frame.hard_fp_offset;
5587 if (from == FRAME_POINTER_REGNUM)
5588 return cfun->machine->frame.hard_fp_offset
5589 - cfun->machine->frame.locals_offset;
5592 if (to == STACK_POINTER_REGNUM)
5594 if (from == FRAME_POINTER_REGNUM)
5595 return cfun->machine->frame.frame_size
5596 - cfun->machine->frame.locals_offset;
5599 return cfun->machine->frame.frame_size;
5602 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5603 previous frame. */
5606 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5608 if (count != 0)
5609 return const0_rtx;
5610 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5614 static void
5615 aarch64_asm_trampoline_template (FILE *f)
5617 if (TARGET_ILP32)
5619 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5620 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5622 else
5624 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5625 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5627 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5628 assemble_aligned_integer (4, const0_rtx);
5629 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5630 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5633 static void
5634 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5636 rtx fnaddr, mem, a_tramp;
5637 const int tramp_code_sz = 16;
5639 /* Don't need to copy the trailing D-words, we fill those in below. */
5640 emit_block_move (m_tramp, assemble_trampoline_template (),
5641 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5642 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5643 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5644 if (GET_MODE (fnaddr) != ptr_mode)
5645 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5646 emit_move_insn (mem, fnaddr);
5648 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5649 emit_move_insn (mem, chain_value);
5651 /* XXX We should really define a "clear_cache" pattern and use
5652 gen_clear_cache(). */
5653 a_tramp = XEXP (m_tramp, 0);
5654 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5655 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5656 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5657 ptr_mode);
5660 static unsigned char
5661 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5663 switch (regclass)
5665 case CALLER_SAVE_REGS:
5666 case POINTER_REGS:
5667 case GENERAL_REGS:
5668 case ALL_REGS:
5669 case FP_REGS:
5670 case FP_LO_REGS:
5671 return
5672 aarch64_vector_mode_p (mode)
5673 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5674 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5675 case STACK_REG:
5676 return 1;
5678 case NO_REGS:
5679 return 0;
5681 default:
5682 break;
5684 gcc_unreachable ();
5687 static reg_class_t
5688 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5690 if (regclass == POINTER_REGS)
5691 return GENERAL_REGS;
5693 if (regclass == STACK_REG)
5695 if (REG_P(x)
5696 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5697 return regclass;
5699 return NO_REGS;
5702 /* If it's an integer immediate that MOVI can't handle, then
5703 FP_REGS is not an option, so we return NO_REGS instead. */
5704 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5705 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5706 return NO_REGS;
5708 /* Register elimination can result in a request for
5709 SP+constant->FP_REGS. We cannot support such operations, which
5710 use SP as the source and an FP_REG as the destination, so reject
5711 them right now. */
5712 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5714 rtx lhs = XEXP (x, 0);
5716 /* Look through a possible SUBREG introduced by ILP32. */
5717 if (GET_CODE (lhs) == SUBREG)
5718 lhs = SUBREG_REG (lhs);
5720 gcc_assert (REG_P (lhs));
5721 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5722 POINTER_REGS));
5723 return NO_REGS;
5726 return regclass;
5729 void
5730 aarch64_asm_output_labelref (FILE* f, const char *name)
5732 asm_fprintf (f, "%U%s", name);
5735 static void
5736 aarch64_elf_asm_constructor (rtx symbol, int priority)
5738 if (priority == DEFAULT_INIT_PRIORITY)
5739 default_ctor_section_asm_out_constructor (symbol, priority);
5740 else
5742 section *s;
5743 char buf[18];
5744 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5745 s = get_section (buf, SECTION_WRITE, NULL);
5746 switch_to_section (s);
5747 assemble_align (POINTER_SIZE);
5748 assemble_aligned_integer (POINTER_BYTES, symbol);
5752 static void
5753 aarch64_elf_asm_destructor (rtx symbol, int priority)
5755 if (priority == DEFAULT_INIT_PRIORITY)
5756 default_dtor_section_asm_out_destructor (symbol, priority);
5757 else
5759 section *s;
5760 char buf[18];
5761 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5762 s = get_section (buf, SECTION_WRITE, NULL);
5763 switch_to_section (s);
5764 assemble_align (POINTER_SIZE);
5765 assemble_aligned_integer (POINTER_BYTES, symbol);
5769 const char*
5770 aarch64_output_casesi (rtx *operands)
5772 char buf[100];
5773 char label[100];
5774 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5775 int index;
5776 static const char *const patterns[4][2] =
5779 "ldrb\t%w3, [%0,%w1,uxtw]",
5780 "add\t%3, %4, %w3, sxtb #2"
5783 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5784 "add\t%3, %4, %w3, sxth #2"
5787 "ldr\t%w3, [%0,%w1,uxtw #2]",
5788 "add\t%3, %4, %w3, sxtw #2"
5790 /* We assume that DImode is only generated when not optimizing and
5791 that we don't really need 64-bit address offsets. That would
5792 imply an object file with 8GB of code in a single function! */
5794 "ldr\t%w3, [%0,%w1,uxtw #2]",
5795 "add\t%3, %4, %w3, sxtw #2"
5799 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5801 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5803 gcc_assert (index >= 0 && index <= 3);
5805 /* Need to implement table size reduction, by changing the code below. */
5806 output_asm_insn (patterns[index][0], operands);
5807 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5808 snprintf (buf, sizeof (buf),
5809 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5810 output_asm_insn (buf, operands);
5811 output_asm_insn (patterns[index][1], operands);
5812 output_asm_insn ("br\t%3", operands);
5813 assemble_label (asm_out_file, label);
5814 return "";
5818 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5819 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5820 operator. */
5823 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5825 if (shift >= 0 && shift <= 3)
5827 int size;
5828 for (size = 8; size <= 32; size *= 2)
5830 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5831 if (mask == bits << shift)
5832 return size;
5835 return 0;
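/* For illustration (example values only): aarch64_uxt_size (1, 0x1fe)
   returns 8 (a UXTB shifted left by 1), aarch64_uxt_size (2, 0x3fffc)
   returns 16 (UXTH, LSL 2), and aarch64_uxt_size (0, 0xffffffff) returns
   32 (UXTW); shift/mask combinations that do not match one of these
   patterns return 0.  */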
5838 /* Constant pools are per-function only when PC-relative
5839 literal loads are enabled or we are in the large memory
5840 model. */
5842 static inline bool
5843 aarch64_can_use_per_function_literal_pools_p (void)
5845 return (aarch64_pcrelative_literal_loads
5846 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5849 static bool
5850 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5852 /* Fixme: In an ideal world this would work similarly
5853 to the logic in aarch64_select_rtx_section, but this
5854 breaks bootstrap in gccgo. For now we work around
5855 this by returning false here. */
5856 return false;
5859 /* Select appropriate section for constants depending
5860 on where we place literal pools. */
5862 static section *
5863 aarch64_select_rtx_section (machine_mode mode,
5864 rtx x,
5865 unsigned HOST_WIDE_INT align)
5867 if (aarch64_can_use_per_function_literal_pools_p ())
5868 return function_section (current_function_decl);
5870 return default_elf_select_rtx_section (mode, x, align);
5873 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5874 void
5875 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5876 HOST_WIDE_INT offset)
5878 /* When using per-function literal pools, we must ensure that any code
5879 section is aligned to the minimal instruction length, lest we get
5880 errors from the assembler re "unaligned instructions". */
5881 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5882 ASM_OUTPUT_ALIGN (f, 2);
5885 /* Costs. */
5887 /* Helper function for rtx cost calculation. Strip a shift expression
5888 from X. Returns the inner operand if successful, or the original
5889 expression on failure. */
5890 static rtx
5891 aarch64_strip_shift (rtx x)
5893 rtx op = x;
5895 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5896 we can convert both to ROR during final output. */
5897 if ((GET_CODE (op) == ASHIFT
5898 || GET_CODE (op) == ASHIFTRT
5899 || GET_CODE (op) == LSHIFTRT
5900 || GET_CODE (op) == ROTATERT
5901 || GET_CODE (op) == ROTATE)
5902 && CONST_INT_P (XEXP (op, 1)))
5903 return XEXP (op, 0);
5905 if (GET_CODE (op) == MULT
5906 && CONST_INT_P (XEXP (op, 1))
5907 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5908 return XEXP (op, 0);
5910 return x;
5913 /* Helper function for rtx cost calculation. Strip an extend
5914 expression from X. Returns the inner operand if successful, or the
5915 original expression on failure. We deal with a number of possible
5916 canonicalization variations here. */
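/* For example, the zero-extend-as-AND form
   (and (mult (reg:DI x0) (const_int 4)) (const_int 1020)) is stripped to
   (reg:DI x0), since 1020 is 0xff shifted left by two (a UXTB #2 operand).
   Illustrative only; the exact RTL shapes accepted are listed below.  */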
5917 static rtx
5918 aarch64_strip_extend (rtx x)
5920 rtx op = x;
5922 /* Zero and sign extraction of a widened value. */
5923 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5924 && XEXP (op, 2) == const0_rtx
5925 && GET_CODE (XEXP (op, 0)) == MULT
5926 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5927 XEXP (op, 1)))
5928 return XEXP (XEXP (op, 0), 0);
5930 /* It can also be represented (for zero-extend) as an AND with an
5931 immediate. */
5932 if (GET_CODE (op) == AND
5933 && GET_CODE (XEXP (op, 0)) == MULT
5934 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5935 && CONST_INT_P (XEXP (op, 1))
5936 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5937 INTVAL (XEXP (op, 1))) != 0)
5938 return XEXP (XEXP (op, 0), 0);
5940 /* Now handle extended register, as this may also have an optional
5941 left shift by 1..4. */
5942 if (GET_CODE (op) == ASHIFT
5943 && CONST_INT_P (XEXP (op, 1))
5944 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5945 op = XEXP (op, 0);
5947 if (GET_CODE (op) == ZERO_EXTEND
5948 || GET_CODE (op) == SIGN_EXTEND)
5949 op = XEXP (op, 0);
5951 if (op != x)
5952 return op;
5954 return x;
5957 /* Return true iff CODE is a shift supported in combination
5958 with arithmetic instructions. */
5960 static bool
5961 aarch64_shift_p (enum rtx_code code)
5963 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5966 /* Helper function for rtx cost calculation. Calculate the cost of
5967 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5968 Return the calculated cost of the expression, recursing manually into
5969 operands where needed. */
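/* For example, inside a PLUS a (mult X (const_int 8)) is treated as an
   add-with-shift (shift by three) rather than as a multiply-add, since
   that is how it will be canonicalized and emitted.  */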
5971 static int
5972 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5974 rtx op0, op1;
5975 const struct cpu_cost_table *extra_cost
5976 = aarch64_tune_params.insn_extra_cost;
5977 int cost = 0;
5978 bool compound_p = (outer == PLUS || outer == MINUS);
5979 machine_mode mode = GET_MODE (x);
5981 gcc_checking_assert (code == MULT);
5983 op0 = XEXP (x, 0);
5984 op1 = XEXP (x, 1);
5986 if (VECTOR_MODE_P (mode))
5987 mode = GET_MODE_INNER (mode);
5989 /* Integer multiply/fma. */
5990 if (GET_MODE_CLASS (mode) == MODE_INT)
5992 /* The multiply will be canonicalized as a shift; cost it as such. */
5993 if (aarch64_shift_p (GET_CODE (x))
5994 || (CONST_INT_P (op1)
5995 && exact_log2 (INTVAL (op1)) > 0))
5997 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5998 || GET_CODE (op0) == SIGN_EXTEND;
5999 if (speed)
6001 if (compound_p)
6003 if (REG_P (op1))
6004 /* ARITH + shift-by-register. */
6005 cost += extra_cost->alu.arith_shift_reg;
6006 else if (is_extend)
6007 /* ARITH + extended register. We don't have a cost field
6008 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6009 cost += extra_cost->alu.extend_arith;
6010 else
6011 /* ARITH + shift-by-immediate. */
6012 cost += extra_cost->alu.arith_shift;
6014 else
6015 /* LSL (immediate). */
6016 cost += extra_cost->alu.shift;
6019 /* Strip extends as we will have costed them in the case above. */
6020 if (is_extend)
6021 op0 = aarch64_strip_extend (op0);
6023 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6025 return cost;
6028 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6029 compound and let the cases below handle it. After all, MNEG is a
6030 special-case alias of MSUB. */
6031 if (GET_CODE (op0) == NEG)
6033 op0 = XEXP (op0, 0);
6034 compound_p = true;
6037 /* Integer multiplies or FMAs have zero/sign extending variants. */
6038 if ((GET_CODE (op0) == ZERO_EXTEND
6039 && GET_CODE (op1) == ZERO_EXTEND)
6040 || (GET_CODE (op0) == SIGN_EXTEND
6041 && GET_CODE (op1) == SIGN_EXTEND))
6043 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6044 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6046 if (speed)
6048 if (compound_p)
6049 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6050 cost += extra_cost->mult[0].extend_add;
6051 else
6052 /* MUL/SMULL/UMULL. */
6053 cost += extra_cost->mult[0].extend;
6056 return cost;
6059 /* This is either an integer multiply or a MADD. In both cases
6060 we want to recurse and cost the operands. */
6061 cost += rtx_cost (op0, mode, MULT, 0, speed);
6062 cost += rtx_cost (op1, mode, MULT, 1, speed);
6064 if (speed)
6066 if (compound_p)
6067 /* MADD/MSUB. */
6068 cost += extra_cost->mult[mode == DImode].add;
6069 else
6070 /* MUL. */
6071 cost += extra_cost->mult[mode == DImode].simple;
6074 return cost;
6076 else
6078 if (speed)
6080 /* Floating-point FMA/FMUL can also support negations of the
6081 operands, unless the rounding mode is upward or downward in
6082 which case FNMUL is different from FMUL with operand negation. */
6083 bool neg0 = GET_CODE (op0) == NEG;
6084 bool neg1 = GET_CODE (op1) == NEG;
6085 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6087 if (neg0)
6088 op0 = XEXP (op0, 0);
6089 if (neg1)
6090 op1 = XEXP (op1, 0);
6093 if (compound_p)
6094 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6095 cost += extra_cost->fp[mode == DFmode].fma;
6096 else
6097 /* FMUL/FNMUL. */
6098 cost += extra_cost->fp[mode == DFmode].mult;
6101 cost += rtx_cost (op0, mode, MULT, 0, speed);
6102 cost += rtx_cost (op1, mode, MULT, 1, speed);
6103 return cost;
6107 static int
6108 aarch64_address_cost (rtx x,
6109 machine_mode mode,
6110 addr_space_t as ATTRIBUTE_UNUSED,
6111 bool speed)
6113 enum rtx_code c = GET_CODE (x);
6114 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6115 struct aarch64_address_info info;
6116 int cost = 0;
6117 info.shift = 0;
6119 if (!aarch64_classify_address (&info, x, mode, c, false))
6121 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6123 /* This is a CONST or SYMBOL ref which will be split
6124 in a different way depending on the code model in use.
6125 Cost it through the generic infrastructure. */
6126 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6127 /* Divide through by the cost of one instruction to
6128 bring it to the same units as the address costs. */
6129 cost_symbol_ref /= COSTS_N_INSNS (1);
6130 /* The cost is then the cost of preparing the address,
6131 followed by an immediate (possibly 0) offset. */
6132 return cost_symbol_ref + addr_cost->imm_offset;
6134 else
6136 /* This is most likely a jump table from a case
6137 statement. */
6138 return addr_cost->register_offset;
6142 switch (info.type)
6144 case ADDRESS_LO_SUM:
6145 case ADDRESS_SYMBOLIC:
6146 case ADDRESS_REG_IMM:
6147 cost += addr_cost->imm_offset;
6148 break;
6150 case ADDRESS_REG_WB:
6151 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6152 cost += addr_cost->pre_modify;
6153 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6154 cost += addr_cost->post_modify;
6155 else
6156 gcc_unreachable ();
6158 break;
6160 case ADDRESS_REG_REG:
6161 cost += addr_cost->register_offset;
6162 break;
6164 case ADDRESS_REG_SXTW:
6165 cost += addr_cost->register_sextend;
6166 break;
6168 case ADDRESS_REG_UXTW:
6169 cost += addr_cost->register_zextend;
6170 break;
6172 default:
6173 gcc_unreachable ();
6177 if (info.shift > 0)
6179 /* For the sake of calculating the cost of the shifted register
6180 component, we can treat same sized modes in the same way. */
6181 switch (GET_MODE_BITSIZE (mode))
6183 case 16:
6184 cost += addr_cost->addr_scale_costs.hi;
6185 break;
6187 case 32:
6188 cost += addr_cost->addr_scale_costs.si;
6189 break;
6191 case 64:
6192 cost += addr_cost->addr_scale_costs.di;
6193 break;
6195 /* We can't tell, or this is a 128-bit vector. */
6196 default:
6197 cost += addr_cost->addr_scale_costs.ti;
6198 break;
6202 return cost;
6205 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6206 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6207 to be taken. */
6210 aarch64_branch_cost (bool speed_p, bool predictable_p)
6212 /* When optimizing for speed, use the cost of unpredictable branches. */
6213 const struct cpu_branch_cost *branch_costs =
6214 aarch64_tune_params.branch_costs;
6216 if (!speed_p || predictable_p)
6217 return branch_costs->predictable;
6218 else
6219 return branch_costs->unpredictable;
6222 /* Return true if the RTX X in mode MODE is a zero or sign extract
6223 usable in an ADD or SUB (extended register) instruction. */
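/* For example, (zero_extend:DI (reg:SI w1)) matches the plain
   ADD/SUB (extended register) form, while the shifted variant appears
   as a ZERO_EXTRACT/SIGN_EXTRACT of a MULT by a power of two.  */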
6224 static bool
6225 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6227 /* Catch add with a sign extract.
6228 This is add_<optab><mode>_multp2. */
6229 if (GET_CODE (x) == SIGN_EXTRACT
6230 || GET_CODE (x) == ZERO_EXTRACT)
6232 rtx op0 = XEXP (x, 0);
6233 rtx op1 = XEXP (x, 1);
6234 rtx op2 = XEXP (x, 2);
6236 if (GET_CODE (op0) == MULT
6237 && CONST_INT_P (op1)
6238 && op2 == const0_rtx
6239 && CONST_INT_P (XEXP (op0, 1))
6240 && aarch64_is_extend_from_extract (mode,
6241 XEXP (op0, 1),
6242 op1))
6244 return true;
6247 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6248 No shift. */
6249 else if (GET_CODE (x) == SIGN_EXTEND
6250 || GET_CODE (x) == ZERO_EXTEND)
6251 return REG_P (XEXP (x, 0));
6253 return false;
6256 static bool
6257 aarch64_frint_unspec_p (unsigned int u)
6259 switch (u)
6261 case UNSPEC_FRINTZ:
6262 case UNSPEC_FRINTP:
6263 case UNSPEC_FRINTM:
6264 case UNSPEC_FRINTA:
6265 case UNSPEC_FRINTN:
6266 case UNSPEC_FRINTX:
6267 case UNSPEC_FRINTI:
6268 return true;
6270 default:
6271 return false;
6275 /* Return true iff X is an rtx that will match an extr instruction
6276 i.e. as described in the *extr<mode>5_insn family of patterns.
6277 OP0 and OP1 will be set to the operands of the shifts involved
6278 on success and will be NULL_RTX otherwise. */
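/* For example, in DImode
   (ior (ashift X (const_int 16)) (lshiftrt Y (const_int 48)))
   matches, since 16 + 48 == 64; *RES_OP0 is set to X and *RES_OP1 to Y.  */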
6280 static bool
6281 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6283 rtx op0, op1;
6284 machine_mode mode = GET_MODE (x);
6286 *res_op0 = NULL_RTX;
6287 *res_op1 = NULL_RTX;
6289 if (GET_CODE (x) != IOR)
6290 return false;
6292 op0 = XEXP (x, 0);
6293 op1 = XEXP (x, 1);
6295 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6296 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6298 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6299 if (GET_CODE (op1) == ASHIFT)
6300 std::swap (op0, op1);
6302 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6303 return false;
6305 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6306 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6308 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6309 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6311 *res_op0 = XEXP (op0, 0);
6312 *res_op1 = XEXP (op1, 0);
6313 return true;
6317 return false;
6320 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6321 storing it in *COST. Result is true if the total cost of the operation
6322 has now been calculated. */
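/* For example, a conditional branch such as
   (if_then_else (ne (reg X) (const_int 0)) (label_ref L) (pc))
   is recognized as a CBNZ and only the cost of the compared register
   is added.  */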
6323 static bool
6324 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6326 rtx inner;
6327 rtx comparator;
6328 enum rtx_code cmpcode;
6330 if (COMPARISON_P (op0))
6332 inner = XEXP (op0, 0);
6333 comparator = XEXP (op0, 1);
6334 cmpcode = GET_CODE (op0);
6336 else
6338 inner = op0;
6339 comparator = const0_rtx;
6340 cmpcode = NE;
6343 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6345 /* Conditional branch. */
6346 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6347 return true;
6348 else
6350 if (cmpcode == NE || cmpcode == EQ)
6352 if (comparator == const0_rtx)
6354 /* TBZ/TBNZ/CBZ/CBNZ. */
6355 if (GET_CODE (inner) == ZERO_EXTRACT)
6356 /* TBZ/TBNZ. */
6357 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6358 ZERO_EXTRACT, 0, speed);
6359 else
6360 /* CBZ/CBNZ. */
6361 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6363 return true;
6366 else if (cmpcode == LT || cmpcode == GE)
6368 /* TBZ/TBNZ. */
6369 if (comparator == const0_rtx)
6370 return true;
6374 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6376 /* CCMP. */
6377 if (GET_CODE (op1) == COMPARE)
6379 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6380 if (XEXP (op1, 1) == const0_rtx)
6381 *cost += 1;
6382 if (speed)
6384 machine_mode mode = GET_MODE (XEXP (op1, 0));
6385 const struct cpu_cost_table *extra_cost
6386 = aarch64_tune_params.insn_extra_cost;
6388 if (GET_MODE_CLASS (mode) == MODE_INT)
6389 *cost += extra_cost->alu.arith;
6390 else
6391 *cost += extra_cost->fp[mode == DFmode].compare;
6393 return true;
6396 /* It's a conditional operation based on the status flags,
6397 so it must be some flavor of CSEL. */
6399 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6400 if (GET_CODE (op1) == NEG
6401 || GET_CODE (op1) == NOT
6402 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6403 op1 = XEXP (op1, 0);
6404 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6406 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6407 op1 = XEXP (op1, 0);
6408 op2 = XEXP (op2, 0);
6411 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6412 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6413 return true;
6416 /* We don't know what this is, cost all operands. */
6417 return false;
6420 /* Check whether X is a bitfield operation of the form shift + extend that
6421 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6422 operand to which the bitfield operation is applied. Otherwise return
6423 NULL_RTX. */
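/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI X) (const_int 3)))
   maps to a UBFX and the function returns (reg:HI X).  */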
6425 static rtx
6426 aarch64_extend_bitfield_pattern_p (rtx x)
6428 rtx_code outer_code = GET_CODE (x);
6429 machine_mode outer_mode = GET_MODE (x);
6431 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6432 && outer_mode != SImode && outer_mode != DImode)
6433 return NULL_RTX;
6435 rtx inner = XEXP (x, 0);
6436 rtx_code inner_code = GET_CODE (inner);
6437 machine_mode inner_mode = GET_MODE (inner);
6438 rtx op = NULL_RTX;
6440 switch (inner_code)
6442 case ASHIFT:
6443 if (CONST_INT_P (XEXP (inner, 1))
6444 && (inner_mode == QImode || inner_mode == HImode))
6445 op = XEXP (inner, 0);
6446 break;
6447 case LSHIFTRT:
6448 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6449 && (inner_mode == QImode || inner_mode == HImode))
6450 op = XEXP (inner, 0);
6451 break;
6452 case ASHIFTRT:
6453 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6454 && (inner_mode == QImode || inner_mode == HImode))
6455 op = XEXP (inner, 0);
6456 break;
6457 default:
6458 break;
6461 return op;
6464 /* Return true if the mask and a shift amount from an RTX of the form
6465 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6466 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
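/* For example, in SImode a shift amount of 4 and a mask of 0xff0 are
   accepted: (0xff0 >> 4) + 1 is a power of two and no mask bits fall
   below the shift, so the combination forms a UBFIZ with lsb 4 and
   width 8.  */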
6468 bool
6469 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6471 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6472 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6473 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6474 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6477 /* Calculate the cost of calculating X, storing it in *COST. Result
6478 is true if the total cost of the operation has now been calculated. */
6479 static bool
6480 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6481 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6483 rtx op0, op1, op2;
6484 const struct cpu_cost_table *extra_cost
6485 = aarch64_tune_params.insn_extra_cost;
6486 int code = GET_CODE (x);
6488 /* By default, assume that everything has equivalent cost to the
6489 cheapest instruction. Any additional costs are applied as a delta
6490 above this default. */
6491 *cost = COSTS_N_INSNS (1);
6493 switch (code)
6495 case SET:
6496 /* The cost depends entirely on the operands to SET. */
6497 *cost = 0;
6498 op0 = SET_DEST (x);
6499 op1 = SET_SRC (x);
6501 switch (GET_CODE (op0))
6503 case MEM:
6504 if (speed)
6506 rtx address = XEXP (op0, 0);
6507 if (VECTOR_MODE_P (mode))
6508 *cost += extra_cost->ldst.storev;
6509 else if (GET_MODE_CLASS (mode) == MODE_INT)
6510 *cost += extra_cost->ldst.store;
6511 else if (mode == SFmode)
6512 *cost += extra_cost->ldst.storef;
6513 else if (mode == DFmode)
6514 *cost += extra_cost->ldst.stored;
6516 *cost +=
6517 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6518 0, speed));
6521 *cost += rtx_cost (op1, mode, SET, 1, speed);
6522 return true;
6524 case SUBREG:
6525 if (! REG_P (SUBREG_REG (op0)))
6526 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6528 /* Fall through. */
6529 case REG:
6530 /* The cost is one per vector-register copied. */
6531 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6533 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6534 / GET_MODE_SIZE (V4SImode);
6535 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6537 /* const0_rtx is in general free, but we will use an
6538 instruction to set a register to 0. */
6539 else if (REG_P (op1) || op1 == const0_rtx)
6541 /* The cost is 1 per register copied. */
6542 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6543 / UNITS_PER_WORD;
6544 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6546 else
6547 /* Cost is just the cost of the RHS of the set. */
6548 *cost += rtx_cost (op1, mode, SET, 1, speed);
6549 return true;
6551 case ZERO_EXTRACT:
6552 case SIGN_EXTRACT:
6553 /* Bit-field insertion. Strip any redundant widening of
6554 the RHS to meet the width of the target. */
6555 if (GET_CODE (op1) == SUBREG)
6556 op1 = SUBREG_REG (op1);
6557 if ((GET_CODE (op1) == ZERO_EXTEND
6558 || GET_CODE (op1) == SIGN_EXTEND)
6559 && CONST_INT_P (XEXP (op0, 1))
6560 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6561 >= INTVAL (XEXP (op0, 1))))
6562 op1 = XEXP (op1, 0);
6564 if (CONST_INT_P (op1))
6566 /* MOV immediate is assumed to always be cheap. */
6567 *cost = COSTS_N_INSNS (1);
6569 else
6571 /* BFM. */
6572 if (speed)
6573 *cost += extra_cost->alu.bfi;
6574 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6577 return true;
6579 default:
6580 /* We can't make sense of this, assume default cost. */
6581 *cost = COSTS_N_INSNS (1);
6582 return false;
6584 return false;
6586 case CONST_INT:
6587 /* If an instruction can incorporate a constant within the
6588 instruction, the instruction's expression avoids calling
6589 rtx_cost() on the constant. If rtx_cost() is called on a
6590 constant, then it is usually because the constant must be
6591 moved into a register by one or more instructions.
6593 The exception is constant 0, which can be expressed
6594 as XZR/WZR and is therefore free. The exception to this is
6595 if we have (set (reg) (const0_rtx)) in which case we must cost
6596 the move. However, we can catch that when we cost the SET, so
6597 we don't need to consider that here. */
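/* As an illustrative data point (an assumption about the typical
   expansion, not taken from the code below): a 64-bit constant such as
   0x1234567890abcdef generally needs a MOVZ plus three MOVKs, so it is
   costed as four instructions, whereas a small immediate costs one.  */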
6598 if (x == const0_rtx)
6599 *cost = 0;
6600 else
6602 /* To an approximation, building any other constant is
6603 proportionally expensive to the number of instructions
6604 required to build that constant. This is true whether we
6605 are compiling for SPEED or otherwise. */
6606 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6607 (NULL_RTX, x, false, mode));
6609 return true;
6611 case CONST_DOUBLE:
6612 if (speed)
6614 /* mov[df,sf]_aarch64. */
6615 if (aarch64_float_const_representable_p (x))
6616 /* FMOV (scalar immediate). */
6617 *cost += extra_cost->fp[mode == DFmode].fpconst;
6618 else if (!aarch64_float_const_zero_rtx_p (x))
6620 /* This will be a load from memory. */
6621 if (mode == DFmode)
6622 *cost += extra_cost->ldst.loadd;
6623 else
6624 *cost += extra_cost->ldst.loadf;
6626 else
6627 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6628 or MOV v0.s[0], wzr - neither of which is modeled by the
6629 cost tables. Just use the default cost. */
6634 return true;
6636 case MEM:
6637 if (speed)
6639 /* For loads we want the base cost of a load, plus an
6640 approximation for the additional cost of the addressing
6641 mode. */
6642 rtx address = XEXP (x, 0);
6643 if (VECTOR_MODE_P (mode))
6644 *cost += extra_cost->ldst.loadv;
6645 else if (GET_MODE_CLASS (mode) == MODE_INT)
6646 *cost += extra_cost->ldst.load;
6647 else if (mode == SFmode)
6648 *cost += extra_cost->ldst.loadf;
6649 else if (mode == DFmode)
6650 *cost += extra_cost->ldst.loadd;
6652 *cost +=
6653 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6654 0, speed));
6657 return true;
6659 case NEG:
6660 op0 = XEXP (x, 0);
6662 if (VECTOR_MODE_P (mode))
6664 if (speed)
6666 /* FNEG. */
6667 *cost += extra_cost->vect.alu;
6669 return false;
6672 if (GET_MODE_CLASS (mode) == MODE_INT)
6674 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6675 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6677 /* CSETM. */
6678 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6679 return true;
6682 /* Cost this as SUB wzr, X. */
6683 op0 = CONST0_RTX (mode);
6684 op1 = XEXP (x, 0);
6685 goto cost_minus;
6688 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6690 /* Support (neg(fma...)) as a single instruction only if
6691 sign of zeros is unimportant. This matches the decision
6692 making in aarch64.md. */
6693 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6695 /* FNMADD. */
6696 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6697 return true;
6699 if (GET_CODE (op0) == MULT)
6701 /* FNMUL. */
6702 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6703 return true;
6705 if (speed)
6706 /* FNEG. */
6707 *cost += extra_cost->fp[mode == DFmode].neg;
6708 return false;
6711 return false;
6713 case CLRSB:
6714 case CLZ:
6715 if (speed)
6717 if (VECTOR_MODE_P (mode))
6718 *cost += extra_cost->vect.alu;
6719 else
6720 *cost += extra_cost->alu.clz;
6723 return false;
6725 case COMPARE:
6726 op0 = XEXP (x, 0);
6727 op1 = XEXP (x, 1);
6729 if (op1 == const0_rtx
6730 && GET_CODE (op0) == AND)
6732 x = op0;
6733 mode = GET_MODE (op0);
6734 goto cost_logic;
6737 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6739 /* TODO: A write to the CC flags possibly costs extra, this
6740 needs encoding in the cost tables. */
6742 mode = GET_MODE (op0);
6743 /* ANDS. */
6744 if (GET_CODE (op0) == AND)
6746 x = op0;
6747 goto cost_logic;
6750 if (GET_CODE (op0) == PLUS)
6752 /* ADDS (and CMN alias). */
6753 x = op0;
6754 goto cost_plus;
6757 if (GET_CODE (op0) == MINUS)
6759 /* SUBS. */
6760 x = op0;
6761 goto cost_minus;
6764 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6765 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6766 && CONST_INT_P (XEXP (op0, 2)))
6768 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6769 Handle it here directly rather than going to cost_logic
6770 since we know the immediate generated for the TST is valid
6771 so we can avoid creating an intermediate rtx for it only
6772 for costing purposes. */
6773 if (speed)
6774 *cost += extra_cost->alu.logical;
6776 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6777 ZERO_EXTRACT, 0, speed);
6778 return true;
6781 if (GET_CODE (op1) == NEG)
6783 /* CMN. */
6784 if (speed)
6785 *cost += extra_cost->alu.arith;
6787 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6788 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6789 return true;
6792 /* CMP.
6794 Compare can freely swap the order of operands, and
6795 canonicalization puts the more complex operation first.
6796 But the integer MINUS logic expects the shift/extend
6797 operation in op1. */
6798 if (! (REG_P (op0)
6799 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6801 op0 = XEXP (x, 1);
6802 op1 = XEXP (x, 0);
6804 goto cost_minus;
6807 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6809 /* FCMP. */
6810 if (speed)
6811 *cost += extra_cost->fp[mode == DFmode].compare;
6813 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6815 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6816 /* FCMP supports constant 0.0 for no extra cost. */
6817 return true;
6819 return false;
6822 if (VECTOR_MODE_P (mode))
6824 /* Vector compare. */
6825 if (speed)
6826 *cost += extra_cost->vect.alu;
6828 if (aarch64_float_const_zero_rtx_p (op1))
6830 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6831 cost. */
6832 return true;
6834 return false;
6836 return false;
6838 case MINUS:
6840 op0 = XEXP (x, 0);
6841 op1 = XEXP (x, 1);
6843 cost_minus:
6844 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6846 /* Detect valid immediates. */
6847 if ((GET_MODE_CLASS (mode) == MODE_INT
6848 || (GET_MODE_CLASS (mode) == MODE_CC
6849 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6850 && CONST_INT_P (op1)
6851 && aarch64_uimm12_shift (INTVAL (op1)))
6853 if (speed)
6854 /* SUB(S) (immediate). */
6855 *cost += extra_cost->alu.arith;
6856 return true;
6859 /* Look for SUB (extended register). */
6860 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6862 if (speed)
6863 *cost += extra_cost->alu.extend_arith;
6865 op1 = aarch64_strip_extend (op1);
6866 *cost += rtx_cost (op1, VOIDmode,
6867 (enum rtx_code) GET_CODE (op1), 0, speed);
6868 return true;
6871 rtx new_op1 = aarch64_strip_extend (op1);
6873 /* Cost this as an FMA-alike operation. */
6874 if ((GET_CODE (new_op1) == MULT
6875 || aarch64_shift_p (GET_CODE (new_op1)))
6876 && code != COMPARE)
6878 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6879 (enum rtx_code) code,
6880 speed);
6881 return true;
6884 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6886 if (speed)
6888 if (VECTOR_MODE_P (mode))
6890 /* Vector SUB. */
6891 *cost += extra_cost->vect.alu;
6893 else if (GET_MODE_CLASS (mode) == MODE_INT)
6895 /* SUB(S). */
6896 *cost += extra_cost->alu.arith;
6898 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6900 /* FSUB. */
6901 *cost += extra_cost->fp[mode == DFmode].addsub;
6904 return true;
6907 case PLUS:
6909 rtx new_op0;
6911 op0 = XEXP (x, 0);
6912 op1 = XEXP (x, 1);
6914 cost_plus:
6915 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6916 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6918 /* CSINC. */
6919 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6920 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6921 return true;
6924 if (GET_MODE_CLASS (mode) == MODE_INT
6925 && CONST_INT_P (op1)
6926 && aarch64_uimm12_shift (INTVAL (op1)))
6928 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6930 if (speed)
6931 /* ADD (immediate). */
6932 *cost += extra_cost->alu.arith;
6933 return true;
6936 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6938 /* Look for ADD (extended register). */
6939 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6941 if (speed)
6942 *cost += extra_cost->alu.extend_arith;
6944 op0 = aarch64_strip_extend (op0);
6945 *cost += rtx_cost (op0, VOIDmode,
6946 (enum rtx_code) GET_CODE (op0), 0, speed);
6947 return true;
6950 /* Strip any extend, leave shifts behind as we will
6951 cost them through mult_cost. */
6952 new_op0 = aarch64_strip_extend (op0);
6954 if (GET_CODE (new_op0) == MULT
6955 || aarch64_shift_p (GET_CODE (new_op0)))
6957 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6958 speed);
6959 return true;
6962 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6964 if (speed)
6966 if (VECTOR_MODE_P (mode))
6968 /* Vector ADD. */
6969 *cost += extra_cost->vect.alu;
6971 else if (GET_MODE_CLASS (mode) == MODE_INT)
6973 /* ADD. */
6974 *cost += extra_cost->alu.arith;
6976 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6978 /* FADD. */
6979 *cost += extra_cost->fp[mode == DFmode].addsub;
6982 return true;
6985 case BSWAP:
6986 *cost = COSTS_N_INSNS (1);
6988 if (speed)
6990 if (VECTOR_MODE_P (mode))
6991 *cost += extra_cost->vect.alu;
6992 else
6993 *cost += extra_cost->alu.rev;
6995 return false;
6997 case IOR:
6998 if (aarch_rev16_p (x))
7000 *cost = COSTS_N_INSNS (1);
7002 if (speed)
7004 if (VECTOR_MODE_P (mode))
7005 *cost += extra_cost->vect.alu;
7006 else
7007 *cost += extra_cost->alu.rev;
7009 return true;
7012 if (aarch64_extr_rtx_p (x, &op0, &op1))
7014 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7015 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7016 if (speed)
7017 *cost += extra_cost->alu.shift;
7019 return true;
7021 /* Fall through. */
7022 case XOR:
7023 case AND:
7024 cost_logic:
7025 op0 = XEXP (x, 0);
7026 op1 = XEXP (x, 1);
7028 if (VECTOR_MODE_P (mode))
7030 if (speed)
7031 *cost += extra_cost->vect.alu;
7032 return true;
7035 if (code == AND
7036 && GET_CODE (op0) == MULT
7037 && CONST_INT_P (XEXP (op0, 1))
7038 && CONST_INT_P (op1)
7039 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7040 INTVAL (op1)) != 0)
7042 /* This is a UBFM/SBFM. */
7043 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7044 if (speed)
7045 *cost += extra_cost->alu.bfx;
7046 return true;
7049 if (GET_MODE_CLASS (mode) == MODE_INT)
7051 if (CONST_INT_P (op1))
7053 /* We have a mask + shift version of a UBFIZ
7054 i.e. the *andim_ashift<mode>_bfiz pattern. */
7055 if (GET_CODE (op0) == ASHIFT
7056 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7057 XEXP (op0, 1)))
7059 *cost += rtx_cost (XEXP (op0, 0), mode,
7060 (enum rtx_code) code, 0, speed);
7061 if (speed)
7062 *cost += extra_cost->alu.bfx;
7064 return true;
7066 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7068 /* We may get the immediate for free; this is not
7069 modelled. */
7070 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7071 if (speed)
7072 *cost += extra_cost->alu.logical;
7074 return true;
7077 else
7079 rtx new_op0 = op0;
7081 /* Handle ORN, EON, or BIC. */
7082 if (GET_CODE (op0) == NOT)
7083 op0 = XEXP (op0, 0);
7085 new_op0 = aarch64_strip_shift (op0);
7087 /* If we had a shift on op0 then this is a logical-shift-
7088 by-register/immediate operation. Otherwise, this is just
7089 a logical operation. */
7090 if (speed)
7092 if (new_op0 != op0)
7094 /* Shift by immediate. */
7095 if (CONST_INT_P (XEXP (op0, 1)))
7096 *cost += extra_cost->alu.log_shift;
7097 else
7098 *cost += extra_cost->alu.log_shift_reg;
7100 else
7101 *cost += extra_cost->alu.logical;
7104 /* In both cases we want to cost both operands. */
7105 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7106 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7108 return true;
7111 return false;
7113 case NOT:
7114 x = XEXP (x, 0);
7115 op0 = aarch64_strip_shift (x);
7117 if (VECTOR_MODE_P (mode))
7119 /* Vector NOT. */
7120 *cost += extra_cost->vect.alu;
7121 return false;
7124 /* MVN-shifted-reg. */
7125 if (op0 != x)
7127 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7129 if (speed)
7130 *cost += extra_cost->alu.log_shift;
7132 return true;
7134 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7135 Handle the second form here taking care that 'a' in the above can
7136 be a shift. */
7137 else if (GET_CODE (op0) == XOR)
7139 rtx newop0 = XEXP (op0, 0);
7140 rtx newop1 = XEXP (op0, 1);
7141 rtx op0_stripped = aarch64_strip_shift (newop0);
7143 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7144 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7146 if (speed)
7148 if (op0_stripped != newop0)
7149 *cost += extra_cost->alu.log_shift;
7150 else
7151 *cost += extra_cost->alu.logical;
7154 return true;
7156 /* MVN. */
7157 if (speed)
7158 *cost += extra_cost->alu.logical;
7160 return false;
7162 case ZERO_EXTEND:
7164 op0 = XEXP (x, 0);
7165 /* If a value is written in SI mode, then zero extended to DI
7166 mode, the operation will in general be free as a write to
7167 a 'w' register implicitly zeroes the upper bits of an 'x'
7168 register. However, if this is
7170 (set (reg) (zero_extend (reg)))
7172 we must cost the explicit register move. */
7173 if (mode == DImode
7174 && GET_MODE (op0) == SImode
7175 && outer == SET)
7177 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7179 /* If OP_COST is non-zero, then the cost of the zero extend
7180 is effectively the cost of the inner operation. Otherwise
7181 we have a MOV instruction and we take the cost from the MOV
7182 itself. This is true independently of whether we are
7183 optimizing for space or time. */
7184 if (op_cost)
7185 *cost = op_cost;
7187 return true;
7189 else if (MEM_P (op0))
7191 /* All loads can zero extend to any size for free. */
7192 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7193 return true;
7196 op0 = aarch64_extend_bitfield_pattern_p (x);
7197 if (op0)
7199 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7200 if (speed)
7201 *cost += extra_cost->alu.bfx;
7202 return true;
7205 if (speed)
7207 if (VECTOR_MODE_P (mode))
7209 /* UMOV. */
7210 *cost += extra_cost->vect.alu;
7212 else
7214 /* We generate an AND instead of UXTB/UXTH. */
7215 *cost += extra_cost->alu.logical;
7218 return false;
7220 case SIGN_EXTEND:
7221 if (MEM_P (XEXP (x, 0)))
7223 /* LDRSH. */
7224 if (speed)
7226 rtx address = XEXP (XEXP (x, 0), 0);
7227 *cost += extra_cost->ldst.load_sign_extend;
7229 *cost +=
7230 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7231 0, speed));
7233 return true;
7236 op0 = aarch64_extend_bitfield_pattern_p (x);
7237 if (op0)
7239 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7240 if (speed)
7241 *cost += extra_cost->alu.bfx;
7242 return true;
7245 if (speed)
7247 if (VECTOR_MODE_P (mode))
7248 *cost += extra_cost->vect.alu;
7249 else
7250 *cost += extra_cost->alu.extend;
7252 return false;
7254 case ASHIFT:
7255 op0 = XEXP (x, 0);
7256 op1 = XEXP (x, 1);
7258 if (CONST_INT_P (op1))
7260 if (speed)
7262 if (VECTOR_MODE_P (mode))
7264 /* Vector shift (immediate). */
7265 *cost += extra_cost->vect.alu;
7267 else
7269 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7270 aliases. */
7271 *cost += extra_cost->alu.shift;
7275 /* We can incorporate zero/sign extend for free. */
7276 if (GET_CODE (op0) == ZERO_EXTEND
7277 || GET_CODE (op0) == SIGN_EXTEND)
7278 op0 = XEXP (op0, 0);
7280 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7281 return true;
7283 else
7285 if (speed)
7287 if (VECTOR_MODE_P (mode))
7289 /* Vector shift (register). */
7290 *cost += extra_cost->vect.alu;
7292 else
7294 /* LSLV. */
7295 *cost += extra_cost->alu.shift_reg;
7298 return false; /* All arguments need to be in registers. */
7301 case ROTATE:
7302 case ROTATERT:
7303 case LSHIFTRT:
7304 case ASHIFTRT:
7305 op0 = XEXP (x, 0);
7306 op1 = XEXP (x, 1);
7308 if (CONST_INT_P (op1))
7310 /* ASR (immediate) and friends. */
7311 if (speed)
7313 if (VECTOR_MODE_P (mode))
7314 *cost += extra_cost->vect.alu;
7315 else
7316 *cost += extra_cost->alu.shift;
7319 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7320 return true;
7322 else
7325 /* ASR (register) and friends. */
7326 if (speed)
7328 if (VECTOR_MODE_P (mode))
7329 *cost += extra_cost->vect.alu;
7330 else
7331 *cost += extra_cost->alu.shift_reg;
7333 return false; /* All arguments need to be in registers. */
7336 case SYMBOL_REF:
7338 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7339 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7341 /* LDR. */
7342 if (speed)
7343 *cost += extra_cost->ldst.load;
7345 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7346 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7348 /* ADRP, followed by ADD. */
7349 *cost += COSTS_N_INSNS (1);
7350 if (speed)
7351 *cost += 2 * extra_cost->alu.arith;
7353 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7354 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7356 /* ADR. */
7357 if (speed)
7358 *cost += extra_cost->alu.arith;
7361 if (flag_pic)
7363 /* One extra load instruction, after accessing the GOT. */
7364 *cost += COSTS_N_INSNS (1);
7365 if (speed)
7366 *cost += extra_cost->ldst.load;
7368 return true;
7370 case HIGH:
7371 case LO_SUM:
7372 /* ADRP/ADD (immediate). */
7373 if (speed)
7374 *cost += extra_cost->alu.arith;
7375 return true;
7377 case ZERO_EXTRACT:
7378 case SIGN_EXTRACT:
7379 /* UBFX/SBFX. */
7380 if (speed)
7382 if (VECTOR_MODE_P (mode))
7383 *cost += extra_cost->vect.alu;
7384 else
7385 *cost += extra_cost->alu.bfx;
7388 /* We can trust that the immediates used will be correct (there
7389 are no by-register forms), so we need only cost op0. */
7390 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7391 return true;
7393 case MULT:
7394 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7395 /* aarch64_rtx_mult_cost always handles recursion to its
7396 operands. */
7397 return true;
7399 case MOD:
7400 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7401 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7402 an unconditional negate. This case should only ever be reached through
7403 the set_smod_pow2_cheap check in expmed.c. */
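/* Roughly (an illustrative sketch of the expansion, not a quote of the
   aarch64.md pattern), x % 4 becomes:
     negs  w1, w0
     and   w0, w0, 3
     and   w1, w1, 3
     csneg w0, w0, w1, mi
   hence the four-instruction baseline below.  */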
7404 if (CONST_INT_P (XEXP (x, 1))
7405 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7406 && (mode == SImode || mode == DImode))
7408 /* We expand to 4 instructions. Reset the baseline. */
7409 *cost = COSTS_N_INSNS (4);
7411 if (speed)
7412 *cost += 2 * extra_cost->alu.logical
7413 + 2 * extra_cost->alu.arith;
7415 return true;
7418 /* Fall-through. */
7419 case UMOD:
7420 if (speed)
7422 if (VECTOR_MODE_P (mode))
7423 *cost += extra_cost->vect.alu;
7424 else if (GET_MODE_CLASS (mode) == MODE_INT)
7425 *cost += (extra_cost->mult[mode == DImode].add
7426 + extra_cost->mult[mode == DImode].idiv);
7427 else if (mode == DFmode)
7428 *cost += (extra_cost->fp[1].mult
7429 + extra_cost->fp[1].div);
7430 else if (mode == SFmode)
7431 *cost += (extra_cost->fp[0].mult
7432 + extra_cost->fp[0].div);
7434 return false; /* All arguments need to be in registers. */
7436 case DIV:
7437 case UDIV:
7438 case SQRT:
7439 if (speed)
7441 if (VECTOR_MODE_P (mode))
7442 *cost += extra_cost->vect.alu;
7443 else if (GET_MODE_CLASS (mode) == MODE_INT)
7444 /* There is no integer SQRT, so only DIV and UDIV can get
7445 here. */
7446 *cost += extra_cost->mult[mode == DImode].idiv;
7447 else
7448 *cost += extra_cost->fp[mode == DFmode].div;
7450 return false; /* All arguments need to be in registers. */
7452 case IF_THEN_ELSE:
7453 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7454 XEXP (x, 2), cost, speed);
7456 case EQ:
7457 case NE:
7458 case GT:
7459 case GTU:
7460 case LT:
7461 case LTU:
7462 case GE:
7463 case GEU:
7464 case LE:
7465 case LEU:
7467 return false; /* All arguments must be in registers. */
7469 case FMA:
7470 op0 = XEXP (x, 0);
7471 op1 = XEXP (x, 1);
7472 op2 = XEXP (x, 2);
7474 if (speed)
7476 if (VECTOR_MODE_P (mode))
7477 *cost += extra_cost->vect.alu;
7478 else
7479 *cost += extra_cost->fp[mode == DFmode].fma;
7482 /* FMSUB, FNMADD, and FNMSUB are free. */
7483 if (GET_CODE (op0) == NEG)
7484 op0 = XEXP (op0, 0);
7486 if (GET_CODE (op2) == NEG)
7487 op2 = XEXP (op2, 0);
7489 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7490 and the by-element operand as operand 0. */
7491 if (GET_CODE (op1) == NEG)
7492 op1 = XEXP (op1, 0);
7494 /* Catch vector-by-element operations. The by-element operand can
7495 either be (vec_duplicate (vec_select (x))) or just
7496 (vec_select (x)), depending on whether we are multiplying by
7497 a vector or a scalar.
7499 Canonicalization is not very good in these cases; FMA4 will put the
7500 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7501 if (GET_CODE (op0) == VEC_DUPLICATE)
7502 op0 = XEXP (op0, 0);
7503 else if (GET_CODE (op1) == VEC_DUPLICATE)
7504 op1 = XEXP (op1, 0);
7506 if (GET_CODE (op0) == VEC_SELECT)
7507 op0 = XEXP (op0, 0);
7508 else if (GET_CODE (op1) == VEC_SELECT)
7509 op1 = XEXP (op1, 0);
7511 /* If the remaining parameters are not registers,
7512 get the cost to put them into registers. */
7513 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7514 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7515 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7516 return true;
7518 case FLOAT:
7519 case UNSIGNED_FLOAT:
7520 if (speed)
7521 *cost += extra_cost->fp[mode == DFmode].fromint;
7522 return false;
7524 case FLOAT_EXTEND:
7525 if (speed)
7527 if (VECTOR_MODE_P (mode))
7529 /* Vector widen. */
7530 *cost += extra_cost->vect.alu;
7532 else
7533 *cost += extra_cost->fp[mode == DFmode].widen;
7535 return false;
7537 case FLOAT_TRUNCATE:
7538 if (speed)
7540 if (VECTOR_MODE_P (mode))
7542 /* Vector conversion. */
7543 *cost += extra_cost->vect.alu;
7545 else
7546 *cost += extra_cost->fp[mode == DFmode].narrow;
7548 return false;
7550 case FIX:
7551 case UNSIGNED_FIX:
7552 x = XEXP (x, 0);
7553 /* Strip the rounding part. They will all be implemented
7554 by the fcvt* family of instructions anyway. */
7555 if (GET_CODE (x) == UNSPEC)
7557 unsigned int uns_code = XINT (x, 1);
7559 if (uns_code == UNSPEC_FRINTA
7560 || uns_code == UNSPEC_FRINTM
7561 || uns_code == UNSPEC_FRINTN
7562 || uns_code == UNSPEC_FRINTP
7563 || uns_code == UNSPEC_FRINTZ)
7564 x = XVECEXP (x, 0, 0);
7567 if (speed)
7569 if (VECTOR_MODE_P (mode))
7570 *cost += extra_cost->vect.alu;
7571 else
7572 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7575 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7576 fixed-point fcvt. */
7577 if (GET_CODE (x) == MULT
7578 && ((VECTOR_MODE_P (mode)
7579 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7580 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7582 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7583 0, speed);
7584 return true;
7587 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7588 return true;
7590 case ABS:
7591 if (VECTOR_MODE_P (mode))
7593 /* ABS (vector). */
7594 if (speed)
7595 *cost += extra_cost->vect.alu;
7597 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7599 op0 = XEXP (x, 0);
7601 /* FABD, which is analogous to FADD. */
7602 if (GET_CODE (op0) == MINUS)
7604 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7605 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7606 if (speed)
7607 *cost += extra_cost->fp[mode == DFmode].addsub;
7609 return true;
7611 /* Simple FABS is analogous to FNEG. */
7612 if (speed)
7613 *cost += extra_cost->fp[mode == DFmode].neg;
7615 else
7617 /* Integer ABS will either be split into
7618 two arithmetic instructions, or will be an ABS
7619 (scalar), which we don't model. */
7620 *cost = COSTS_N_INSNS (2);
7621 if (speed)
7622 *cost += 2 * extra_cost->alu.arith;
7624 return false;
7626 case SMAX:
7627 case SMIN:
7628 if (speed)
7630 if (VECTOR_MODE_P (mode))
7631 *cost += extra_cost->vect.alu;
7632 else
7634 /* FMAXNM/FMINNM/FMAX/FMIN.
7635 TODO: This may not be accurate for all implementations, but
7636 we do not model this in the cost tables. */
7637 *cost += extra_cost->fp[mode == DFmode].addsub;
7640 return false;
7642 case UNSPEC:
7643 /* The floating point round to integer frint* instructions. */
7644 if (aarch64_frint_unspec_p (XINT (x, 1)))
7646 if (speed)
7647 *cost += extra_cost->fp[mode == DFmode].roundint;
7649 return false;
7652 if (XINT (x, 1) == UNSPEC_RBIT)
7654 if (speed)
7655 *cost += extra_cost->alu.rev;
7657 return false;
7659 break;
7661 case TRUNCATE:
7663 /* Decompose <su>muldi3_highpart. */
7664 if (/* (truncate:DI */
7665 mode == DImode
7666 /* (lshiftrt:TI */
7667 && GET_MODE (XEXP (x, 0)) == TImode
7668 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7669 /* (mult:TI */
7670 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7671 /* (ANY_EXTEND:TI (reg:DI))
7672 (ANY_EXTEND:TI (reg:DI))) */
7673 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7674 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7675 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7676 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7677 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7678 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7679 /* (const_int 64) */
7680 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7681 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7683 /* UMULH/SMULH. */
7684 if (speed)
7685 *cost += extra_cost->mult[mode == DImode].extend;
7686 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7687 mode, MULT, 0, speed);
7688 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7689 mode, MULT, 1, speed);
7690 return true;
7693 /* Fall through. */
7694 default:
7695 break;
7698 if (dump_file
7699 && flag_aarch64_verbose_cost)
7700 fprintf (dump_file,
7701 "\nFailed to cost RTX. Assuming default cost.\n");
7703 return true;
7706 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
7707 calculated for X. This cost is stored in *COST. Returns true
7708 if the total cost of X was calculated. */
7709 static bool
7710 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7711 int param, int *cost, bool speed)
7713 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7715 if (dump_file
7716 && flag_aarch64_verbose_cost)
7718 print_rtl_single (dump_file, x);
7719 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7720 speed ? "Hot" : "Cold",
7721 *cost, result ? "final" : "partial");
7724 return result;
7727 static int
7728 aarch64_register_move_cost (machine_mode mode,
7729 reg_class_t from_i, reg_class_t to_i)
7731 enum reg_class from = (enum reg_class) from_i;
7732 enum reg_class to = (enum reg_class) to_i;
7733 const struct cpu_regmove_cost *regmove_cost
7734 = aarch64_tune_params.regmove_cost;
7736 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7737 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7738 to = GENERAL_REGS;
7740 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7741 from = GENERAL_REGS;
7743 /* Moving between GPR and stack cost is the same as GP2GP. */
7744 if ((from == GENERAL_REGS && to == STACK_REG)
7745 || (to == GENERAL_REGS && from == STACK_REG))
7746 return regmove_cost->GP2GP;
7748 /* To/From the stack register, we move via the gprs. */
7749 if (to == STACK_REG || from == STACK_REG)
7750 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7751 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7753 if (GET_MODE_SIZE (mode) == 16)
7755 /* 128-bit operations on general registers require 2 instructions. */
7756 if (from == GENERAL_REGS && to == GENERAL_REGS)
7757 return regmove_cost->GP2GP * 2;
7758 else if (from == GENERAL_REGS)
7759 return regmove_cost->GP2FP * 2;
7760 else if (to == GENERAL_REGS)
7761 return regmove_cost->FP2GP * 2;
7763 /* When AdvSIMD instructions are disabled it is not possible to move
7764 a 128-bit value directly between Q registers. This is handled in
7765 secondary reload. A general register is used as a scratch to move
7766 the upper DI value and the lower DI value is moved directly,
7767 hence the cost is the sum of three moves. */
7768 if (! TARGET_SIMD)
7769 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7771 return regmove_cost->FP2FP;
7774 if (from == GENERAL_REGS && to == GENERAL_REGS)
7775 return regmove_cost->GP2GP;
7776 else if (from == GENERAL_REGS)
7777 return regmove_cost->GP2FP;
7778 else if (to == GENERAL_REGS)
7779 return regmove_cost->FP2GP;
7781 return regmove_cost->FP2FP;
7784 static int
7785 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7786 reg_class_t rclass ATTRIBUTE_UNUSED,
7787 bool in ATTRIBUTE_UNUSED)
7789 return aarch64_tune_params.memmov_cost;
7792 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7793 to optimize 1.0/sqrt. */
7795 static bool
7796 use_rsqrt_p (machine_mode mode)
7798 return (!flag_trapping_math
7799 && flag_unsafe_math_optimizations
7800 && ((aarch64_tune_params.approx_modes->recip_sqrt
7801 & AARCH64_APPROX_MODE (mode))
7802 || flag_mrecip_low_precision_sqrt));
7805 /* Function to decide when to use the approximate reciprocal square root
7806 builtin. */
7808 static tree
7809 aarch64_builtin_reciprocal (tree fndecl)
7811 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7813 if (!use_rsqrt_p (mode))
7814 return NULL_TREE;
7815 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7818 typedef rtx (*rsqrte_type) (rtx, rtx);
7820 /* Select reciprocal square root initial estimate insn depending on machine
7821 mode. */
7823 static rsqrte_type
7824 get_rsqrte_type (machine_mode mode)
7826 switch (mode)
7828 case DFmode: return gen_aarch64_rsqrtedf;
7829 case SFmode: return gen_aarch64_rsqrtesf;
7830 case V2DFmode: return gen_aarch64_rsqrtev2df;
7831 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7832 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7833 default: gcc_unreachable ();
7837 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7839 /* Select reciprocal square root series step insn depending on machine mode. */
7841 static rsqrts_type
7842 get_rsqrts_type (machine_mode mode)
7844 switch (mode)
7846 case DFmode: return gen_aarch64_rsqrtsdf;
7847 case SFmode: return gen_aarch64_rsqrtssf;
7848 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7849 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7850 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7851 default: gcc_unreachable ();
7855 /* Emit instruction sequence to compute either the approximate square root
7856 or its approximate reciprocal, depending on the flag RECP, and return
7857 whether the sequence was emitted or not. */
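/* The loop below performs the Newton-Raphson step
     x_{n+1} = x_n * (3 - d * x_n * x_n) / 2
   for 1/sqrt(d), where FRSQRTS supplies the (3 - a * b) / 2 part.  This is
   a summary of the code that follows, assuming the usual semantics of the
   FRSQRTE/FRSQRTS instructions.  */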
7859 bool
7860 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7862 machine_mode mode = GET_MODE (dst);
7864 if (GET_MODE_INNER (mode) == HFmode)
7865 return false;
7867 machine_mode mmsk = mode_for_vector
7868 (int_mode_for_mode (GET_MODE_INNER (mode)),
7869 GET_MODE_NUNITS (mode));
7870 bool use_approx_sqrt_p = (!recp
7871 && (flag_mlow_precision_sqrt
7872 || (aarch64_tune_params.approx_modes->sqrt
7873 & AARCH64_APPROX_MODE (mode))));
7874 bool use_approx_rsqrt_p = (recp
7875 && (flag_mrecip_low_precision_sqrt
7876 || (aarch64_tune_params.approx_modes->recip_sqrt
7877 & AARCH64_APPROX_MODE (mode))));
7879 if (!flag_finite_math_only
7880 || flag_trapping_math
7881 || !flag_unsafe_math_optimizations
7882 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7883 || optimize_function_for_size_p (cfun))
7884 return false;
7886 rtx xmsk = gen_reg_rtx (mmsk);
7887 if (!recp)
7888 /* When calculating the approximate square root, compare the argument with
7889 0.0 and create a mask. */
7890 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7891 CONST0_RTX (mode)))));
7893 /* Estimate the approximate reciprocal square root. */
7894 rtx xdst = gen_reg_rtx (mode);
7895 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7897 /* Iterate over the series twice for SF and thrice for DF. */
7898 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7900 /* Optionally iterate over the series one time fewer for faster performance,
7901 at the expense of accuracy. */
7902 if ((recp && flag_mrecip_low_precision_sqrt)
7903 || (!recp && flag_mlow_precision_sqrt))
7904 iterations--;
7906 /* Iterate over the series to calculate the approximate reciprocal square
7907 root. */
7908 rtx x1 = gen_reg_rtx (mode);
7909 while (iterations--)
7911 rtx x2 = gen_reg_rtx (mode);
7912 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7914 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7916 if (iterations > 0)
7917 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7920 if (!recp)
7922 /* Qualify the approximate reciprocal square root when the argument is
7923 0.0 by squashing the intermediate result to 0.0. */
7924 rtx xtmp = gen_reg_rtx (mmsk);
7925 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7926 gen_rtx_SUBREG (mmsk, xdst, 0)));
7927 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7929 /* Calculate the approximate square root. */
7930 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7933 /* Finalize the approximation. */
7934 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7936 return true;
7939 typedef rtx (*recpe_type) (rtx, rtx);
7941 /* Select reciprocal initial estimate insn depending on machine mode. */
7943 static recpe_type
7944 get_recpe_type (machine_mode mode)
7946 switch (mode)
7948 case SFmode: return (gen_aarch64_frecpesf);
7949 case V2SFmode: return (gen_aarch64_frecpev2sf);
7950 case V4SFmode: return (gen_aarch64_frecpev4sf);
7951 case DFmode: return (gen_aarch64_frecpedf);
7952 case V2DFmode: return (gen_aarch64_frecpev2df);
7953 default: gcc_unreachable ();
7957 typedef rtx (*recps_type) (rtx, rtx, rtx);
7959 /* Select reciprocal series step insn depending on machine mode. */
7961 static recps_type
7962 get_recps_type (machine_mode mode)
7964 switch (mode)
7966 case SFmode: return (gen_aarch64_frecpssf);
7967 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7968 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7969 case DFmode: return (gen_aarch64_frecpsdf);
7970 case V2DFmode: return (gen_aarch64_frecpsv2df);
7971 default: gcc_unreachable ();
7975 /* Emit the instruction sequence to compute the approximation for the division
7976 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
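/* The reciprocal is refined with the Newton-Raphson step
     x_{n+1} = x_n * (2 - d * x_n)
   where FRECPS supplies the (2 - a * b) part; QUO is then NUM times the
   refined reciprocal.  A summary of the code below, assuming the usual
   FRECPE/FRECPS semantics.  */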
7978 bool
7979 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7981 machine_mode mode = GET_MODE (quo);
7983 if (GET_MODE_INNER (mode) == HFmode)
7984 return false;
7986 bool use_approx_division_p = (flag_mlow_precision_div
7987 || (aarch64_tune_params.approx_modes->division
7988 & AARCH64_APPROX_MODE (mode)));
7990 if (!flag_finite_math_only
7991 || flag_trapping_math
7992 || !flag_unsafe_math_optimizations
7993 || optimize_function_for_size_p (cfun)
7994 || !use_approx_division_p)
7995 return false;
7997 /* Estimate the approximate reciprocal. */
7998 rtx xrcp = gen_reg_rtx (mode);
7999 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8001 /* Iterate over the series twice for SF and thrice for DF. */
8002 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8004 /* Optionally iterate over the series one time fewer for faster performance,
8005 at the expense of accuracy. */
8006 if (flag_mlow_precision_div)
8007 iterations--;
8009 /* Iterate over the series to calculate the approximate reciprocal. */
8010 rtx xtmp = gen_reg_rtx (mode);
8011 while (iterations--)
8013 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8015 if (iterations > 0)
8016 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8019 if (num != CONST1_RTX (mode))
8021 /* As the approximate reciprocal of DEN is already calculated, only
8022 calculate the approximate division when NUM is not 1.0. */
8023 rtx xnum = force_reg (mode, num);
8024 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8027 /* Finalize the approximation. */
8028 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8029 return true;
8032 /* Return the number of instructions that can be issued per cycle. */
8033 static int
8034 aarch64_sched_issue_rate (void)
8036 return aarch64_tune_params.issue_rate;
8039 static int
8040 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8042 int issue_rate = aarch64_sched_issue_rate ();
8044 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8048 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8049 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8050 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8052 static int
8053 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8054 int ready_index)
8056 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8060 /* Vectorizer cost model target hooks. */
8062 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8063 static int
8064 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8065 tree vectype,
8066 int misalign ATTRIBUTE_UNUSED)
8068 unsigned elements;
8070 switch (type_of_cost)
8072 case scalar_stmt:
8073 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
8075 case scalar_load:
8076 return aarch64_tune_params.vec_costs->scalar_load_cost;
8078 case scalar_store:
8079 return aarch64_tune_params.vec_costs->scalar_store_cost;
8081 case vector_stmt:
8082 return aarch64_tune_params.vec_costs->vec_stmt_cost;
8084 case vector_load:
8085 return aarch64_tune_params.vec_costs->vec_align_load_cost;
8087 case vector_store:
8088 return aarch64_tune_params.vec_costs->vec_store_cost;
8090 case vec_to_scalar:
8091 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
8093 case scalar_to_vec:
8094 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
8096 case unaligned_load:
8097 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
8099 case unaligned_store:
8100 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
8102 case cond_branch_taken:
8103 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
8105 case cond_branch_not_taken:
8106 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
8108 case vec_perm:
8109 return aarch64_tune_params.vec_costs->vec_permute_cost;
8111 case vec_promote_demote:
8112 return aarch64_tune_params.vec_costs->vec_stmt_cost;
8114 case vec_construct:
8115 elements = TYPE_VECTOR_SUBPARTS (vectype);
8116 return elements / 2 + 1;
8118 default:
8119 gcc_unreachable ();
8123 /* Implement targetm.vectorize.add_stmt_cost. */
8124 static unsigned
8125 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8126 struct _stmt_vec_info *stmt_info, int misalign,
8127 enum vect_cost_model_location where)
8129 unsigned *cost = (unsigned *) data;
8130 unsigned retval = 0;
8132 if (flag_vect_cost_model)
8134 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8135 int stmt_cost =
8136 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8138 /* Statements in an inner loop relative to the loop being
8139 vectorized are weighted more heavily. The value here is
8140 arbitrary and could potentially be improved with analysis. */
8141 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8142 count *= 50; /* FIXME */
8144 retval = (unsigned) (count * stmt_cost);
8145 cost[where] += retval;
8148 return retval;
8151 static void initialize_aarch64_code_model (struct gcc_options *);
8153 /* Parse the TO_PARSE string and put the architecture struct that it
8154 selects into RES and the architectural features into ISA_FLAGS.
8155 Return an aarch64_parse_opt_result describing the parse result.
8156 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8158 static enum aarch64_parse_opt_result
8159 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8160 unsigned long *isa_flags)
8162 char *ext;
8163 const struct processor *arch;
8164 char *str = (char *) alloca (strlen (to_parse) + 1);
8165 size_t len;
8167 strcpy (str, to_parse);
8169 ext = strchr (str, '+');
8171 if (ext != NULL)
8172 len = ext - str;
8173 else
8174 len = strlen (str);
8176 if (len == 0)
8177 return AARCH64_PARSE_MISSING_ARG;
8180 /* Loop through the list of supported ARCHes to find a match. */
8181 for (arch = all_architectures; arch->name != NULL; arch++)
8183 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8185 unsigned long isa_temp = arch->flags;
8187 if (ext != NULL)
8189 /* TO_PARSE string contains at least one extension. */
8190 enum aarch64_parse_opt_result ext_res
8191 = aarch64_parse_extension (ext, &isa_temp);
8193 if (ext_res != AARCH64_PARSE_OK)
8194 return ext_res;
8196 /* Extension parsing was successful. Confirm the result
8197 arch and ISA flags. */
8198 *res = arch;
8199 *isa_flags = isa_temp;
8200 return AARCH64_PARSE_OK;
8204 /* ARCH name not found in list. */
8205 return AARCH64_PARSE_INVALID_ARG;
8208 /* Parse the TO_PARSE string and put the CPU that it selects into RES and
8209    its ISA flags into ISA_FLAGS.  Return an aarch64_parse_opt_result
8210    describing the parse result.  If there is an error parsing, RES and
8211    ISA_FLAGS are left unchanged.  */
8213 static enum aarch64_parse_opt_result
8214 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8215 unsigned long *isa_flags)
8217 char *ext;
8218 const struct processor *cpu;
8219 char *str = (char *) alloca (strlen (to_parse) + 1);
8220 size_t len;
8222 strcpy (str, to_parse);
8224 ext = strchr (str, '+');
8226 if (ext != NULL)
8227 len = ext - str;
8228 else
8229 len = strlen (str);
8231 if (len == 0)
8232 return AARCH64_PARSE_MISSING_ARG;
8235 /* Loop through the list of supported CPUs to find a match. */
8236 for (cpu = all_cores; cpu->name != NULL; cpu++)
8238 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8240 unsigned long isa_temp = cpu->flags;
8243 if (ext != NULL)
8245 /* TO_PARSE string contains at least one extension. */
8246 enum aarch64_parse_opt_result ext_res
8247 = aarch64_parse_extension (ext, &isa_temp);
8249 if (ext_res != AARCH64_PARSE_OK)
8250 return ext_res;
8252 /* Extension parsing was successful.  Confirm the result
8253 cpu and ISA flags. */
8254 *res = cpu;
8255 *isa_flags = isa_temp;
8256 return AARCH64_PARSE_OK;
8260 /* CPU name not found in list. */
8261 return AARCH64_PARSE_INVALID_ARG;
8264 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8265 Return an aarch64_parse_opt_result describing the parse result.
8266    If the parsing fails, RES does not change.  */
8268 static enum aarch64_parse_opt_result
8269 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8271 const struct processor *cpu;
8272 char *str = (char *) alloca (strlen (to_parse) + 1);
8274 strcpy (str, to_parse);
8276 /* Loop through the list of supported CPUs to find a match. */
8277 for (cpu = all_cores; cpu->name != NULL; cpu++)
8279 if (strcmp (cpu->name, str) == 0)
8281 *res = cpu;
8282 return AARCH64_PARSE_OK;
8286 /* CPU name not found in list. */
8287 return AARCH64_PARSE_INVALID_ARG;
8290 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8291 described in FLAG. If it is, return the index bit for that fusion type.
8292 If not, error (printing OPTION_NAME) and return zero. */
8294 static unsigned int
8295 aarch64_parse_one_option_token (const char *token,
8296 size_t length,
8297 const struct aarch64_flag_desc *flag,
8298 const char *option_name)
8300 for (; flag->name != NULL; flag++)
8302 if (length == strlen (flag->name)
8303 && !strncmp (flag->name, token, length))
8304 return flag->flag;
8307 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8308 return 0;
8311 /* Parse OPTION which is a comma-separated list of flags to enable.
8312 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8313 default state we inherit from the CPU tuning structures. OPTION_NAME
8314 gives the top-level option we are parsing in the -moverride string,
8315 for use in error messages. */
8317 static unsigned int
8318 aarch64_parse_boolean_options (const char *option,
8319 const struct aarch64_flag_desc *flags,
8320 unsigned int initial_state,
8321 const char *option_name)
8323 const char separator = '.';
8324 const char* specs = option;
8325 const char* ntoken = option;
8326 unsigned int found_flags = initial_state;
8328 while ((ntoken = strchr (specs, separator)))
8330 size_t token_length = ntoken - specs;
8331 unsigned token_ops = aarch64_parse_one_option_token (specs,
8332 token_length,
8333 flags,
8334 option_name);
8335 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8336 in the token stream, reset the supported operations. So:
8338 adrp+add.cmp+branch.none.adrp+add
8340 would have the result of turning on only adrp+add fusion. */
8341 if (!token_ops)
8342 found_flags = 0;
8344 found_flags |= token_ops;
8345 specs = ++ntoken;
8348   /* The string ended with a separator (or was empty), so it is ill-formed.  */
8349 if (!(*specs))
8351 error ("%s string ill-formed\n", option_name);
8352 return 0;
8355 /* We still have one more token to parse. */
8356 size_t token_length = strlen (specs);
8357 unsigned token_ops = aarch64_parse_one_option_token (specs,
8358 token_length,
8359 flags,
8360 option_name);
8361 if (!token_ops)
8362 found_flags = 0;
8364 found_flags |= token_ops;
8365 return found_flags;
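/* Illustrative sketch (editorial note, not part of GCC): the accumulate-and-
   reset behaviour of the '.'-separated flag lists parsed above, where a token
   whose value is 0 (such as "none") clears everything seen so far.  The flag
   names come from the example in the comment above; the bit values are
   hypothetical.  Compile separately.  */

#include <stdio.h>
#include <string.h>

struct flag_desc { const char *name; unsigned int flag; };

static const struct flag_desc fusible[] = {
  { "adrp+add",   1u << 0 },
  { "cmp+branch", 1u << 1 },
  { "none",       0 },
  { NULL,         0 }
};

static unsigned int
lookup_flag (const char *tok, size_t len)
{
  for (const struct flag_desc *f = fusible; f->name; f++)
    if (strlen (f->name) == len && strncmp (f->name, tok, len) == 0)
      return f->flag;
  return 0;	/* Unknown tokens also reset the set, as in the parser above.  */
}

static unsigned int
parse_boolean_options (const char *option, unsigned int initial)
{
  unsigned int found = initial;
  const char *p = option;
  for (;;)
    {
      const char *dot = strchr (p, '.');
      size_t len = dot ? (size_t) (dot - p) : strlen (p);
      unsigned int ops = lookup_flag (p, len);
      if (!ops)
	found = 0;
      found |= ops;
      if (!dot)
	return found;
      p = dot + 1;
    }
}

int
main (void)
{
  /* Prints 0x1: "none" wiped the earlier tokens, leaving only adrp+add.  */
  printf ("%#x\n", parse_boolean_options ("adrp+add.cmp+branch.none.adrp+add", 0));
  return 0;
}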
8368 /* Support for overriding instruction fusion. */
8370 static void
8371 aarch64_parse_fuse_string (const char *fuse_string,
8372 struct tune_params *tune)
8374 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8375 aarch64_fusible_pairs,
8376 tune->fusible_ops,
8377 "fuse=");
8380 /* Support for overriding other tuning flags. */
8382 static void
8383 aarch64_parse_tune_string (const char *tune_string,
8384 struct tune_params *tune)
8386 tune->extra_tuning_flags
8387 = aarch64_parse_boolean_options (tune_string,
8388 aarch64_tuning_flags,
8389 tune->extra_tuning_flags,
8390 "tune=");
8393 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8394    we understand.  If it is, extract the option string and hand it off to
8395    the appropriate function.  */
8397 void
8398 aarch64_parse_one_override_token (const char* token,
8399 size_t length,
8400 struct tune_params *tune)
8402 const struct aarch64_tuning_override_function *fn
8403 = aarch64_tuning_override_functions;
8405 const char *option_part = strchr (token, '=');
8406 if (!option_part)
8408 error ("tuning string missing in option (%s)", token);
8409 return;
8412 /* Get the length of the option name. */
8413 length = option_part - token;
8414 /* Skip the '=' to get to the option string. */
8415 option_part++;
8417 for (; fn->name != NULL; fn++)
8419 if (!strncmp (fn->name, token, length))
8421 fn->parse_override (option_part, tune);
8422 return;
8426 error ("unknown tuning option (%s)",token);
8427 return;
8430 /* Validate the TLS size, applying the default and clamping it to what the
     active code model allows.  */
8432 static void
8433 initialize_aarch64_tls_size (struct gcc_options *opts)
8435 if (aarch64_tls_size == 0)
8436 aarch64_tls_size = 24;
8438 switch (opts->x_aarch64_cmodel_var)
8440 case AARCH64_CMODEL_TINY:
8441     /* Both the default and the maximum TLS size allowed under tiny are 1M,
8442	which needs two instructions to address, so we clamp the size to 24 bits.  */
8443 if (aarch64_tls_size > 24)
8444 aarch64_tls_size = 24;
8445 break;
8446 case AARCH64_CMODEL_SMALL:
8447 /* The maximum TLS size allowed under small is 4G. */
8448 if (aarch64_tls_size > 32)
8449 aarch64_tls_size = 32;
8450 break;
8451 case AARCH64_CMODEL_LARGE:
8452     /* The maximum TLS size allowed under large is 16E (i.e. a full 64-bit
8453	offset).  FIXME: we currently only support 48-bit offsets.  */
8454 if (aarch64_tls_size > 48)
8455 aarch64_tls_size = 48;
8456 break;
8457 default:
8458 gcc_unreachable ();
8461 return;
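/* Illustrative sketch (editorial note, not part of GCC): the per-code-model
   clamping applied above -- at most 24 bits of TLS offset for tiny, 32 for
   small and 48 for large.  Compile separately.  */

#include <stdio.h>

enum cmodel { CMODEL_TINY, CMODEL_SMALL, CMODEL_LARGE };

static int
clamp_tls_size (enum cmodel model, int requested_bits)
{
  int max_bits = (model == CMODEL_TINY ? 24
		  : model == CMODEL_SMALL ? 32 : 48);
  return requested_bits > max_bits ? max_bits : requested_bits;
}

int
main (void)
{
  printf ("%d %d %d\n",
	  clamp_tls_size (CMODEL_TINY, 32),    /* -> 24 */
	  clamp_tls_size (CMODEL_SMALL, 48),   /* -> 32 */
	  clamp_tls_size (CMODEL_LARGE, 64));  /* -> 48 */
  return 0;
}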
8464 /* Parse STRING looking for options in the format:
8465 string :: option:string
8466 option :: name=substring
8467 name :: {a-z}
8468 substring :: defined by option. */
8470 static void
8471 aarch64_parse_override_string (const char* input_string,
8472 struct tune_params* tune)
8474 const char separator = ':';
8475 size_t string_length = strlen (input_string) + 1;
8476 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8477 char *string = string_root;
8478 strncpy (string, input_string, string_length);
8479 string[string_length - 1] = '\0';
8481 char* ntoken = string;
8483 while ((ntoken = strchr (string, separator)))
8485 size_t token_length = ntoken - string;
8486 /* Make this substring look like a string. */
8487 *ntoken = '\0';
8488 aarch64_parse_one_override_token (string, token_length, tune);
8489 string = ++ntoken;
8492 /* One last option to parse. */
8493 aarch64_parse_one_override_token (string, strlen (string), tune);
8494 free (string_root);
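/* Illustrative sketch (editorial note, not part of GCC): the
   "string :: option:string, option :: name=substring" grammar described
   above, as used by -moverride.  Here each pair is simply printed; the
   example values are placeholders.  Compile separately.  */

#include <stdio.h>
#include <string.h>

static void
handle_one_override (char *token)
{
  char *value = strchr (token, '=');
  if (!value)
    {
      fprintf (stderr, "tuning string missing in option (%s)\n", token);
      return;
    }
  *value++ = '\0';			/* Split "name=value" in place.  */
  printf ("name=%s value=%s\n", token, value);
}

int
main (void)
{
  char buf[] = "fuse=adrp+add.cmp+branch:tune=some_flag";
  char *p = buf, *colon;

  while ((colon = strchr (p, ':')))
    {
      *colon = '\0';			/* Make this substring a string.  */
      handle_one_override (p);
      p = colon + 1;
    }
  handle_one_override (p);		/* One last option to parse.  */
  return 0;
}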
8498 static void
8499 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8501 /* The logic here is that if we are disabling all frame pointer generation
8502 then we do not need to disable leaf frame pointer generation as a
8503 separate operation. But if we are *only* disabling leaf frame pointer
8504 generation then we set flag_omit_frame_pointer to true, but in
8505 aarch64_frame_pointer_required we return false only for leaf functions.
8507 PR 70044: We have to be careful about being called multiple times for the
8508 same function. Once we have decided to set flag_omit_frame_pointer just
8509 so that we can omit leaf frame pointers, we must then not interpret a
8510 second call as meaning that all frame pointer generation should be
8511 omitted. We do this by setting flag_omit_frame_pointer to a special,
8512 non-zero value. */
8513 if (opts->x_flag_omit_frame_pointer == 2)
8514 opts->x_flag_omit_frame_pointer = 0;
8516 if (opts->x_flag_omit_frame_pointer)
8517 opts->x_flag_omit_leaf_frame_pointer = false;
8518 else if (opts->x_flag_omit_leaf_frame_pointer)
8519 opts->x_flag_omit_frame_pointer = 2;
8521 /* If not optimizing for size, set the default
8522 alignment to what the target wants. */
8523 if (!opts->x_optimize_size)
8525 if (opts->x_align_loops <= 0)
8526 opts->x_align_loops = aarch64_tune_params.loop_align;
8527 if (opts->x_align_jumps <= 0)
8528 opts->x_align_jumps = aarch64_tune_params.jump_align;
8529 if (opts->x_align_functions <= 0)
8530 opts->x_align_functions = aarch64_tune_params.function_align;
8533 /* We default to no pc-relative literal loads. */
8535 aarch64_pcrelative_literal_loads = false;
8537 /* If -mpc-relative-literal-loads is set on the command line, this
8538 implies that the user asked for PC relative literal loads. */
8539 if (opts->x_pcrelative_literal_loads == 1)
8540 aarch64_pcrelative_literal_loads = true;
8542 /* This is PR70113. When building the Linux kernel with
8543 CONFIG_ARM64_ERRATUM_843419, support for relocations
8544 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8545 removed from the kernel to avoid loading objects with possibly
8546 offending sequences. Without -mpc-relative-literal-loads we would
8547 generate such relocations, preventing the kernel build from
8548 succeeding. */
8549 if (opts->x_pcrelative_literal_loads == 2
8550 && TARGET_FIX_ERR_A53_843419)
8551 aarch64_pcrelative_literal_loads = true;
8553 /* In the tiny memory model it makes no sense to disallow PC relative
8554 literal pool loads. */
8555 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8556 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8557 aarch64_pcrelative_literal_loads = true;
8559 /* When enabling the lower precision Newton series for the square root, also
8560 enable it for the reciprocal square root, since the latter is an
8561    intermediate step for the former.  */
8562 if (flag_mlow_precision_sqrt)
8563 flag_mrecip_low_precision_sqrt = true;
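/* Illustrative sketch (editorial note, not part of GCC): the tri-state
   encoding used above for flag_omit_frame_pointer (0 = keep frame pointers,
   1 = omit all frame pointers, 2 = "set only so that leaf frame pointers can
   be omitted"), showing that re-running the adjustment does not escalate 2
   into 1 -- the PR 70044 concern.  Compile separately.  */

#include <stdio.h>

struct opts { int omit_frame_pointer; int omit_leaf_frame_pointer; };

static void
adjust (struct opts *o)
{
  if (o->omit_frame_pointer == 2)
    o->omit_frame_pointer = 0;		/* Undo our own special value.  */
  if (o->omit_frame_pointer)
    o->omit_leaf_frame_pointer = 0;	/* No separate leaf handling needed.  */
  else if (o->omit_leaf_frame_pointer)
    o->omit_frame_pointer = 2;		/* Leaf-only: remember why it is set.  */
}

int
main (void)
{
  struct opts o = { 0, 1 };		/* Leaf-only omission requested.  */
  adjust (&o);
  adjust (&o);				/* Second call must not change it.  */
  printf ("%d %d\n", o.omit_frame_pointer, o.omit_leaf_frame_pointer);
  return 0;
}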
8566 /* 'Unpack' the internal tuning structs and update the options
8567 in OPTS. The caller must have set up selected_tune and selected_arch
8568 as all the other target-specific codegen decisions are
8569 derived from them. */
8571 void
8572 aarch64_override_options_internal (struct gcc_options *opts)
8574 aarch64_tune_flags = selected_tune->flags;
8575 aarch64_tune = selected_tune->sched_core;
8576 /* Make a copy of the tuning parameters attached to the core, which
8577 we may later overwrite. */
8578 aarch64_tune_params = *(selected_tune->tune);
8579 aarch64_architecture_version = selected_arch->architecture_version;
8581 if (opts->x_aarch64_override_tune_string)
8582 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8583 &aarch64_tune_params);
8585 /* This target defaults to strict volatile bitfields. */
8586 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8587 opts->x_flag_strict_volatile_bitfields = 1;
8589 initialize_aarch64_code_model (opts);
8590 initialize_aarch64_tls_size (opts);
8592 int queue_depth = 0;
8593 switch (aarch64_tune_params.autoprefetcher_model)
8595 case tune_params::AUTOPREFETCHER_OFF:
8596 queue_depth = -1;
8597 break;
8598 case tune_params::AUTOPREFETCHER_WEAK:
8599 queue_depth = 0;
8600 break;
8601 case tune_params::AUTOPREFETCHER_STRONG:
8602 queue_depth = max_insn_queue_index + 1;
8603 break;
8604 default:
8605 gcc_unreachable ();
8608 /* We don't mind passing in global_options_set here as we don't use
8609 the *options_set structs anyway. */
8610 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8611 queue_depth,
8612 opts->x_param_values,
8613 global_options_set.x_param_values);
8615 /* Set the L1 cache line size. */
8616 if (selected_cpu->tune->cache_line_size != 0)
8617 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8618 selected_cpu->tune->cache_line_size,
8619 opts->x_param_values,
8620 global_options_set.x_param_values);
8622 aarch64_override_options_after_change_1 (opts);
8625 /* Print a hint with a suggestion for a core or architecture name that
8626 most closely resembles what the user passed in STR. ARCH is true if
8627 the user is asking for an architecture name. ARCH is false if the user
8628 is asking for a core name. */
8630 static void
8631 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8633 auto_vec<const char *> candidates;
8634 const struct processor *entry = arch ? all_architectures : all_cores;
8635 for (; entry->name != NULL; entry++)
8636 candidates.safe_push (entry->name);
8637 char *s;
8638 const char *hint = candidates_list_and_hint (str, s, candidates);
8639 if (hint)
8640 inform (input_location, "valid arguments are: %s;"
8641 " did you mean %qs?", s, hint);
8642 XDELETEVEC (s);
8645 /* Print a hint with a suggestion for a core name that most closely resembles
8646 what the user passed in STR. */
8648 inline static void
8649 aarch64_print_hint_for_core (const char *str)
8651 aarch64_print_hint_for_core_or_arch (str, false);
8654 /* Print a hint with a suggestion for an architecture name that most closely
8655 resembles what the user passed in STR. */
8657 inline static void
8658 aarch64_print_hint_for_arch (const char *str)
8660 aarch64_print_hint_for_core_or_arch (str, true);
8663 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8664    specified in STR and throw errors if appropriate.  Put the results, if
8665 they are valid in RES and ISA_FLAGS. Return whether the option is
8666 valid. */
8668 static bool
8669 aarch64_validate_mcpu (const char *str, const struct processor **res,
8670 unsigned long *isa_flags)
8672 enum aarch64_parse_opt_result parse_res
8673 = aarch64_parse_cpu (str, res, isa_flags);
8675 if (parse_res == AARCH64_PARSE_OK)
8676 return true;
8678 switch (parse_res)
8680 case AARCH64_PARSE_MISSING_ARG:
8681 error ("missing cpu name in -mcpu=%qs", str);
8682 break;
8683 case AARCH64_PARSE_INVALID_ARG:
8684 error ("unknown value %qs for -mcpu", str);
8685 aarch64_print_hint_for_core (str);
8686 break;
8687 case AARCH64_PARSE_INVALID_FEATURE:
8688 error ("invalid feature modifier in -mcpu=%qs", str);
8689 break;
8690 default:
8691 gcc_unreachable ();
8694 return false;
8697 /* Validate a command-line -march option. Parse the arch and extensions
8698 (if any) specified in STR and throw errors if appropriate. Put the
8699 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8700 option is valid. */
8702 static bool
8703 aarch64_validate_march (const char *str, const struct processor **res,
8704 unsigned long *isa_flags)
8706 enum aarch64_parse_opt_result parse_res
8707 = aarch64_parse_arch (str, res, isa_flags);
8709 if (parse_res == AARCH64_PARSE_OK)
8710 return true;
8712 switch (parse_res)
8714 case AARCH64_PARSE_MISSING_ARG:
8715 error ("missing arch name in -march=%qs", str);
8716 break;
8717 case AARCH64_PARSE_INVALID_ARG:
8718 error ("unknown value %qs for -march", str);
8719 aarch64_print_hint_for_arch (str);
8720 break;
8721 case AARCH64_PARSE_INVALID_FEATURE:
8722 error ("invalid feature modifier in -march=%qs", str);
8723 break;
8724 default:
8725 gcc_unreachable ();
8728 return false;
8731 /* Validate a command-line -mtune option. Parse the cpu
8732 specified in STR and throw errors if appropriate. Put the
8733 result, if it is valid, in RES. Return whether the option is
8734 valid. */
8736 static bool
8737 aarch64_validate_mtune (const char *str, const struct processor **res)
8739 enum aarch64_parse_opt_result parse_res
8740 = aarch64_parse_tune (str, res);
8742 if (parse_res == AARCH64_PARSE_OK)
8743 return true;
8745 switch (parse_res)
8747 case AARCH64_PARSE_MISSING_ARG:
8748 error ("missing cpu name in -mtune=%qs", str);
8749 break;
8750 case AARCH64_PARSE_INVALID_ARG:
8751 error ("unknown value %qs for -mtune", str);
8752 aarch64_print_hint_for_core (str);
8753 break;
8754 default:
8755 gcc_unreachable ();
8757 return false;
8760 /* Return the CPU corresponding to the enum CPU.
8761 If it doesn't specify a cpu, return the default. */
8763 static const struct processor *
8764 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8766 if (cpu != aarch64_none)
8767 return &all_cores[cpu];
8769 /* The & 0x3f is to extract the bottom 6 bits that encode the
8770 default cpu as selected by the --with-cpu GCC configure option
8771 in config.gcc.
8772 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8773 flags mechanism should be reworked to make it more sane. */
8774 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
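/* Illustrative sketch (editorial note, not part of GCC): the packing that the
   code above and aarch64_override_options rely on, with the configure-time
   CPU index in the bottom 6 bits of TARGET_CPU_DEFAULT and the default ISA
   flags in the bits above them.  The values below are hypothetical.
   Compile separately.  */

#include <stdio.h>

int
main (void)
{
  unsigned long cpu_index = 7;		/* Hypothetical core index.  */
  unsigned long isa_flags = 0x15;	/* Hypothetical feature bits.  */
  unsigned long packed = (isa_flags << 6) | (cpu_index & 0x3f);

  printf ("cpu=%lu flags=%#lx\n", packed & 0x3f, packed >> 6);
  return 0;
}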
8777 /* Return the architecture corresponding to the enum ARCH.
8778 If it doesn't specify a valid architecture, return the default. */
8780 static const struct processor *
8781 aarch64_get_arch (enum aarch64_arch arch)
8783 if (arch != aarch64_no_arch)
8784 return &all_architectures[arch];
8786 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8788 return &all_architectures[cpu->arch];
8791 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8792 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8793 tuning structs. In particular it must set selected_tune and
8794 aarch64_isa_flags that define the available ISA features and tuning
8795 decisions. It must also set selected_arch as this will be used to
8796 output the .arch asm tags for each function. */
8798 static void
8799 aarch64_override_options (void)
8801 unsigned long cpu_isa = 0;
8802 unsigned long arch_isa = 0;
8803 aarch64_isa_flags = 0;
8805 bool valid_cpu = true;
8806 bool valid_tune = true;
8807 bool valid_arch = true;
8809 selected_cpu = NULL;
8810 selected_arch = NULL;
8811 selected_tune = NULL;
8813 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8814 If either of -march or -mtune is given, they override their
8815 respective component of -mcpu. */
8816 if (aarch64_cpu_string)
8817 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8818 &cpu_isa);
8820 if (aarch64_arch_string)
8821 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8822 &arch_isa);
8824 if (aarch64_tune_string)
8825 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8827 /* If the user did not specify a processor, choose the default
8828 one for them. This will be the CPU set during configuration using
8829 --with-cpu, otherwise it is "generic". */
8830 if (!selected_cpu)
8832 if (selected_arch)
8834 selected_cpu = &all_cores[selected_arch->ident];
8835 aarch64_isa_flags = arch_isa;
8836 explicit_arch = selected_arch->arch;
8838 else
8840 /* Get default configure-time CPU. */
8841 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8842 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8845 if (selected_tune)
8846 explicit_tune_core = selected_tune->ident;
8848 /* If both -mcpu and -march are specified check that they are architecturally
8849 compatible, warn if they're not and prefer the -march ISA flags. */
8850 else if (selected_arch)
8852 if (selected_arch->arch != selected_cpu->arch)
8854 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8855 all_architectures[selected_cpu->arch].name,
8856 selected_arch->name);
8858 aarch64_isa_flags = arch_isa;
8859 explicit_arch = selected_arch->arch;
8860 explicit_tune_core = selected_tune ? selected_tune->ident
8861 : selected_cpu->ident;
8863 else
8865 /* -mcpu but no -march. */
8866 aarch64_isa_flags = cpu_isa;
8867 explicit_tune_core = selected_tune ? selected_tune->ident
8868 : selected_cpu->ident;
8869 gcc_assert (selected_cpu);
8870 selected_arch = &all_architectures[selected_cpu->arch];
8871 explicit_arch = selected_arch->arch;
8874   /* Set the arch as well, as we will need it when outputting
8875 the .arch directive in assembly. */
8876 if (!selected_arch)
8878 gcc_assert (selected_cpu);
8879 selected_arch = &all_architectures[selected_cpu->arch];
8882 if (!selected_tune)
8883 selected_tune = selected_cpu;
8885 #ifndef HAVE_AS_MABI_OPTION
8886 /* The compiler may have been configured with 2.23.* binutils, which does
8887 not have support for ILP32. */
8888 if (TARGET_ILP32)
8889 error ("Assembler does not support -mabi=ilp32");
8890 #endif
8892 /* Make sure we properly set up the explicit options. */
8893 if ((aarch64_cpu_string && valid_cpu)
8894 || (aarch64_tune_string && valid_tune))
8895 gcc_assert (explicit_tune_core != aarch64_none);
8897 if ((aarch64_cpu_string && valid_cpu)
8898 || (aarch64_arch_string && valid_arch))
8899 gcc_assert (explicit_arch != aarch64_no_arch);
8901 aarch64_override_options_internal (&global_options);
8903 /* Save these options as the default ones in case we push and pop them later
8904 while processing functions with potential target attributes. */
8905 target_option_default_node = target_option_current_node
8906 = build_target_option_node (&global_options);
8909 /* Implement targetm.override_options_after_change. */
8911 static void
8912 aarch64_override_options_after_change (void)
8914 aarch64_override_options_after_change_1 (&global_options);
8917 static struct machine_function *
8918 aarch64_init_machine_status (void)
8920 struct machine_function *machine;
8921 machine = ggc_cleared_alloc<machine_function> ();
8922 return machine;
8925 void
8926 aarch64_init_expanders (void)
8928 init_machine_status = aarch64_init_machine_status;
8931 /* Select the code model to use, adjusting it for PIC when -fpic/-fPIC
     is in effect.  */
8932 static void
8933 initialize_aarch64_code_model (struct gcc_options *opts)
8935 if (opts->x_flag_pic)
8937 switch (opts->x_aarch64_cmodel_var)
8939 case AARCH64_CMODEL_TINY:
8940 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8941 break;
8942 case AARCH64_CMODEL_SMALL:
8943 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8944 aarch64_cmodel = (flag_pic == 2
8945 ? AARCH64_CMODEL_SMALL_PIC
8946 : AARCH64_CMODEL_SMALL_SPIC);
8947 #else
8948 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8949 #endif
8950 break;
8951 case AARCH64_CMODEL_LARGE:
8952 sorry ("code model %qs with -f%s", "large",
8953 opts->x_flag_pic > 1 ? "PIC" : "pic");
8954 break;
8955 default:
8956 gcc_unreachable ();
8959 else
8960 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8963 /* Implement TARGET_OPTION_SAVE. */
8965 static void
8966 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8968 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8971 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8972 using the information saved in PTR. */
8974 static void
8975 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8977 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8978 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8979 opts->x_explicit_arch = ptr->x_explicit_arch;
8980 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8981 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8983 aarch64_override_options_internal (opts);
8986 /* Implement TARGET_OPTION_PRINT. */
8988 static void
8989 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8991 const struct processor *cpu
8992 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8993 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8994 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8995 std::string extension
8996 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8998 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8999 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9000 arch->name, extension.c_str ());
9003 static GTY(()) tree aarch64_previous_fndecl;
9005 void
9006 aarch64_reset_previous_fndecl (void)
9008 aarch64_previous_fndecl = NULL;
9011 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9012 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9013 make sure optab availability predicates are recomputed when necessary. */
9015 void
9016 aarch64_save_restore_target_globals (tree new_tree)
9018 if (TREE_TARGET_GLOBALS (new_tree))
9019 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9020 else if (new_tree == target_option_default_node)
9021 restore_target_globals (&default_target_globals);
9022 else
9023 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9026 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9027 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9028 of the function, if such exists. This function may be called multiple
9029 times on a single function so use aarch64_previous_fndecl to avoid
9030 setting up identical state. */
9032 static void
9033 aarch64_set_current_function (tree fndecl)
9035 if (!fndecl || fndecl == aarch64_previous_fndecl)
9036 return;
9038 tree old_tree = (aarch64_previous_fndecl
9039 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9040 : NULL_TREE);
9042 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9044 /* If current function has no attributes but the previous one did,
9045 use the default node. */
9046 if (!new_tree && old_tree)
9047 new_tree = target_option_default_node;
9049 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9050 the default have been handled by aarch64_save_restore_target_globals from
9051 aarch64_pragma_target_parse. */
9052 if (old_tree == new_tree)
9053 return;
9055 aarch64_previous_fndecl = fndecl;
9057 /* First set the target options. */
9058 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9060 aarch64_save_restore_target_globals (new_tree);
9063 /* Enum describing the various ways we can handle attributes.
9064 In many cases we can reuse the generic option handling machinery. */
9066 enum aarch64_attr_opt_type
9068 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9069 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9070 aarch64_attr_enum, /* Attribute sets an enum variable. */
9071 aarch64_attr_custom /* Attribute requires a custom handling function. */
9074 /* All the information needed to handle a target attribute.
9075 NAME is the name of the attribute.
9076 ATTR_TYPE specifies the type of behavior of the attribute as described
9077 in the definition of enum aarch64_attr_opt_type.
9078 ALLOW_NEG is true if the attribute supports a "no-" form.
9079 HANDLER is the function that takes the attribute string and whether
9080 it is a pragma or attribute and handles the option. It is needed only
9081 when the ATTR_TYPE is aarch64_attr_custom.
9082 OPT_NUM is the enum specifying the option that the attribute modifies.
9083 This is needed for attributes that mirror the behavior of a command-line
9084 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9085 aarch64_attr_enum. */
9087 struct aarch64_attribute_info
9089 const char *name;
9090 enum aarch64_attr_opt_type attr_type;
9091 bool allow_neg;
9092 bool (*handler) (const char *, const char *);
9093 enum opt_code opt_num;
9096 /* Handle the ARCH_STR argument to the arch= target attribute.
9097 PRAGMA_OR_ATTR is used in potential error messages. */
9099 static bool
9100 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9102 const struct processor *tmp_arch = NULL;
9103 enum aarch64_parse_opt_result parse_res
9104 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9106 if (parse_res == AARCH64_PARSE_OK)
9108 gcc_assert (tmp_arch);
9109 selected_arch = tmp_arch;
9110 explicit_arch = selected_arch->arch;
9111 return true;
9114 switch (parse_res)
9116 case AARCH64_PARSE_MISSING_ARG:
9117 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9118 break;
9119 case AARCH64_PARSE_INVALID_ARG:
9120 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9121 aarch64_print_hint_for_arch (str);
9122 break;
9123 case AARCH64_PARSE_INVALID_FEATURE:
9124 error ("invalid feature modifier %qs for 'arch' target %s",
9125 str, pragma_or_attr);
9126 break;
9127 default:
9128 gcc_unreachable ();
9131 return false;
9134 /* Handle the argument CPU_STR to the cpu= target attribute.
9135 PRAGMA_OR_ATTR is used in potential error messages. */
9137 static bool
9138 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9140 const struct processor *tmp_cpu = NULL;
9141 enum aarch64_parse_opt_result parse_res
9142 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9144 if (parse_res == AARCH64_PARSE_OK)
9146 gcc_assert (tmp_cpu);
9147 selected_tune = tmp_cpu;
9148 explicit_tune_core = selected_tune->ident;
9150 selected_arch = &all_architectures[tmp_cpu->arch];
9151 explicit_arch = selected_arch->arch;
9152 return true;
9155 switch (parse_res)
9157 case AARCH64_PARSE_MISSING_ARG:
9158 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9159 break;
9160 case AARCH64_PARSE_INVALID_ARG:
9161 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9162 aarch64_print_hint_for_core (str);
9163 break;
9164 case AARCH64_PARSE_INVALID_FEATURE:
9165 error ("invalid feature modifier %qs for 'cpu' target %s",
9166 str, pragma_or_attr);
9167 break;
9168 default:
9169 gcc_unreachable ();
9172 return false;
9175 /* Handle the argument STR to the tune= target attribute.
9176 PRAGMA_OR_ATTR is used in potential error messages. */
9178 static bool
9179 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9181 const struct processor *tmp_tune = NULL;
9182 enum aarch64_parse_opt_result parse_res
9183 = aarch64_parse_tune (str, &tmp_tune);
9185 if (parse_res == AARCH64_PARSE_OK)
9187 gcc_assert (tmp_tune);
9188 selected_tune = tmp_tune;
9189 explicit_tune_core = selected_tune->ident;
9190 return true;
9193 switch (parse_res)
9195 case AARCH64_PARSE_INVALID_ARG:
9196 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9197 aarch64_print_hint_for_core (str);
9198 break;
9199 default:
9200 gcc_unreachable ();
9203 return false;
9206 /* Parse an architecture extensions target attribute string specified in STR.
9207 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9208 if successful. Update aarch64_isa_flags to reflect the ISA features
9209 modified.
9210 PRAGMA_OR_ATTR is used in potential error messages. */
9212 static bool
9213 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9215 enum aarch64_parse_opt_result parse_res;
9216 unsigned long isa_flags = aarch64_isa_flags;
9218 /* We allow "+nothing" in the beginning to clear out all architectural
9219 features if the user wants to handpick specific features. */
9220 if (strncmp ("+nothing", str, 8) == 0)
9222 isa_flags = 0;
9223 str += 8;
9226 parse_res = aarch64_parse_extension (str, &isa_flags);
9228 if (parse_res == AARCH64_PARSE_OK)
9230 aarch64_isa_flags = isa_flags;
9231 return true;
9234 switch (parse_res)
9236 case AARCH64_PARSE_MISSING_ARG:
9237 error ("missing feature modifier in target %s %qs",
9238 pragma_or_attr, str);
9239 break;
9241 case AARCH64_PARSE_INVALID_FEATURE:
9242 error ("invalid feature modifier in target %s %qs",
9243 pragma_or_attr, str);
9244 break;
9246 default:
9247 gcc_unreachable ();
9250 return false;
9253 /* The target attributes that we support. On top of these we also support just
9254 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9255 handled explicitly in aarch64_process_one_target_attr. */
9257 static const struct aarch64_attribute_info aarch64_attributes[] =
9259 { "general-regs-only", aarch64_attr_mask, false, NULL,
9260 OPT_mgeneral_regs_only },
9261 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9262 OPT_mfix_cortex_a53_835769 },
9263 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9264 OPT_mfix_cortex_a53_843419 },
9265 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9266 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9267 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9268 OPT_momit_leaf_frame_pointer },
9269 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9270 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9271 OPT_march_ },
9272 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9273 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9274 OPT_mtune_ },
9275 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9278 /* Parse ARG_STR which contains the definition of one target attribute.
9279 Show appropriate errors if any or return true if the attribute is valid.
9280 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9281 we're processing a target attribute or pragma. */
9283 static bool
9284 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9286 bool invert = false;
9288 size_t len = strlen (arg_str);
9290 if (len == 0)
9292 error ("malformed target %s", pragma_or_attr);
9293 return false;
9296 char *str_to_check = (char *) alloca (len + 1);
9297 strcpy (str_to_check, arg_str);
9299 /* Skip leading whitespace. */
9300 while (*str_to_check == ' ' || *str_to_check == '\t')
9301 str_to_check++;
9303 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9304 It is easier to detect and handle it explicitly here rather than going
9305 through the machinery for the rest of the target attributes in this
9306 function. */
9307 if (*str_to_check == '+')
9308 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9310 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9312 invert = true;
9313 str_to_check += 3;
9315 char *arg = strchr (str_to_check, '=');
9317 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9318 and point ARG to "foo". */
9319 if (arg)
9321 *arg = '\0';
9322 arg++;
9324 const struct aarch64_attribute_info *p_attr;
9325 bool found = false;
9326 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9328 /* If the names don't match up, or the user has given an argument
9329 to an attribute that doesn't accept one, or didn't give an argument
9330 to an attribute that expects one, fail to match. */
9331 if (strcmp (str_to_check, p_attr->name) != 0)
9332 continue;
9334 found = true;
9335 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9336 || p_attr->attr_type == aarch64_attr_enum;
9338 if (attr_need_arg_p ^ (arg != NULL))
9340 error ("target %s %qs does not accept an argument",
9341 pragma_or_attr, str_to_check);
9342 return false;
9345 /* If the name matches but the attribute does not allow "no-" versions
9346 then we can't match. */
9347 if (invert && !p_attr->allow_neg)
9349 error ("target %s %qs does not allow a negated form",
9350 pragma_or_attr, str_to_check);
9351 return false;
9354 switch (p_attr->attr_type)
9356 /* Has a custom handler registered.
9357 For example, cpu=, arch=, tune=. */
9358 case aarch64_attr_custom:
9359 gcc_assert (p_attr->handler);
9360 if (!p_attr->handler (arg, pragma_or_attr))
9361 return false;
9362 break;
9364 /* Either set or unset a boolean option. */
9365 case aarch64_attr_bool:
9367 struct cl_decoded_option decoded;
9369 generate_option (p_attr->opt_num, NULL, !invert,
9370 CL_TARGET, &decoded);
9371 aarch64_handle_option (&global_options, &global_options_set,
9372 &decoded, input_location);
9373 break;
9375 /* Set or unset a bit in the target_flags. aarch64_handle_option
9376 should know what mask to apply given the option number. */
9377 case aarch64_attr_mask:
9379 struct cl_decoded_option decoded;
9380 /* We only need to specify the option number.
9381 aarch64_handle_option will know which mask to apply. */
9382 decoded.opt_index = p_attr->opt_num;
9383 decoded.value = !invert;
9384 aarch64_handle_option (&global_options, &global_options_set,
9385 &decoded, input_location);
9386 break;
9388 /* Use the option setting machinery to set an option to an enum. */
9389 case aarch64_attr_enum:
9391 gcc_assert (arg);
9392 bool valid;
9393 int value;
9394 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9395 &value, CL_TARGET);
9396 if (valid)
9398 set_option (&global_options, NULL, p_attr->opt_num, value,
9399 NULL, DK_UNSPECIFIED, input_location,
9400 global_dc);
9402 else
9404 error ("target %s %s=%s is not valid",
9405 pragma_or_attr, str_to_check, arg);
9407 break;
9409 default:
9410 gcc_unreachable ();
9414 /* If we reached here we either have found an attribute and validated
9415 it or didn't match any. If we matched an attribute but its arguments
9416 were malformed we will have returned false already. */
9417 return found;
9420 /* Count how many times the character C appears in
9421 NULL-terminated string STR. */
9423 static unsigned int
9424 num_occurences_in_str (char c, char *str)
9426 unsigned int res = 0;
9427 while (*str != '\0')
9429 if (*str == c)
9430 res++;
9432 str++;
9435 return res;
9438 /* Parse the tree in ARGS that contains the target attribute information
9439 and update the global target options space. PRAGMA_OR_ATTR is a string
9440 to be used in error messages, specifying whether this is processing
9441 a target attribute or a target pragma. */
9443 bool
9444 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9446 if (TREE_CODE (args) == TREE_LIST)
9450 tree head = TREE_VALUE (args);
9451 if (head)
9453 if (!aarch64_process_target_attr (head, pragma_or_attr))
9454 return false;
9456 args = TREE_CHAIN (args);
9457 } while (args);
9459 return true;
9461 /* We expect to find a string to parse. */
9462 gcc_assert (TREE_CODE (args) == STRING_CST);
9464 size_t len = strlen (TREE_STRING_POINTER (args));
9465 char *str_to_check = (char *) alloca (len + 1);
9466 strcpy (str_to_check, TREE_STRING_POINTER (args));
9468 if (len == 0)
9470 error ("malformed target %s value", pragma_or_attr);
9471 return false;
9474   /* Used to catch empty entries between commas, e.g.
9475 attribute ((target ("attr1,,attr2"))). */
9476 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9478 /* Handle multiple target attributes separated by ','. */
9479 char *token = strtok (str_to_check, ",");
9481 unsigned int num_attrs = 0;
9482 while (token)
9484 num_attrs++;
9485 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9487 error ("target %s %qs is invalid", pragma_or_attr, token);
9488 return false;
9491 token = strtok (NULL, ",");
9494 if (num_attrs != num_commas + 1)
9496 error ("malformed target %s list %qs",
9497 pragma_or_attr, TREE_STRING_POINTER (args));
9498 return false;
9501 return true;
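/* Illustrative sketch (editorial note, not part of GCC): how counting commas
   catches empty entries such as "attr1,,attr2" -- strtok collapses adjacent
   separators, so the number of tokens ends up smaller than comma-count + 1.
   Compile separately.  */

#include <stdio.h>
#include <string.h>

static int
well_formed_list (const char *input)
{
  char buf[128];
  unsigned int commas = 0, tokens = 0;

  strncpy (buf, input, sizeof buf - 1);
  buf[sizeof buf - 1] = '\0';

  for (const char *p = buf; *p; p++)	/* Count separators first...  */
    if (*p == ',')
      commas++;

  for (char *tok = strtok (buf, ","); tok; tok = strtok (NULL, ","))
    tokens++;				/* ...then count tokens.  */

  return tokens == commas + 1;
}

int
main (void)
{
  printf ("%d %d\n",
	  well_formed_list ("arch=armv8-a,tune=generic"),	/* 1 */
	  well_formed_list ("attr1,,attr2"));			/* 0 */
  return 0;
}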
9504 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9505 process attribute ((target ("..."))). */
9507 static bool
9508 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9510 struct cl_target_option cur_target;
9511 bool ret;
9512 tree old_optimize;
9513 tree new_target, new_optimize;
9514 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9516 /* If what we're processing is the current pragma string then the
9517 target option node is already stored in target_option_current_node
9518 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9519 having to re-parse the string. This is especially useful to keep
9520 arm_neon.h compile times down since that header contains a lot
9521 of intrinsics enclosed in pragmas. */
9522 if (!existing_target && args == current_target_pragma)
9524 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9525 return true;
9527 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9529 old_optimize = build_optimization_node (&global_options);
9530 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9532 /* If the function changed the optimization levels as well as setting
9533 target options, start with the optimizations specified. */
9534 if (func_optimize && func_optimize != old_optimize)
9535 cl_optimization_restore (&global_options,
9536 TREE_OPTIMIZATION (func_optimize));
9538 /* Save the current target options to restore at the end. */
9539 cl_target_option_save (&cur_target, &global_options);
9541 /* If fndecl already has some target attributes applied to it, unpack
9542 them so that we add this attribute on top of them, rather than
9543 overwriting them. */
9544 if (existing_target)
9546 struct cl_target_option *existing_options
9547 = TREE_TARGET_OPTION (existing_target);
9549 if (existing_options)
9550 cl_target_option_restore (&global_options, existing_options);
9552 else
9553 cl_target_option_restore (&global_options,
9554 TREE_TARGET_OPTION (target_option_current_node));
9557 ret = aarch64_process_target_attr (args, "attribute");
9559 /* Set up any additional state. */
9560 if (ret)
9562 aarch64_override_options_internal (&global_options);
9563 /* Initialize SIMD builtins if we haven't already.
9564 Set current_target_pragma to NULL for the duration so that
9565 the builtin initialization code doesn't try to tag the functions
9566 being built with the attributes specified by any current pragma, thus
9567 going into an infinite recursion. */
9568 if (TARGET_SIMD)
9570 tree saved_current_target_pragma = current_target_pragma;
9571 current_target_pragma = NULL;
9572 aarch64_init_simd_builtins ();
9573 current_target_pragma = saved_current_target_pragma;
9575 new_target = build_target_option_node (&global_options);
9577 else
9578 new_target = NULL;
9580 new_optimize = build_optimization_node (&global_options);
9582 if (fndecl && ret)
9584 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9586 if (old_optimize != new_optimize)
9587 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9590 cl_target_option_restore (&global_options, &cur_target);
9592 if (old_optimize != new_optimize)
9593 cl_optimization_restore (&global_options,
9594 TREE_OPTIMIZATION (old_optimize));
9595 return ret;
9598 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9599 tri-bool options (yes, no, don't care) and the default value is
9600    DEF, return true if inlining is allowed.  */
9602 static bool
9603 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9604 int dont_care, int def)
9606 /* If the callee doesn't care, always allow inlining. */
9607 if (callee == dont_care)
9608 return true;
9610 /* If the caller doesn't care, always allow inlining. */
9611 if (caller == dont_care)
9612 return true;
9614 /* Otherwise, allow inlining if either the callee and caller values
9615 agree, or if the callee is using the default value. */
9616 return (callee == caller || callee == def);
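/* Illustrative sketch (editorial note, not part of GCC): the tri-bool rule
   implemented above.  DONT_CARE (the value 2, as passed by the callers of
   aarch64_tribools_ok_for_inlining_p) means "unset"; inlining is rejected
   only when both sides made an explicit choice and the callee's differs from
   both the caller's value and the default.  Compile separately.  */

#include <stdio.h>

#define DONT_CARE 2

static int
tribools_ok (int caller, int callee, int def)
{
  if (callee == DONT_CARE || caller == DONT_CARE)
    return 1;
  return callee == caller || callee == def;
}

int
main (void)
{
  /* Caller says "no" (0), callee says "yes" (1), default is "no":
     incompatible, so inlining would be rejected.  */
  printf ("%d\n", tribools_ok (0, 1, 0));
  /* Callee left it unset: always OK.  */
  printf ("%d\n", tribools_ok (0, DONT_CARE, 0));
  return 0;
}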
9619 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9620 to inline CALLEE into CALLER based on target-specific info.
9621 Make sure that the caller and callee have compatible architectural
9622 features. Then go through the other possible target attributes
9623 and see if they can block inlining. Try not to reject always_inline
9624 callees unless they are incompatible architecturally. */
9626 static bool
9627 aarch64_can_inline_p (tree caller, tree callee)
9629 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9630 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9632 /* If callee has no option attributes, then it is ok to inline. */
9633 if (!callee_tree)
9634 return true;
9636 struct cl_target_option *caller_opts
9637 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9638 : target_option_default_node);
9640 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9643 /* Callee's ISA flags should be a subset of the caller's. */
9644 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9645 != callee_opts->x_aarch64_isa_flags)
9646 return false;
9648 /* Allow non-strict aligned functions inlining into strict
9649 aligned ones. */
9650 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9651 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9652 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9653 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9654 return false;
9656 bool always_inline = lookup_attribute ("always_inline",
9657 DECL_ATTRIBUTES (callee));
9659 /* If the architectural features match up and the callee is always_inline
9660 then the other attributes don't matter. */
9661 if (always_inline)
9662 return true;
9664 if (caller_opts->x_aarch64_cmodel_var
9665 != callee_opts->x_aarch64_cmodel_var)
9666 return false;
9668 if (caller_opts->x_aarch64_tls_dialect
9669 != callee_opts->x_aarch64_tls_dialect)
9670 return false;
9672 /* Honour explicit requests to workaround errata. */
9673 if (!aarch64_tribools_ok_for_inlining_p (
9674 caller_opts->x_aarch64_fix_a53_err835769,
9675 callee_opts->x_aarch64_fix_a53_err835769,
9676 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9677 return false;
9679 if (!aarch64_tribools_ok_for_inlining_p (
9680 caller_opts->x_aarch64_fix_a53_err843419,
9681 callee_opts->x_aarch64_fix_a53_err843419,
9682 2, TARGET_FIX_ERR_A53_843419))
9683 return false;
9685 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9686      caller and callee and they don't match up, reject inlining.  */
9687 if (!aarch64_tribools_ok_for_inlining_p (
9688 caller_opts->x_flag_omit_leaf_frame_pointer,
9689 callee_opts->x_flag_omit_leaf_frame_pointer,
9690 2, 1))
9691 return false;
9693 /* If the callee has specific tuning overrides, respect them. */
9694 if (callee_opts->x_aarch64_override_tune_string != NULL
9695 && caller_opts->x_aarch64_override_tune_string == NULL)
9696 return false;
9698 /* If the user specified tuning override strings for the
9699 caller and callee and they don't match up, reject inlining.
9700 We just do a string compare here, we don't analyze the meaning
9701 of the string, as it would be too costly for little gain. */
9702 if (callee_opts->x_aarch64_override_tune_string
9703 && caller_opts->x_aarch64_override_tune_string
9704 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9705 caller_opts->x_aarch64_override_tune_string) != 0))
9706 return false;
9708 return true;
9711 /* Return true if SYMBOL_REF X binds locally. */
9713 static bool
9714 aarch64_symbol_binds_local_p (const_rtx x)
9716 return (SYMBOL_REF_DECL (x)
9717 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9718 : SYMBOL_REF_LOCAL_P (x));
9721 /* Return true if SYMBOL_REF X is thread local.  */
9722 static bool
9723 aarch64_tls_symbol_p (rtx x)
9725 if (! TARGET_HAVE_TLS)
9726 return false;
9728 if (GET_CODE (x) != SYMBOL_REF)
9729 return false;
9731 return SYMBOL_REF_TLS_MODEL (x) != 0;
9734 /* Classify a TLS symbol into one of the TLS kinds. */
9735 enum aarch64_symbol_type
9736 aarch64_classify_tls_symbol (rtx x)
9738 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9740 switch (tls_kind)
9742 case TLS_MODEL_GLOBAL_DYNAMIC:
9743 case TLS_MODEL_LOCAL_DYNAMIC:
9744 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9746 case TLS_MODEL_INITIAL_EXEC:
9747 switch (aarch64_cmodel)
9749 case AARCH64_CMODEL_TINY:
9750 case AARCH64_CMODEL_TINY_PIC:
9751 return SYMBOL_TINY_TLSIE;
9752 default:
9753 return SYMBOL_SMALL_TLSIE;
9756 case TLS_MODEL_LOCAL_EXEC:
9757 if (aarch64_tls_size == 12)
9758 return SYMBOL_TLSLE12;
9759 else if (aarch64_tls_size == 24)
9760 return SYMBOL_TLSLE24;
9761 else if (aarch64_tls_size == 32)
9762 return SYMBOL_TLSLE32;
9763 else if (aarch64_tls_size == 48)
9764 return SYMBOL_TLSLE48;
9765 else
9766 gcc_unreachable ();
9768 case TLS_MODEL_EMULATED:
9769 case TLS_MODEL_NONE:
9770 return SYMBOL_FORCE_TO_MEM;
9772 default:
9773 gcc_unreachable ();
9777 /* Return the method that should be used to access SYMBOL_REF or
9778 LABEL_REF X. */
9780 enum aarch64_symbol_type
9781 aarch64_classify_symbol (rtx x, rtx offset)
9783 if (GET_CODE (x) == LABEL_REF)
9785 switch (aarch64_cmodel)
9787 case AARCH64_CMODEL_LARGE:
9788 return SYMBOL_FORCE_TO_MEM;
9790 case AARCH64_CMODEL_TINY_PIC:
9791 case AARCH64_CMODEL_TINY:
9792 return SYMBOL_TINY_ABSOLUTE;
9794 case AARCH64_CMODEL_SMALL_SPIC:
9795 case AARCH64_CMODEL_SMALL_PIC:
9796 case AARCH64_CMODEL_SMALL:
9797 return SYMBOL_SMALL_ABSOLUTE;
9799 default:
9800 gcc_unreachable ();
9804 if (GET_CODE (x) == SYMBOL_REF)
9806 if (aarch64_tls_symbol_p (x))
9807 return aarch64_classify_tls_symbol (x);
9809 switch (aarch64_cmodel)
9811 case AARCH64_CMODEL_TINY:
9812 /* When we retrieve symbol + offset address, we have to make sure
9813 the offset does not cause overflow of the final address. But
9814 we have no way of knowing the address of symbol at compile time
9815 so we can't accurately say if the distance between the PC and
9816	 symbol + offset is outside the addressable range of +/-1M in the
9817	 TINY code model.  So we rely on images not being greater than
9818	 1M, cap the offset at 1M, and require anything beyond that to
9819	 be loaded using an alternative mechanism.  Furthermore, if the
9820 symbol is a weak reference to something that isn't known to
9821 resolve to a symbol in this module, then force to memory. */
9822 if ((SYMBOL_REF_WEAK (x)
9823 && !aarch64_symbol_binds_local_p (x))
9824 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9825 return SYMBOL_FORCE_TO_MEM;
9826 return SYMBOL_TINY_ABSOLUTE;
9828 case AARCH64_CMODEL_SMALL:
9829 /* Same reasoning as the tiny code model, but the offset cap here is
9830 4G. */
9831 if ((SYMBOL_REF_WEAK (x)
9832 && !aarch64_symbol_binds_local_p (x))
9833 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9834 HOST_WIDE_INT_C (4294967264)))
9835 return SYMBOL_FORCE_TO_MEM;
9836 return SYMBOL_SMALL_ABSOLUTE;
9838 case AARCH64_CMODEL_TINY_PIC:
9839 if (!aarch64_symbol_binds_local_p (x))
9840 return SYMBOL_TINY_GOT;
9841 return SYMBOL_TINY_ABSOLUTE;
9843 case AARCH64_CMODEL_SMALL_SPIC:
9844 case AARCH64_CMODEL_SMALL_PIC:
9845 if (!aarch64_symbol_binds_local_p (x))
9846 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9847 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9848 return SYMBOL_SMALL_ABSOLUTE;
9850 case AARCH64_CMODEL_LARGE:
9851 /* This is alright even in PIC code as the constant
9852 pool reference is always PC relative and within
9853 the same translation unit. */
9854 if (CONSTANT_POOL_ADDRESS_P (x))
9855 return SYMBOL_SMALL_ABSOLUTE;
9856 else
9857 return SYMBOL_FORCE_TO_MEM;
9859 default:
9860 gcc_unreachable ();
9864 /* By default push everything into the constant pool. */
9865 return SYMBOL_FORCE_TO_MEM;
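/* Illustrative sketch (editorial note, not part of GCC): the offset capping
   described above for the tiny code model, where a symbol + offset more than
   roughly 1MiB away is forced to the constant pool instead of being
   materialised directly.  Compile separately.  */

#include <stdio.h>

static int
tiny_offset_in_range (long long offset)
{
  return offset >= -1048575 && offset <= 1048575;
}

int
main (void)
{
  printf ("%d %d\n",
	  tiny_offset_in_range (1048575),	/* 1: still addressable.  */
	  tiny_offset_in_range (1 << 21));	/* 0: forced to memory.  */
  return 0;
}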
9868 bool
9869 aarch64_constant_address_p (rtx x)
9871 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9874 bool
9875 aarch64_legitimate_pic_operand_p (rtx x)
9877 if (GET_CODE (x) == SYMBOL_REF
9878 || (GET_CODE (x) == CONST
9879 && GET_CODE (XEXP (x, 0)) == PLUS
9880 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9881 return false;
9883 return true;
9886 /* Return true if X holds a floating-point constant that is either +0.0
9887    or representable as a quarter-precision (FMOV) immediate.  */
9888 static bool
9889 aarch64_valid_floating_const (machine_mode mode, rtx x)
9891 if (!CONST_DOUBLE_P (x))
9892 return false;
9894 if (aarch64_float_const_zero_rtx_p (x))
9895 return true;
9897 /* We only handle moving 0.0 to a TFmode register. */
9898 if (!(mode == SFmode || mode == DFmode))
9899 return false;
9901 return aarch64_float_const_representable_p (x);
9904 static bool
9905 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9907 /* Do not allow vector struct mode constants. We could support
9908 0 and -1 easily, but they need support in aarch64-simd.md. */
9909 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9910 return false;
9912 /* This could probably go away because
9913 we now decompose CONST_INTs according to expand_mov_immediate. */
9914 if ((GET_CODE (x) == CONST_VECTOR
9915 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9916 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9917 return !targetm.cannot_force_const_mem (mode, x);
9919 if (GET_CODE (x) == HIGH
9920 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9921 return true;
9923 return aarch64_constant_address_p (x);
9927 aarch64_load_tp (rtx target)
9929 if (!target
9930 || GET_MODE (target) != Pmode
9931 || !register_operand (target, Pmode))
9932 target = gen_reg_rtx (Pmode);
9934 /* Can return in any reg. */
9935 emit_insn (gen_aarch64_load_tp_hard (target));
9936 return target;
9939 /* On AAPCS systems, this is the "struct __va_list". */
9940 static GTY(()) tree va_list_type;
9942 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9943 Return the type to use as __builtin_va_list.
9945 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9947 struct __va_list
9949 void *__stack;
9950 void *__gr_top;
9951 void *__vr_top;
9952 int __gr_offs;
9953 int __vr_offs;
9954 }; */
9956 static tree
9957 aarch64_build_builtin_va_list (void)
9959 tree va_list_name;
9960 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9962 /* Create the type. */
9963 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9964 /* Give it the required name. */
9965 va_list_name = build_decl (BUILTINS_LOCATION,
9966 TYPE_DECL,
9967 get_identifier ("__va_list"),
9968 va_list_type);
9969 DECL_ARTIFICIAL (va_list_name) = 1;
9970 TYPE_NAME (va_list_type) = va_list_name;
9971 TYPE_STUB_DECL (va_list_type) = va_list_name;
9973 /* Create the fields. */
9974 f_stack = build_decl (BUILTINS_LOCATION,
9975 FIELD_DECL, get_identifier ("__stack"),
9976 ptr_type_node);
9977 f_grtop = build_decl (BUILTINS_LOCATION,
9978 FIELD_DECL, get_identifier ("__gr_top"),
9979 ptr_type_node);
9980 f_vrtop = build_decl (BUILTINS_LOCATION,
9981 FIELD_DECL, get_identifier ("__vr_top"),
9982 ptr_type_node);
9983 f_groff = build_decl (BUILTINS_LOCATION,
9984 FIELD_DECL, get_identifier ("__gr_offs"),
9985 integer_type_node);
9986 f_vroff = build_decl (BUILTINS_LOCATION,
9987 FIELD_DECL, get_identifier ("__vr_offs"),
9988 integer_type_node);
9990 /* Tell tree-stdarg pass about our internal offset fields.
9991 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9992 purposes, to identify whether the code is updating the va_list internal
9993 offset fields in an irregular way. */
9994 va_list_gpr_counter_field = f_groff;
9995 va_list_fpr_counter_field = f_vroff;
9997 DECL_ARTIFICIAL (f_stack) = 1;
9998 DECL_ARTIFICIAL (f_grtop) = 1;
9999 DECL_ARTIFICIAL (f_vrtop) = 1;
10000 DECL_ARTIFICIAL (f_groff) = 1;
10001 DECL_ARTIFICIAL (f_vroff) = 1;
10003 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10004 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10005 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10006 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10007 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10009 TYPE_FIELDS (va_list_type) = f_stack;
10010 DECL_CHAIN (f_stack) = f_grtop;
10011 DECL_CHAIN (f_grtop) = f_vrtop;
10012 DECL_CHAIN (f_vrtop) = f_groff;
10013 DECL_CHAIN (f_groff) = f_vroff;
10015 /* Compute its layout. */
10016 layout_type (va_list_type);
10018 return va_list_type;
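/* Illustrative sketch (not part of the original source): under the default
   LP64 ABI the record built above corresponds to the following C type, a
   32-byte structure with 8-byte alignment:

     typedef struct __va_list
     {
       void *__stack;	// next stacked (memory) argument
       void *__gr_top;	// end of the general register save area
       void *__vr_top;	// end of the FP/SIMD register save area
       int __gr_offs;	// negative offset from __gr_top to the next GP arg
       int __vr_offs;	// negative offset from __vr_top to the next VR arg
     } va_list;  */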
10021 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10022 static void
10023 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10025 const CUMULATIVE_ARGS *cum;
10026 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10027 tree stack, grtop, vrtop, groff, vroff;
10028 tree t;
10029 int gr_save_area_size = cfun->va_list_gpr_size;
10030 int vr_save_area_size = cfun->va_list_fpr_size;
10031 int vr_offset;
10033 cum = &crtl->args.info;
10034 if (cfun->va_list_gpr_size)
10035 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10036 cfun->va_list_gpr_size);
10037 if (cfun->va_list_fpr_size)
10038 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10039 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10041 if (!TARGET_FLOAT)
10043 gcc_assert (cum->aapcs_nvrn == 0);
10044 vr_save_area_size = 0;
10047 f_stack = TYPE_FIELDS (va_list_type_node);
10048 f_grtop = DECL_CHAIN (f_stack);
10049 f_vrtop = DECL_CHAIN (f_grtop);
10050 f_groff = DECL_CHAIN (f_vrtop);
10051 f_vroff = DECL_CHAIN (f_groff);
10053 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10054 NULL_TREE);
10055 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10056 NULL_TREE);
10057 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10058 NULL_TREE);
10059 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10060 NULL_TREE);
10061 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10062 NULL_TREE);
10064 /* Emit code to initialize STACK, which points to the next varargs stack
10065 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10066 by named arguments. STACK is 8-byte aligned. */
10067 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10068 if (cum->aapcs_stack_size > 0)
10069 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10070 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10071 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10073 /* Emit code to initialize GRTOP, the top of the GR save area.
10074 virtual_incoming_args_rtx should have been 16 byte aligned. */
10075 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10076 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10077 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10079 /* Emit code to initialize VRTOP, the top of the VR save area.
10080 This address is gr_save_area_bytes below GRTOP, rounded
10081 down to the next 16-byte boundary. */
10082 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10083 vr_offset = ROUND_UP (gr_save_area_size,
10084 STACK_BOUNDARY / BITS_PER_UNIT);
10086 if (vr_offset)
10087 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10088 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10089 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10091 /* Emit code to initialize GROFF, the offset from GRTOP of the
10092 next GPR argument. */
10093 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10094 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10095 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10097 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10098 of the next VR argument. */
10099 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10100 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10101 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
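/* Worked example (illustrative, not from the source): for
   `int f (int a, int b, ...)' the named arguments consume x0 and x1, so
   cum->aapcs_ncrn == 2 and, assuming the tree-stdarg pass has not shrunk
   the save areas and TARGET_FLOAT holds, gr_save_area_size == 6 * 8 == 48
   and vr_save_area_size == 8 * 16 == 128.  The expansion above then sets

     __stack   = virtual_incoming_args          (no named stack arguments)
     __gr_top  = virtual_incoming_args
     __vr_top  = virtual_incoming_args - 48     (GR area, 16-byte aligned)
     __gr_offs = -48
     __vr_offs = -128  */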
10104 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10106 static tree
10107 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10108 gimple_seq *post_p ATTRIBUTE_UNUSED)
10110 tree addr;
10111 bool indirect_p;
10112 bool is_ha; /* is HFA or HVA. */
10113 bool dw_align; /* double-word align. */
10114 machine_mode ag_mode = VOIDmode;
10115 int nregs;
10116 machine_mode mode;
10118 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10119 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10120 HOST_WIDE_INT size, rsize, adjust, align;
10121 tree t, u, cond1, cond2;
10123 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10124 if (indirect_p)
10125 type = build_pointer_type (type);
10127 mode = TYPE_MODE (type);
10129 f_stack = TYPE_FIELDS (va_list_type_node);
10130 f_grtop = DECL_CHAIN (f_stack);
10131 f_vrtop = DECL_CHAIN (f_grtop);
10132 f_groff = DECL_CHAIN (f_vrtop);
10133 f_vroff = DECL_CHAIN (f_groff);
10135 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10136 f_stack, NULL_TREE);
10137 size = int_size_in_bytes (type);
10138 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10140 dw_align = false;
10141 adjust = 0;
10142 if (aarch64_vfp_is_call_or_return_candidate (mode,
10143 type,
10144 &ag_mode,
10145 &nregs,
10146 &is_ha))
10148 /* TYPE passed in fp/simd registers. */
10149 if (!TARGET_FLOAT)
10150 aarch64_err_no_fpadvsimd (mode, "varargs");
10152 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10153 unshare_expr (valist), f_vrtop, NULL_TREE);
10154 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10155 unshare_expr (valist), f_vroff, NULL_TREE);
10157 rsize = nregs * UNITS_PER_VREG;
10159 if (is_ha)
10161 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10162 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10164 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10165 && size < UNITS_PER_VREG)
10167 adjust = UNITS_PER_VREG - size;
10170 else
10172 /* TYPE passed in general registers. */
10173 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10174 unshare_expr (valist), f_grtop, NULL_TREE);
10175 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10176 unshare_expr (valist), f_groff, NULL_TREE);
10177 rsize = ROUND_UP (size, UNITS_PER_WORD);
10178 nregs = rsize / UNITS_PER_WORD;
10180 if (align > 8)
10181 dw_align = true;
10183 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10184 && size < UNITS_PER_WORD)
10186 adjust = UNITS_PER_WORD - size;
10190 /* Get a local temporary for the field value. */
10191 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10193 /* Emit code to branch if off >= 0. */
10194 t = build2 (GE_EXPR, boolean_type_node, off,
10195 build_int_cst (TREE_TYPE (off), 0));
10196 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10198 if (dw_align)
10200 /* Emit: offs = (offs + 15) & -16. */
10201 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10202 build_int_cst (TREE_TYPE (off), 15));
10203 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10204 build_int_cst (TREE_TYPE (off), -16));
10205 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10207 else
10208 roundup = NULL;
10210 /* Update ap.__[g|v]r_offs */
10211 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10212 build_int_cst (TREE_TYPE (off), rsize));
10213 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10215 /* String up. */
10216 if (roundup)
10217 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10219 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10220 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10221 build_int_cst (TREE_TYPE (f_off), 0));
10222 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10224 /* String up: make sure the assignment happens before the use. */
10225 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10226 COND_EXPR_ELSE (cond1) = t;
10228 /* Prepare the trees handling the argument that is passed on the stack;
10229 the top-level node will be stored in ON_STACK. */
10230 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10231 if (align > 8)
10233 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10234 t = fold_convert (intDI_type_node, arg);
10235 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10236 build_int_cst (TREE_TYPE (t), 15));
10237 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10238 build_int_cst (TREE_TYPE (t), -16));
10239 t = fold_convert (TREE_TYPE (arg), t);
10240 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10242 else
10243 roundup = NULL;
10244 /* Advance ap.__stack */
10245 t = fold_convert (intDI_type_node, arg);
10246 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10247 build_int_cst (TREE_TYPE (t), size + 7));
10248 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10249 build_int_cst (TREE_TYPE (t), -8));
10250 t = fold_convert (TREE_TYPE (arg), t);
10251 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10252 /* String up roundup and advance. */
10253 if (roundup)
10254 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10255 /* String up with arg */
10256 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10257 /* Big-endianness related address adjustment. */
10258 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10259 && size < UNITS_PER_WORD)
10261 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10262 size_int (UNITS_PER_WORD - size));
10263 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10266 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10267 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10269 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10270 t = off;
10271 if (adjust)
10272 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10273 build_int_cst (TREE_TYPE (off), adjust));
10275 t = fold_convert (sizetype, t);
10276 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10278 if (is_ha)
10280 /* type ha; // treat as "struct {ftype field[n];}"
10281 ... [computing offs]
10282 for (i = 0; i < nregs; ++i, offs += 16)
10283 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10284 return ha; */
10285 int i;
10286 tree tmp_ha, field_t, field_ptr_t;
10288 /* Declare a local variable. */
10289 tmp_ha = create_tmp_var_raw (type, "ha");
10290 gimple_add_tmp_var (tmp_ha);
10292 /* Establish the base type. */
10293 switch (ag_mode)
10295 case SFmode:
10296 field_t = float_type_node;
10297 field_ptr_t = float_ptr_type_node;
10298 break;
10299 case DFmode:
10300 field_t = double_type_node;
10301 field_ptr_t = double_ptr_type_node;
10302 break;
10303 case TFmode:
10304 field_t = long_double_type_node;
10305 field_ptr_t = long_double_ptr_type_node;
10306 break;
10307 case HFmode:
10308 field_t = aarch64_fp16_type_node;
10309 field_ptr_t = aarch64_fp16_ptr_type_node;
10310 break;
10311 case V2SImode:
10312 case V4SImode:
10314 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10315 field_t = build_vector_type_for_mode (innertype, ag_mode);
10316 field_ptr_t = build_pointer_type (field_t);
10318 break;
10319 default:
10320 gcc_assert (0);
10323 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10324 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10325 addr = t;
10326 t = fold_convert (field_ptr_t, addr);
10327 t = build2 (MODIFY_EXPR, field_t,
10328 build1 (INDIRECT_REF, field_t, tmp_ha),
10329 build1 (INDIRECT_REF, field_t, t));
10331 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10332 for (i = 1; i < nregs; ++i)
10334 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10335 u = fold_convert (field_ptr_t, addr);
10336 u = build2 (MODIFY_EXPR, field_t,
10337 build2 (MEM_REF, field_t, tmp_ha,
10338 build_int_cst (field_ptr_t,
10339 (i *
10340 int_size_in_bytes (field_t)))),
10341 build1 (INDIRECT_REF, field_t, u));
10342 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10345 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10346 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10349 COND_EXPR_ELSE (cond2) = t;
10350 addr = fold_convert (build_pointer_type (type), cond1);
10351 addr = build_va_arg_indirect_ref (addr);
10353 if (indirect_p)
10354 addr = build_va_arg_indirect_ref (addr);
10356 return addr;
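/* Illustrative C-level sketch (not from the source) of the trees built above
   for an argument that may live in general registers and needs no extra
   alignment; the FP/SIMD path is analogous with __vr_top/__vr_offs:

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                    // register save area already used up
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;                    // this argument did not fit either
     addr = ap.__gr_top + off;           // plus any big-endian padding
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (char *) (((uintptr_t) addr + size + 7) & -8);
   done:
     result = *(type *) addr;  */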
10359 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10361 static void
10362 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10363 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10364 int no_rtl)
10366 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10367 CUMULATIVE_ARGS local_cum;
10368 int gr_saved = cfun->va_list_gpr_size;
10369 int vr_saved = cfun->va_list_fpr_size;
10371 /* The caller has advanced CUM up to, but not beyond, the last named
10372 argument. Advance a local copy of CUM past the last "real" named
10373 argument, to find out how many registers are left over. */
10374 local_cum = *cum;
10375 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10377 /* Find out how many registers we need to save.
10378 Honor tree-stdarg analysis results. */
10379 if (cfun->va_list_gpr_size)
10380 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10381 cfun->va_list_gpr_size / UNITS_PER_WORD);
10382 if (cfun->va_list_fpr_size)
10383 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10384 cfun->va_list_fpr_size / UNITS_PER_VREG);
10386 if (!TARGET_FLOAT)
10388 gcc_assert (local_cum.aapcs_nvrn == 0);
10389 vr_saved = 0;
10392 if (!no_rtl)
10394 if (gr_saved > 0)
10396 rtx ptr, mem;
10398 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10399 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10400 - gr_saved * UNITS_PER_WORD);
10401 mem = gen_frame_mem (BLKmode, ptr);
10402 set_mem_alias_set (mem, get_varargs_alias_set ());
10404 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10405 mem, gr_saved);
10407 if (vr_saved > 0)
10409 /* We can't use move_block_from_reg, because it will use
10410 the wrong mode, storing D regs only. */
10411 machine_mode mode = TImode;
10412 int off, i, vr_start;
10414 /* Set OFF to the offset from virtual_incoming_args_rtx of
10415 the first vector register. The VR save area lies below
10416 the GR one, and is aligned to 16 bytes. */
10417 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10418 STACK_BOUNDARY / BITS_PER_UNIT);
10419 off -= vr_saved * UNITS_PER_VREG;
10421 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10422 for (i = 0; i < vr_saved; ++i)
10424 rtx ptr, mem;
10426 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10427 mem = gen_frame_mem (mode, ptr);
10428 set_mem_alias_set (mem, get_varargs_alias_set ());
10429 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10430 off += UNITS_PER_VREG;
10435 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10436 any complication of having crtl->args.pretend_args_size changed. */
10437 cfun->machine->frame.saved_varargs_size
10438 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10439 STACK_BOUNDARY / BITS_PER_UNIT)
10440 + vr_saved * UNITS_PER_VREG);
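/* Illustrative layout (not part of the source) of the save area established
   above, addresses decreasing downwards from virtual_incoming_args_rtx:

     virtual_incoming_args_rtx --> +------------------------------+
                                   | gr_saved GP regs (x_ncrn ..) |  gr_saved * 8 bytes
     (16-byte aligned)         --> +------------------------------+
                                   | vr_saved V regs (q_nvrn ..)  |  vr_saved * 16 bytes
                                   +------------------------------+

   saved_varargs_size records the rounded total so the frame layout code can
   account for the area when sizing the stack frame.  */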
10443 static void
10444 aarch64_conditional_register_usage (void)
10446 int i;
10447 if (!TARGET_FLOAT)
10449 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10451 fixed_regs[i] = 1;
10452 call_used_regs[i] = 1;
10457 /* Walk down the type tree of TYPE counting consecutive base elements.
10458 If *MODEP is VOIDmode, then set it to the first valid floating point
10459 type. If a non-floating point type is found, or if a floating point
10460 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10461 otherwise return the count in the sub-tree. */
10462 static int
10463 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10465 machine_mode mode;
10466 HOST_WIDE_INT size;
10468 switch (TREE_CODE (type))
10470 case REAL_TYPE:
10471 mode = TYPE_MODE (type);
10472 if (mode != DFmode && mode != SFmode
10473 && mode != TFmode && mode != HFmode)
10474 return -1;
10476 if (*modep == VOIDmode)
10477 *modep = mode;
10479 if (*modep == mode)
10480 return 1;
10482 break;
10484 case COMPLEX_TYPE:
10485 mode = TYPE_MODE (TREE_TYPE (type));
10486 if (mode != DFmode && mode != SFmode
10487 && mode != TFmode && mode != HFmode)
10488 return -1;
10490 if (*modep == VOIDmode)
10491 *modep = mode;
10493 if (*modep == mode)
10494 return 2;
10496 break;
10498 case VECTOR_TYPE:
10499 /* Use V2SImode and V4SImode as representatives of all 64-bit
10500 and 128-bit vector types. */
10501 size = int_size_in_bytes (type);
10502 switch (size)
10504 case 8:
10505 mode = V2SImode;
10506 break;
10507 case 16:
10508 mode = V4SImode;
10509 break;
10510 default:
10511 return -1;
10514 if (*modep == VOIDmode)
10515 *modep = mode;
10517 /* Vector modes are considered to be opaque: two vectors are
10518 equivalent for the purposes of being homogeneous aggregates
10519 if they are the same size. */
10520 if (*modep == mode)
10521 return 1;
10523 break;
10525 case ARRAY_TYPE:
10527 int count;
10528 tree index = TYPE_DOMAIN (type);
10530 /* Can't handle incomplete types nor sizes that are not
10531 fixed. */
10532 if (!COMPLETE_TYPE_P (type)
10533 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10534 return -1;
10536 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10537 if (count == -1
10538 || !index
10539 || !TYPE_MAX_VALUE (index)
10540 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10541 || !TYPE_MIN_VALUE (index)
10542 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10543 || count < 0)
10544 return -1;
10546 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10547 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10549 /* There must be no padding. */
10550 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10551 return -1;
10553 return count;
10556 case RECORD_TYPE:
10558 int count = 0;
10559 int sub_count;
10560 tree field;
10562 /* Can't handle incomplete types nor sizes that are not
10563 fixed. */
10564 if (!COMPLETE_TYPE_P (type)
10565 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10566 return -1;
10568 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10570 if (TREE_CODE (field) != FIELD_DECL)
10571 continue;
10573 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10574 if (sub_count < 0)
10575 return -1;
10576 count += sub_count;
10579 /* There must be no padding. */
10580 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10581 return -1;
10583 return count;
10586 case UNION_TYPE:
10587 case QUAL_UNION_TYPE:
10589 /* These aren't very interesting except in a degenerate case. */
10590 int count = 0;
10591 int sub_count;
10592 tree field;
10594 /* Can't handle incomplete types nor sizes that are not
10595 fixed. */
10596 if (!COMPLETE_TYPE_P (type)
10597 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10598 return -1;
10600 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10602 if (TREE_CODE (field) != FIELD_DECL)
10603 continue;
10605 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10606 if (sub_count < 0)
10607 return -1;
10608 count = count > sub_count ? count : sub_count;
10611 /* There must be no padding. */
10612 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10613 return -1;
10615 return count;
10618 default:
10619 break;
10622 return -1;
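/* Illustrative examples (not from the source) of what the walk above yields
   when *MODEP starts out as VOIDmode:

     struct { float x, y, z; }                  ->  3, *MODEP == SFmode  (HFA)
     struct { double d[2]; _Complex double c; } ->  4, *MODEP == DFmode  (HFA)
     struct { float32x4_t a, b; }               ->  2, *MODEP == V4SImode (HVA)
     struct { float f; double d; }              -> -1  (mixed base types)
     struct { float f; int i; }                 -> -1  (non-FP member)

   A positive count no greater than HA_MAX_NUM_FLDS (4) makes the type a
   homogeneous aggregate; see aarch64_vfp_is_call_or_return_candidate below.  */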
10625 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10626 type as described in AAPCS64 \S 4.1.2.
10628 See the comment above aarch64_composite_type_p for the notes on MODE. */
10630 static bool
10631 aarch64_short_vector_p (const_tree type,
10632 machine_mode mode)
10634 HOST_WIDE_INT size = -1;
10636 if (type && TREE_CODE (type) == VECTOR_TYPE)
10637 size = int_size_in_bytes (type);
10638 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10639 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10640 size = GET_MODE_SIZE (mode);
10642 return (size == 8 || size == 16);
10645 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10646 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10647 array types. The C99 floating-point complex types are also considered
10648 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10649 types, which are GCC extensions and out of the scope of AAPCS64, are
10650 treated as composite types here as well.
10652 Note that MODE itself is not sufficient in determining whether a type
10653 is such a composite type or not. This is because
10654 stor-layout.c:compute_record_mode may have already changed the MODE
10655 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10656 structure with only one field may have its MODE set to the mode of the
10657 field. Also an integer mode whose size matches the size of the
10658 RECORD_TYPE type may be used to substitute the original mode
10659 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10660 solely relied on. */
10662 static bool
10663 aarch64_composite_type_p (const_tree type,
10664 machine_mode mode)
10666 if (aarch64_short_vector_p (type, mode))
10667 return false;
10669 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10670 return true;
10672 if (mode == BLKmode
10673 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10674 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10675 return true;
10677 return false;
10680 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10681 shall be passed or returned in simd/fp register(s) (providing these
10682 parameter passing registers are available).
10684 Upon successful return, *COUNT returns the number of needed registers,
10685 *BASE_MODE returns the mode of the individual register and, when IS_HA
10686 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10687 floating-point aggregate or a homogeneous short-vector aggregate. */
10689 static bool
10690 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10691 const_tree type,
10692 machine_mode *base_mode,
10693 int *count,
10694 bool *is_ha)
10696 machine_mode new_mode = VOIDmode;
10697 bool composite_p = aarch64_composite_type_p (type, mode);
10699 if (is_ha != NULL) *is_ha = false;
10701 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10702 || aarch64_short_vector_p (type, mode))
10704 *count = 1;
10705 new_mode = mode;
10707 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10709 if (is_ha != NULL) *is_ha = true;
10710 *count = 2;
10711 new_mode = GET_MODE_INNER (mode);
10713 else if (type && composite_p)
10715 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10717 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10719 if (is_ha != NULL) *is_ha = true;
10720 *count = ag_count;
10722 else
10723 return false;
10725 else
10726 return false;
10728 *base_mode = new_mode;
10729 return true;
10732 /* Implement TARGET_STRUCT_VALUE_RTX. */
10734 static rtx
10735 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10736 int incoming ATTRIBUTE_UNUSED)
10738 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10741 /* Implements target hook vector_mode_supported_p. */
10742 static bool
10743 aarch64_vector_mode_supported_p (machine_mode mode)
10745 if (TARGET_SIMD
10746 && (mode == V4SImode || mode == V8HImode
10747 || mode == V16QImode || mode == V2DImode
10748 || mode == V2SImode || mode == V4HImode
10749 || mode == V8QImode || mode == V2SFmode
10750 || mode == V4SFmode || mode == V2DFmode
10751 || mode == V4HFmode || mode == V8HFmode
10752 || mode == V1DFmode))
10753 return true;
10755 return false;
10758 /* Return appropriate SIMD container
10759 for MODE within a vector of WIDTH bits. */
10760 static machine_mode
10761 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10763 gcc_assert (width == 64 || width == 128);
10764 if (TARGET_SIMD)
10766 if (width == 128)
10767 switch (mode)
10769 case DFmode:
10770 return V2DFmode;
10771 case SFmode:
10772 return V4SFmode;
10773 case SImode:
10774 return V4SImode;
10775 case HImode:
10776 return V8HImode;
10777 case QImode:
10778 return V16QImode;
10779 case DImode:
10780 return V2DImode;
10781 default:
10782 break;
10784 else
10785 switch (mode)
10787 case SFmode:
10788 return V2SFmode;
10789 case SImode:
10790 return V2SImode;
10791 case HImode:
10792 return V4HImode;
10793 case QImode:
10794 return V8QImode;
10795 default:
10796 break;
10799 return word_mode;
10802 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10803 static machine_mode
10804 aarch64_preferred_simd_mode (machine_mode mode)
10806 return aarch64_simd_container_mode (mode, 128);
10809 /* Return the bitmask of possible vector sizes for the vectorizer
10810 to iterate over. */
10811 static unsigned int
10812 aarch64_autovectorize_vector_sizes (void)
10814 return (16 | 8);
10817 /* Implement TARGET_MANGLE_TYPE. */
10819 static const char *
10820 aarch64_mangle_type (const_tree type)
10822 /* The AArch64 ABI documents say that "__va_list" has to be
10823 mangled as if it is in the "std" namespace. */
10824 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10825 return "St9__va_list";
10827 /* Half-precision float. */
10828 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10829 return "Dh";
10831 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10832 builtin types. */
10833 if (TYPE_NAME (type) != NULL)
10834 return aarch64_mangle_builtin_type (type);
10836 /* Use the default mangling. */
10837 return NULL;
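/* Illustrative examples (not from the source) of the rules above: with a
   C++ front end, `void f (__builtin_va_list)' mangles as if the parameter
   were `std::__va_list', giving `_Z1fSt9__va_list', and `void g (__fp16)'
   mangles as `_Z1gDh'.  */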
10841 /* Return true if the rtx_insn contains a MEM RTX somewhere
10842 in it. */
10844 static bool
10845 has_memory_op (rtx_insn *mem_insn)
10847 subrtx_iterator::array_type array;
10848 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10849 if (MEM_P (*iter))
10850 return true;
10852 return false;
10855 /* Find the first rtx_insn before insn that will generate an assembly
10856 instruction. */
10858 static rtx_insn *
10859 aarch64_prev_real_insn (rtx_insn *insn)
10861 if (!insn)
10862 return NULL;
10866 insn = prev_real_insn (insn);
10868 while (insn && recog_memoized (insn) < 0);
10870 return insn;
10873 static bool
10874 is_madd_op (enum attr_type t1)
10876 unsigned int i;
10877 /* A number of these may be AArch32 only. */
10878 enum attr_type mlatypes[] = {
10879 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10880 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10881 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10884 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10886 if (t1 == mlatypes[i])
10887 return true;
10890 return false;
10893 /* Check if there is a register dependency between a load and the insn
10894 for which we hold recog_data. */
10896 static bool
10897 dep_between_memop_and_curr (rtx memop)
10899 rtx load_reg;
10900 int opno;
10902 gcc_assert (GET_CODE (memop) == SET);
10904 if (!REG_P (SET_DEST (memop)))
10905 return false;
10907 load_reg = SET_DEST (memop);
10908 for (opno = 1; opno < recog_data.n_operands; opno++)
10910 rtx operand = recog_data.operand[opno];
10911 if (REG_P (operand)
10912 && reg_overlap_mentioned_p (load_reg, operand))
10913 return true;
10916 return false;
10920 /* When working around the Cortex-A53 erratum 835769,
10921 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10922 instruction and has a preceding memory instruction such that a NOP
10923 should be inserted between them. */
10925 bool
10926 aarch64_madd_needs_nop (rtx_insn* insn)
10928 enum attr_type attr_type;
10929 rtx_insn *prev;
10930 rtx body;
10932 if (!TARGET_FIX_ERR_A53_835769)
10933 return false;
10935 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10936 return false;
10938 attr_type = get_attr_type (insn);
10939 if (!is_madd_op (attr_type))
10940 return false;
10942 prev = aarch64_prev_real_insn (insn);
10943 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10944 Restore recog state to INSN to avoid state corruption. */
10945 extract_constrain_insn_cached (insn);
10947 if (!prev || !has_memory_op (prev))
10948 return false;
10950 body = single_set (prev);
10952 /* If the previous insn is a memory op and there is no dependency between
10953 it and the DImode madd, emit a NOP between them. If body is NULL then we
10954 have a complex memory operation, probably a load/store pair.
10955 Be conservative for now and emit a NOP. */
10956 if (GET_MODE (recog_data.operand[0]) == DImode
10957 && (!body || !dep_between_memop_and_curr (body)))
10958 return true;
10960 return false;
10965 /* Implement FINAL_PRESCAN_INSN. */
10967 void
10968 aarch64_final_prescan_insn (rtx_insn *insn)
10970 if (aarch64_madd_needs_nop (insn))
10971 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
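/* Illustrative effect of the erratum 835769 workaround above (the assembly is
   an example, not taken from the source): with -mfix-cortex-a53-835769, a
   sequence such as

     ldr  x1, [x2]
     madd x0, x3, x4, x5

   is emitted as

     ldr  x1, [x2]
     nop		// between mem op and mult-accumulate
     madd x0, x3, x4, x5

   because the 64-bit multiply-accumulate has no register dependency on the
   preceding memory operation.  */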
10975 /* Return the equivalent letter for size. */
10976 static char
10977 sizetochar (int size)
10979 switch (size)
10981 case 64: return 'd';
10982 case 32: return 's';
10983 case 16: return 'h';
10984 case 8 : return 'b';
10985 default: gcc_unreachable ();
10989 /* Return true iff x is a uniform vector of floating-point
10990 constants, and the constant can be represented in
10991 quarter-precision form. Note, as aarch64_float_const_representable
10992 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
10993 static bool
10994 aarch64_vect_float_const_representable_p (rtx x)
10996 rtx elt;
10997 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10998 && const_vec_duplicate_p (x, &elt)
10999 && aarch64_float_const_representable_p (elt));
11002 /* Return true for valid and false for invalid. */
11003 bool
11004 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11005 struct simd_immediate_info *info)
11007 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11008 matches = 1; \
11009 for (i = 0; i < idx; i += (STRIDE)) \
11010 if (!(TEST)) \
11011 matches = 0; \
11012 if (matches) \
11014 immtype = (CLASS); \
11015 elsize = (ELSIZE); \
11016 eshift = (SHIFT); \
11017 emvn = (NEG); \
11018 break; \
11021 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11022 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11023 unsigned char bytes[16];
11024 int immtype = -1, matches;
11025 unsigned int invmask = inverse ? 0xff : 0;
11026 int eshift, emvn;
11028 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11030 if (! (aarch64_simd_imm_zero_p (op, mode)
11031 || aarch64_vect_float_const_representable_p (op)))
11032 return false;
11034 if (info)
11036 info->value = CONST_VECTOR_ELT (op, 0);
11037 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11038 info->mvn = false;
11039 info->shift = 0;
11042 return true;
11045 /* Splat vector constant out into a byte vector. */
11046 for (i = 0; i < n_elts; i++)
11048 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11049 it must be laid out in the vector register in reverse order. */
11050 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11051 unsigned HOST_WIDE_INT elpart;
11053 gcc_assert (CONST_INT_P (el));
11054 elpart = INTVAL (el);
11056 for (unsigned int byte = 0; byte < innersize; byte++)
11058 bytes[idx++] = (elpart & 0xff) ^ invmask;
11059 elpart >>= BITS_PER_UNIT;
11064 /* Sanity check. */
11065 gcc_assert (idx == GET_MODE_SIZE (mode));
11069 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11070 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11072 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11073 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11075 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11076 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11078 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11079 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11081 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11083 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11085 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11086 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11088 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11089 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11091 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11092 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11094 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11095 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11097 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11099 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11101 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11102 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11104 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11105 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11107 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11108 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11110 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11111 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11113 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11115 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11116 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11118 while (0);
11120 if (immtype == -1)
11121 return false;
11123 if (info)
11125 info->element_width = elsize;
11126 info->mvn = emvn != 0;
11127 info->shift = eshift;
11129 unsigned HOST_WIDE_INT imm = 0;
11131 if (immtype >= 12 && immtype <= 15)
11132 info->msl = true;
11134 /* Un-invert bytes of recognized vector, if necessary. */
11135 if (invmask != 0)
11136 for (i = 0; i < idx; i++)
11137 bytes[i] ^= invmask;
11139 if (immtype == 17)
11141 /* FIXME: Broken on 32-bit H_W_I hosts. */
11142 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11144 for (i = 0; i < 8; i++)
11145 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11146 << (i * BITS_PER_UNIT);
11149 info->value = GEN_INT (imm);
11151 else
11153 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11154 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11156 /* Construct 'abcdefgh' because the assembler cannot handle
11157 generic constants. */
11158 if (info->mvn)
11159 imm = ~imm;
11160 imm = (imm >> info->shift) & 0xff;
11161 info->value = GEN_INT (imm);
11165 return true;
11166 #undef CHECK
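/* Worked example (illustrative, not from the source): on a little-endian
   target a V4SImode constant whose elements are all 0x0000ff00 splats to the
   per-element byte pattern {0, 0xff, 0, 0}, which matches the second CHECK
   above (ELSIZE 32, SHIFT 8, no inversion).  *INFO then reports
   element_width == 32, shift == 8, mvn == false and value == 0xff, which the
   output code can emit as, e.g.,

     movi  v0.4s, 0xff, lsl 8  */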
11169 /* Check that immediate shift constants are within range. */
11170 bool
11171 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11173 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11174 if (left)
11175 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11176 else
11177 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11180 /* Return true if X is a uniform vector where all elements
11181 are either the floating-point constant 0.0 or the
11182 integer constant 0. */
11183 bool
11184 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11186 return x == CONST0_RTX (mode);
11190 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11191 operation of width WIDTH at bit position POS. */
11194 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11196 gcc_assert (CONST_INT_P (width));
11197 gcc_assert (CONST_INT_P (pos));
11199 unsigned HOST_WIDE_INT mask
11200 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11201 return GEN_INT (mask << UINTVAL (pos));
11204 bool
11205 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11207 HOST_WIDE_INT imm = INTVAL (x);
11208 int i;
11210 for (i = 0; i < 8; i++)
11212 unsigned int byte = imm & 0xff;
11213 if (byte != 0xff && byte != 0)
11214 return false;
11215 imm >>= 8;
11218 return true;
11221 bool
11222 aarch64_mov_operand_p (rtx x, machine_mode mode)
11224 if (GET_CODE (x) == HIGH
11225 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11226 return true;
11228 if (CONST_INT_P (x))
11229 return true;
11231 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11232 return true;
11234 return aarch64_classify_symbolic_expression (x)
11235 == SYMBOL_TINY_ABSOLUTE;
11238 /* Return a const_int vector of VAL. */
11240 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11242 int nunits = GET_MODE_NUNITS (mode);
11243 rtvec v = rtvec_alloc (nunits);
11244 int i;
11246 rtx cache = GEN_INT (val);
11248 for (i=0; i < nunits; i++)
11249 RTVEC_ELT (v, i) = cache;
11251 return gen_rtx_CONST_VECTOR (mode, v);
11254 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11256 bool
11257 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11259 machine_mode vmode;
11261 gcc_assert (!VECTOR_MODE_P (mode));
11262 vmode = aarch64_preferred_simd_mode (mode);
11263 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11264 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11267 /* Construct and return a PARALLEL RTX vector with elements numbering the
11268 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11269 the vector - from the perspective of the architecture. This does not
11270 line up with GCC's perspective on lane numbers, so we end up with
11271 different masks depending on our target endian-ness. The diagram
11272 below may help. We must draw the distinction when building masks
11273 which select one half of the vector. An instruction selecting
11274 architectural low-lanes for a big-endian target, must be described using
11275 a mask selecting GCC high-lanes.
11277 Big-Endian Little-Endian
11279 GCC 0 1 2 3 3 2 1 0
11280 | x | x | x | x | | x | x | x | x |
11281 Architecture 3 2 1 0 3 2 1 0
11283 Low Mask: { 2, 3 } { 0, 1 }
11284 High Mask: { 0, 1 } { 2, 3 }
11288 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11290 int nunits = GET_MODE_NUNITS (mode);
11291 rtvec v = rtvec_alloc (nunits / 2);
11292 int high_base = nunits / 2;
11293 int low_base = 0;
11294 int base;
11295 rtx t1;
11296 int i;
11298 if (BYTES_BIG_ENDIAN)
11299 base = high ? low_base : high_base;
11300 else
11301 base = high ? high_base : low_base;
11303 for (i = 0; i < nunits / 2; i++)
11304 RTVEC_ELT (v, i) = GEN_INT (base + i);
11306 t1 = gen_rtx_PARALLEL (mode, v);
11307 return t1;
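/* Example (illustrative): for V4SImode and HIGH == true this returns
   (parallel [(const_int 2) (const_int 3)]) on little-endian targets and
   (parallel [(const_int 0) (const_int 1)]) on big-endian targets, matching
   the "High Mask" row of the diagram above.  */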
11310 /* Check OP for validity as a PARALLEL RTX vector with elements
11311 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11312 from the perspective of the architecture. See the diagram above
11313 aarch64_simd_vect_par_cnst_half for more details. */
11315 bool
11316 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11317 bool high)
11319 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11320 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11321 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11322 int i = 0;
11324 if (!VECTOR_MODE_P (mode))
11325 return false;
11327 if (count_op != count_ideal)
11328 return false;
11330 for (i = 0; i < count_ideal; i++)
11332 rtx elt_op = XVECEXP (op, 0, i);
11333 rtx elt_ideal = XVECEXP (ideal, 0, i);
11335 if (!CONST_INT_P (elt_op)
11336 || INTVAL (elt_ideal) != INTVAL (elt_op))
11337 return false;
11339 return true;
11342 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11343 HIGH (exclusive). */
11344 void
11345 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11346 const_tree exp)
11348 HOST_WIDE_INT lane;
11349 gcc_assert (CONST_INT_P (operand));
11350 lane = INTVAL (operand);
11352 if (lane < low || lane >= high)
11354 if (exp)
11355 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11356 else
11357 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11361 /* Return TRUE if OP is a valid vector addressing mode. */
11362 bool
11363 aarch64_simd_mem_operand_p (rtx op)
11365 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11366 || REG_P (XEXP (op, 0)));
11369 /* Emit a register copy from operand to operand, taking care not to
11370 early-clobber source registers in the process.
11372 COUNT is the number of components into which the copy needs to be
11373 decomposed. */
11374 void
11375 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11376 unsigned int count)
11378 unsigned int i;
11379 int rdest = REGNO (operands[0]);
11380 int rsrc = REGNO (operands[1]);
11382 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11383 || rdest < rsrc)
11384 for (i = 0; i < count; i++)
11385 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11386 gen_rtx_REG (mode, rsrc + i));
11387 else
11388 for (i = 0; i < count; i++)
11389 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11390 gen_rtx_REG (mode, rsrc + count - i - 1));
11393 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11394 one of VSTRUCT modes: OI, CI, or XI. */
11396 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11398 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11401 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11402 alignment of a vector to 128 bits. */
11403 static HOST_WIDE_INT
11404 aarch64_simd_vector_alignment (const_tree type)
11406 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11407 return MIN (align, 128);
11410 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11411 static bool
11412 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11414 if (is_packed)
11415 return false;
11417 /* We guarantee alignment for vectors up to 128-bits. */
11418 if (tree_int_cst_compare (TYPE_SIZE (type),
11419 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11420 return false;
11422 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11423 return true;
11426 /* Return true if the vector misalignment factor is supported by the
11427 target. */
11428 static bool
11429 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11430 const_tree type, int misalignment,
11431 bool is_packed)
11433 if (TARGET_SIMD && STRICT_ALIGNMENT)
11435 /* Return false if the movmisalign pattern is not supported for this mode. */
11436 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11437 return false;
11439 if (misalignment == -1)
11441 /* Misalignment factor is unknown at compile time but we know
11442 it's word aligned. */
11443 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11445 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11447 if (element_size != 64)
11448 return true;
11450 return false;
11453 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11454 is_packed);
11457 /* If VALS is a vector constant that can be loaded into a register
11458 using DUP, generate instructions to do so and return an RTX to
11459 assign to the register. Otherwise return NULL_RTX. */
11460 static rtx
11461 aarch64_simd_dup_constant (rtx vals)
11463 machine_mode mode = GET_MODE (vals);
11464 machine_mode inner_mode = GET_MODE_INNER (mode);
11465 rtx x;
11467 if (!const_vec_duplicate_p (vals, &x))
11468 return NULL_RTX;
11470 /* We can load this constant by using DUP and a constant in a
11471 single ARM register. This will be cheaper than a vector
11472 load. */
11473 x = copy_to_mode_reg (inner_mode, x);
11474 return gen_rtx_VEC_DUPLICATE (mode, x);
11478 /* Generate code to load VALS, which is a PARALLEL containing only
11479 constants (for vec_init) or CONST_VECTOR, efficiently into a
11480 register. Returns an RTX to copy into the register, or NULL_RTX
11481 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11482 static rtx
11483 aarch64_simd_make_constant (rtx vals)
11485 machine_mode mode = GET_MODE (vals);
11486 rtx const_dup;
11487 rtx const_vec = NULL_RTX;
11488 int n_elts = GET_MODE_NUNITS (mode);
11489 int n_const = 0;
11490 int i;
11492 if (GET_CODE (vals) == CONST_VECTOR)
11493 const_vec = vals;
11494 else if (GET_CODE (vals) == PARALLEL)
11496 /* A CONST_VECTOR must contain only CONST_INTs and
11497 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11498 Only store valid constants in a CONST_VECTOR. */
11499 for (i = 0; i < n_elts; ++i)
11501 rtx x = XVECEXP (vals, 0, i);
11502 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11503 n_const++;
11505 if (n_const == n_elts)
11506 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11508 else
11509 gcc_unreachable ();
11511 if (const_vec != NULL_RTX
11512 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11513 /* Load using MOVI/MVNI. */
11514 return const_vec;
11515 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11516 /* Loaded using DUP. */
11517 return const_dup;
11518 else if (const_vec != NULL_RTX)
11519 /* Load from constant pool. We can not take advantage of single-cycle
11520 LD1 because we need a PC-relative addressing mode. */
11521 return const_vec;
11522 else
11523 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11524 We can not construct an initializer. */
11525 return NULL_RTX;
11528 /* Expand a vector initialisation sequence, such that TARGET is
11529 initialised to contain VALS. */
11531 void
11532 aarch64_expand_vector_init (rtx target, rtx vals)
11534 machine_mode mode = GET_MODE (target);
11535 machine_mode inner_mode = GET_MODE_INNER (mode);
11536 /* The number of vector elements. */
11537 int n_elts = GET_MODE_NUNITS (mode);
11538 /* The number of vector elements which are not constant. */
11539 int n_var = 0;
11540 rtx any_const = NULL_RTX;
11541 /* The first element of vals. */
11542 rtx v0 = XVECEXP (vals, 0, 0);
11543 bool all_same = true;
11545 /* Count the number of variable elements to initialise. */
11546 for (int i = 0; i < n_elts; ++i)
11548 rtx x = XVECEXP (vals, 0, i);
11549 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11550 ++n_var;
11551 else
11552 any_const = x;
11554 all_same &= rtx_equal_p (x, v0);
11557 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11558 how best to handle this. */
11559 if (n_var == 0)
11561 rtx constant = aarch64_simd_make_constant (vals);
11562 if (constant != NULL_RTX)
11564 emit_move_insn (target, constant);
11565 return;
11569 /* Splat a single non-constant element if we can. */
11570 if (all_same)
11572 rtx x = copy_to_mode_reg (inner_mode, v0);
11573 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11574 return;
11577 /* Initialise a vector which is part-variable. We want to first try
11578 to build those lanes which are constant in the most efficient way we
11579 can. */
11580 if (n_var != n_elts)
11582 rtx copy = copy_rtx (vals);
11584 /* Load constant part of vector. We really don't care what goes into the
11585 parts we will overwrite, but we're more likely to be able to load the
11586 constant efficiently if it has fewer, larger, repeating parts
11587 (see aarch64_simd_valid_immediate). */
11588 for (int i = 0; i < n_elts; i++)
11590 rtx x = XVECEXP (vals, 0, i);
11591 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11592 continue;
11593 rtx subst = any_const;
11594 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11596 /* Look in the copied vector, as more elements are const. */
11597 rtx test = XVECEXP (copy, 0, i ^ bit);
11598 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11600 subst = test;
11601 break;
11604 XVECEXP (copy, 0, i) = subst;
11606 aarch64_expand_vector_init (target, copy);
11609 /* Insert the variable lanes directly. */
11611 enum insn_code icode = optab_handler (vec_set_optab, mode);
11612 gcc_assert (icode != CODE_FOR_nothing);
11614 for (int i = 0; i < n_elts; i++)
11616 rtx x = XVECEXP (vals, 0, i);
11617 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11618 continue;
11619 x = copy_to_mode_reg (inner_mode, x);
11620 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11624 static unsigned HOST_WIDE_INT
11625 aarch64_shift_truncation_mask (machine_mode mode)
11627 return
11628 (!SHIFT_COUNT_TRUNCATED
11629 || aarch64_vector_mode_supported_p (mode)
11630 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11633 /* Select a format to encode pointers in exception handling data. */
11635 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11637 int type;
11638 switch (aarch64_cmodel)
11640 case AARCH64_CMODEL_TINY:
11641 case AARCH64_CMODEL_TINY_PIC:
11642 case AARCH64_CMODEL_SMALL:
11643 case AARCH64_CMODEL_SMALL_PIC:
11644 case AARCH64_CMODEL_SMALL_SPIC:
11645 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11646 for everything. */
11647 type = DW_EH_PE_sdata4;
11648 break;
11649 default:
11650 /* No assumptions here. 8-byte relocs required. */
11651 type = DW_EH_PE_sdata8;
11652 break;
11654 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11657 /* The last .arch and .tune assembly strings that we printed. */
11658 static std::string aarch64_last_printed_arch_string;
11659 static std::string aarch64_last_printed_tune_string;
11661 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11662 by the function fndecl. */
11664 void
11665 aarch64_declare_function_name (FILE *stream, const char* name,
11666 tree fndecl)
11668 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11670 struct cl_target_option *targ_options;
11671 if (target_parts)
11672 targ_options = TREE_TARGET_OPTION (target_parts);
11673 else
11674 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11675 gcc_assert (targ_options);
11677 const struct processor *this_arch
11678 = aarch64_get_arch (targ_options->x_explicit_arch);
11680 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11681 std::string extension
11682 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11683 this_arch->flags);
11684 /* Only update the assembler .arch string if it is distinct from the last
11685 such string we printed. */
11686 std::string to_print = this_arch->name + extension;
11687 if (to_print != aarch64_last_printed_arch_string)
11689 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11690 aarch64_last_printed_arch_string = to_print;
11693 /* Print the cpu name we're tuning for in the comments; it might be
11694 useful to readers of the generated asm. Do it only when it changes
11695 from function to function and verbose assembly is requested. */
11696 const struct processor *this_tune
11697 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11699 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11701 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11702 this_tune->name);
11703 aarch64_last_printed_tune_string = this_tune->name;
11706 /* Don't forget the type directive for ELF. */
11707 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11708 ASM_OUTPUT_LABEL (stream, name);
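/* Illustrative output (example only; the exact strings depend on the options
   in effect): for the first function of a unit compiled with
   -march=armv8-a+crc -mtune=cortex-a53 -fverbose-asm, this hook together with
   aarch64_start_file below emits something like

     	.arch armv8-a+crc
     	// .tune cortex-a53
     	.type	foo, %function
     foo:  */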
11711 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11713 static void
11714 aarch64_start_file (void)
11716 struct cl_target_option *default_options
11717 = TREE_TARGET_OPTION (target_option_default_node);
11719 const struct processor *default_arch
11720 = aarch64_get_arch (default_options->x_explicit_arch);
11721 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11722 std::string extension
11723 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11724 default_arch->flags);
11726 aarch64_last_printed_arch_string = default_arch->name + extension;
11727 aarch64_last_printed_tune_string = "";
11728 asm_fprintf (asm_out_file, "\t.arch %s\n",
11729 aarch64_last_printed_arch_string.c_str ());
11731 default_file_start ();
11734 /* Emit load exclusive. */
11736 static void
11737 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11738 rtx mem, rtx model_rtx)
11740 rtx (*gen) (rtx, rtx, rtx);
11742 switch (mode)
11744 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11745 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11746 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11747 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11748 default:
11749 gcc_unreachable ();
11752 emit_insn (gen (rval, mem, model_rtx));
11755 /* Emit store exclusive. */
11757 static void
11758 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11759 rtx rval, rtx mem, rtx model_rtx)
11761 rtx (*gen) (rtx, rtx, rtx, rtx);
11763 switch (mode)
11765 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11766 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11767 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11768 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11769 default:
11770 gcc_unreachable ();
11773 emit_insn (gen (bval, rval, mem, model_rtx));
11776 /* Emit the jump instruction INSN and mark it as very unlikely to be taken. */
11778 static void
11779 aarch64_emit_unlikely_jump (rtx insn)
11781 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11783 rtx_insn *jump = emit_jump_insn (insn);
11784 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
11787 /* Expand a compare and swap pattern. */
11789 void
11790 aarch64_expand_compare_and_swap (rtx operands[])
11792 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11793 machine_mode mode, cmp_mode;
11794 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11795 int idx;
11796 gen_cas_fn gen;
11797 const gen_cas_fn split_cas[] =
11799 gen_aarch64_compare_and_swapqi,
11800 gen_aarch64_compare_and_swaphi,
11801 gen_aarch64_compare_and_swapsi,
11802 gen_aarch64_compare_and_swapdi
11804 const gen_cas_fn atomic_cas[] =
11806 gen_aarch64_compare_and_swapqi_lse,
11807 gen_aarch64_compare_and_swaphi_lse,
11808 gen_aarch64_compare_and_swapsi_lse,
11809 gen_aarch64_compare_and_swapdi_lse
11812 bval = operands[0];
11813 rval = operands[1];
11814 mem = operands[2];
11815 oldval = operands[3];
11816 newval = operands[4];
11817 is_weak = operands[5];
11818 mod_s = operands[6];
11819 mod_f = operands[7];
11820 mode = GET_MODE (mem);
11821 cmp_mode = mode;
11823 /* Normally the succ memory model must be stronger than fail, but in the
11824 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11825 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11827 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11828 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11829 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11831 switch (mode)
11833 case QImode:
11834 case HImode:
11835 /* For short modes, we're going to perform the comparison in SImode,
11836 so do the zero-extension now. */
11837 cmp_mode = SImode;
11838 rval = gen_reg_rtx (SImode);
11839 oldval = convert_modes (SImode, mode, oldval, true);
11840 /* Fall through. */
11842 case SImode:
11843 case DImode:
11844 /* Force the value into a register if needed. */
11845 if (!aarch64_plus_operand (oldval, mode))
11846 oldval = force_reg (cmp_mode, oldval);
11847 break;
11849 default:
11850 gcc_unreachable ();
11853 switch (mode)
11855 case QImode: idx = 0; break;
11856 case HImode: idx = 1; break;
11857 case SImode: idx = 2; break;
11858 case DImode: idx = 3; break;
11859 default:
11860 gcc_unreachable ();
11862 if (TARGET_LSE)
11863 gen = atomic_cas[idx];
11864 else
11865 gen = split_cas[idx];
11867 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11869 if (mode == QImode || mode == HImode)
11870 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11872 x = gen_rtx_REG (CCmode, CC_REGNUM);
11873 x = gen_rtx_EQ (SImode, x, const0_rtx);
11874 emit_insn (gen_rtx_SET (bval, x));
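/* Illustrative expansion (the assembly is an example, not from the source):
   a strong __atomic_compare_exchange on an `int' with SEQ_CST ordering goes
   through the split path below and typically becomes an exclusive-access
   loop such as

     .L1:	ldaxr	w0, [x1]
     	cmp	w0, w2
     	bne	.L2
     	stlxr	w3, w4, [x1]
     	cbnz	w3, .L1
     .L2:

   whereas with TARGET_LSE (-march=armv8.1-a) the _lse patterns above select
   a single CASAL instruction instead.  */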
11877 /* Test whether the target supports using an atomic load-operate
11878 instruction for operation CODE. Returns FALSE if the operation
11879 isn't supported by the architecture. */
11883 bool
11884 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11886 if (!TARGET_LSE)
11887 return false;
11889 switch (code)
11891 case SET:
11892 case AND:
11893 case IOR:
11894 case XOR:
11895 case MINUS:
11896 case PLUS:
11897 return true;
11898 default:
11899 return false;
11903 /* Emit a barrier appropriate for memory model MODEL at the end of a
11904 sequence implementing an atomic operation. */
11906 static void
11907 aarch64_emit_post_barrier (enum memmodel model)
11909 const enum memmodel base_model = memmodel_base (model);
11911 if (is_mm_sync (model)
11912 && (base_model == MEMMODEL_ACQUIRE
11913 || base_model == MEMMODEL_ACQ_REL
11914 || base_model == MEMMODEL_SEQ_CST))
11916 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
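/* For example, a __sync_val_compare_and_swap (which uses a SYNC variant of
   SEQ_CST) is followed here by a full barrier, typically a DMB ISH, whereas
   plain __atomic operations get no extra trailing barrier.  */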
11920 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11921 for the data in memory. EXPECTED is the value expected to be in memory.
11922 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11923 is the memory ordering to use. */
11925 void
11926 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11927 rtx expected, rtx desired,
11928 rtx model)
11930 rtx (*gen) (rtx, rtx, rtx, rtx);
11931 machine_mode mode;
11933 mode = GET_MODE (mem);
11935 switch (mode)
11937 case QImode: gen = gen_aarch64_atomic_casqi; break;
11938 case HImode: gen = gen_aarch64_atomic_cashi; break;
11939 case SImode: gen = gen_aarch64_atomic_cassi; break;
11940 case DImode: gen = gen_aarch64_atomic_casdi; break;
11941 default:
11942 gcc_unreachable ();
11945 /* Move the expected value into the CAS destination register. */
11946 emit_insn (gen_rtx_SET (rval, expected));
11948 /* Emit the CAS. */
11949 emit_insn (gen (rval, mem, desired, model));
11951 /* Compare the expected value with the value loaded by the CAS, to establish
11952 whether the swap was made. */
11953 aarch64_gen_compare_reg (EQ, rval, expected);
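/* A sketch of the resulting sequence for a word-sized CAS with a SEQ_CST
   model (register names are illustrative and the exact CAS variant depends
   on MODEL):
	mov	w0, w_expected
	casal	w0, w_desired, [x_addr]
	cmp	w0, w_expected
   The Z flag then tells the caller whether the swap was made.  */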
11956 /* Split a compare and swap pattern. */
11958 void
11959 aarch64_split_compare_and_swap (rtx operands[])
11961 rtx rval, mem, oldval, newval, scratch;
11962 machine_mode mode;
11963 bool is_weak;
11964 rtx_code_label *label1, *label2;
11965 rtx x, cond;
11966 enum memmodel model;
11967 rtx model_rtx;
11969 rval = operands[0];
11970 mem = operands[1];
11971 oldval = operands[2];
11972 newval = operands[3];
11973 is_weak = (operands[4] != const0_rtx);
11974 model_rtx = operands[5];
11975 scratch = operands[7];
11976 mode = GET_MODE (mem);
11977 model = memmodel_from_int (INTVAL (model_rtx));
11979 label1 = NULL;
11980 if (!is_weak)
11982 label1 = gen_label_rtx ();
11983 emit_label (label1);
11985 label2 = gen_label_rtx ();
11987 /* The initial load can be relaxed for a __sync operation since a final
11988 barrier will be emitted to stop code hoisting. */
11989 if (is_mm_sync (model))
11990 aarch64_emit_load_exclusive (mode, rval, mem,
11991 GEN_INT (MEMMODEL_RELAXED));
11992 else
11993 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11995 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11996 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11997 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11998 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11999 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12001 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12003 if (!is_weak)
12005 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12006 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12007 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12008 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12010 else
12012 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12013 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12014 emit_insn (gen_rtx_SET (cond, x));
12017 emit_label (label2);
12019 /* Emit any final barrier needed for a __sync operation. */
12020 if (is_mm_sync (model))
12021 aarch64_emit_post_barrier (model);
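/* Without LSE, a strong word-sized compare-and-swap therefore expands to a
   loop roughly of this shape (the exact exclusive-access variants depend on
   MODEL; register names are illustrative):
	.L1:	ldaxr	w1, [x_addr]
		cmp	w1, w_expected
		b.ne	.L2
		stlxr	w2, w_desired, [x_addr]
		cbnz	w2, .L1
	.L2:
   A weak CAS omits the backward branch and instead tests the store-exclusive
   result in the flags.  */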
12024 /* Emit a BIC instruction. */
12026 static void
12027 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12029 rtx shift_rtx = GEN_INT (shift);
12030 rtx (*gen) (rtx, rtx, rtx, rtx);
12032 switch (mode)
12034 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12035 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12036 default:
12037 gcc_unreachable ();
12040 emit_insn (gen (dst, s2, shift_rtx, s1));
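/* This computes DST = S1 & ~(S2 >> SHIFT), i.e. a BIC with an optionally
   shifted second operand, e.g. "bic w0, w1, w2, lsr 2" (register names
   illustrative).  */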
12043 /* Emit an atomic swap. */
12045 static void
12046 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12047 rtx mem, rtx model)
12049 rtx (*gen) (rtx, rtx, rtx, rtx);
12051 switch (mode)
12053 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12054 case HImode: gen = gen_aarch64_atomic_swphi; break;
12055 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12056 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12057 default:
12058 gcc_unreachable ();
12061 emit_insn (gen (dst, mem, value, model));
12064 /* Operations supported by aarch64_emit_atomic_load_op. */
12066 enum aarch64_atomic_load_op_code
12068 AARCH64_LDOP_PLUS, /* A + B */
12069 AARCH64_LDOP_XOR, /* A ^ B */
12070 AARCH64_LDOP_OR, /* A | B */
12071 AARCH64_LDOP_BIC /* A & ~B */
12074 /* Emit an atomic load-operate. */
12076 static void
12077 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12078 machine_mode mode, rtx dst, rtx src,
12079 rtx mem, rtx model)
12081 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12082 const aarch64_atomic_load_op_fn plus[] =
12084 gen_aarch64_atomic_loadaddqi,
12085 gen_aarch64_atomic_loadaddhi,
12086 gen_aarch64_atomic_loadaddsi,
12087 gen_aarch64_atomic_loadadddi
12089 const aarch64_atomic_load_op_fn eor[] =
12091 gen_aarch64_atomic_loadeorqi,
12092 gen_aarch64_atomic_loadeorhi,
12093 gen_aarch64_atomic_loadeorsi,
12094 gen_aarch64_atomic_loadeordi
12096 const aarch64_atomic_load_op_fn ior[] =
12098 gen_aarch64_atomic_loadsetqi,
12099 gen_aarch64_atomic_loadsethi,
12100 gen_aarch64_atomic_loadsetsi,
12101 gen_aarch64_atomic_loadsetdi
12103 const aarch64_atomic_load_op_fn bic[] =
12105 gen_aarch64_atomic_loadclrqi,
12106 gen_aarch64_atomic_loadclrhi,
12107 gen_aarch64_atomic_loadclrsi,
12108 gen_aarch64_atomic_loadclrdi
12110 aarch64_atomic_load_op_fn gen;
12111 int idx = 0;
12113 switch (mode)
12115 case QImode: idx = 0; break;
12116 case HImode: idx = 1; break;
12117 case SImode: idx = 2; break;
12118 case DImode: idx = 3; break;
12119 default:
12120 gcc_unreachable ();
12123 switch (code)
12125 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12126 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12127 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12128 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12129 default:
12130 gcc_unreachable ();
12133 emit_insn (gen (dst, mem, src, model));
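/* The four tables above correspond to the LSE LDADD, LDEOR, LDSET and LDCLR
   instruction families; e.g. AARCH64_LDOP_PLUS in SImode goes through
   gen_aarch64_atomic_loadaddsi and ends up as an LDADD variant chosen
   according to MODEL (LDADD, LDADDA, LDADDL or LDADDAL).  */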
12136 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12137 location to store the data read from memory. OUT_RESULT is the location to
12138 store the result of the operation. MEM is the memory location to read and
12139 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12140 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12141 be NULL. */
12143 void
12144 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12145 rtx mem, rtx value, rtx model_rtx)
12147 machine_mode mode = GET_MODE (mem);
12148 machine_mode wmode = (mode == DImode ? DImode : SImode);
12149 const bool short_mode = (mode < SImode);
12150 aarch64_atomic_load_op_code ldop_code;
12151 rtx src;
12152 rtx x;
12154 if (out_data)
12155 out_data = gen_lowpart (mode, out_data);
12157 if (out_result)
12158 out_result = gen_lowpart (mode, out_result);
12160 /* Make sure the value is in a register, putting it into a destination
12161 register if it needs to be manipulated. */
12162 if (!register_operand (value, mode)
12163 || code == AND || code == MINUS)
12165 src = out_result ? out_result : out_data;
12166 emit_move_insn (src, gen_lowpart (mode, value));
12168 else
12169 src = value;
12170 gcc_assert (register_operand (src, mode));
12172 /* Preprocess the data for the operation as necessary. If the operation is
12173 a SET then emit a swap instruction and finish. */
12174 switch (code)
12176 case SET:
12177 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12178 return;
12180 case MINUS:
12181 /* Negate the value and treat it as a PLUS. */
12183 rtx neg_src;
12185 /* Resize the value if necessary. */
12186 if (short_mode)
12187 src = gen_lowpart (wmode, src);
12189 neg_src = gen_rtx_NEG (wmode, src);
12190 emit_insn (gen_rtx_SET (src, neg_src));
12192 if (short_mode)
12193 src = gen_lowpart (mode, src);
12195 /* Fall-through. */
12196 case PLUS:
12197 ldop_code = AARCH64_LDOP_PLUS;
12198 break;
12200 case IOR:
12201 ldop_code = AARCH64_LDOP_OR;
12202 break;
12204 case XOR:
12205 ldop_code = AARCH64_LDOP_XOR;
12206 break;
12208 case AND:
12210 rtx not_src;
12212 /* Resize the value if necessary. */
12213 if (short_mode)
12214 src = gen_lowpart (wmode, src);
12216 not_src = gen_rtx_NOT (wmode, src);
12217 emit_insn (gen_rtx_SET (src, not_src));
12219 if (short_mode)
12220 src = gen_lowpart (mode, src);
12222 ldop_code = AARCH64_LDOP_BIC;
12223 break;
12225 default:
12226 /* The operation can't be done with atomic instructions. */
12227 gcc_unreachable ();
12230 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12232 /* If necessary, calculate the data in memory after the update by redoing the
12233 operation from values in registers. */
12234 if (!out_result)
12235 return;
12237 if (short_mode)
12239 src = gen_lowpart (wmode, src);
12240 out_data = gen_lowpart (wmode, out_data);
12241 out_result = gen_lowpart (wmode, out_result);
12244 x = NULL_RTX;
12246 switch (code)
12248 case MINUS:
12249 case PLUS:
12250 x = gen_rtx_PLUS (wmode, out_data, src);
12251 break;
12252 case IOR:
12253 x = gen_rtx_IOR (wmode, out_data, src);
12254 break;
12255 case XOR:
12256 x = gen_rtx_XOR (wmode, out_data, src);
12257 break;
12258 case AND:
12259 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12260 return;
12261 default:
12262 gcc_unreachable ();
12265 emit_set_insn (out_result, x);
12267 return;
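/* As an example, with TARGET_LSE an __atomic_fetch_and on a 32-bit object is
   implemented by inverting the operand and issuing an atomic bit-clear,
   roughly:
	mvn	w1, w_mask
	ldclral	w1, w_old, [x_addr]
   since LDCLR computes *addr & ~operand.  If the value after the operation
   is also required, it is recomputed afterwards with a BIC of the loaded
   data and the inverted operand (register names and the exact LDCLR variant
   are illustrative).  */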
12270 /* Split an atomic operation. */
12272 void
12273 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12274 rtx value, rtx model_rtx, rtx cond)
12276 machine_mode mode = GET_MODE (mem);
12277 machine_mode wmode = (mode == DImode ? DImode : SImode);
12278 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12279 const bool is_sync = is_mm_sync (model);
12280 rtx_code_label *label;
12281 rtx x;
12283 /* Split the atomic operation into a sequence. */
12284 label = gen_label_rtx ();
12285 emit_label (label);
12287 if (new_out)
12288 new_out = gen_lowpart (wmode, new_out);
12289 if (old_out)
12290 old_out = gen_lowpart (wmode, old_out);
12291 else
12292 old_out = new_out;
12293 value = simplify_gen_subreg (wmode, value, mode, 0);
12295 /* The initial load can be relaxed for a __sync operation since a final
12296 barrier will be emitted to stop code hoisting. */
12297 if (is_sync)
12298 aarch64_emit_load_exclusive (mode, old_out, mem,
12299 GEN_INT (MEMMODEL_RELAXED));
12300 else
12301 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12303 switch (code)
12305 case SET:
12306 new_out = value;
12307 break;
12309 case NOT:
12310 x = gen_rtx_AND (wmode, old_out, value);
12311 emit_insn (gen_rtx_SET (new_out, x));
12312 x = gen_rtx_NOT (wmode, new_out);
12313 emit_insn (gen_rtx_SET (new_out, x));
12314 break;
12316 case MINUS:
12317 if (CONST_INT_P (value))
12319 value = GEN_INT (-INTVAL (value));
12320 code = PLUS;
12322 /* Fall through. */
12324 default:
12325 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12326 emit_insn (gen_rtx_SET (new_out, x));
12327 break;
12330 aarch64_emit_store_exclusive (mode, cond, mem,
12331 gen_lowpart (mode, new_out), model_rtx);
12333 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12334 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12335 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12336 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12338 /* Emit any final barrier needed for a __sync operation. */
12339 if (is_sync)
12340 aarch64_emit_post_barrier (model);
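/* Without LSE, an operation such as __atomic_fetch_add on a 32-bit object is
   therefore split into a load/store-exclusive loop along these lines (the
   exclusive variants depend on MODEL; register names are illustrative):
	.L1:	ldxr	w_old, [x_addr]
		add	w_new, w_old, w_val
		stxr	w_tmp, w_new, [x_addr]
		cbnz	w_tmp, .L1
   */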
12343 static void
12344 aarch64_init_libfuncs (void)
12346 /* Half-precision float operations. The compiler handles all operations
12347 with NULL libfuncs by converting to SFmode. */
12349 /* Conversions. */
12350 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12351 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12353 /* Arithmetic. */
12354 set_optab_libfunc (add_optab, HFmode, NULL);
12355 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12356 set_optab_libfunc (smul_optab, HFmode, NULL);
12357 set_optab_libfunc (neg_optab, HFmode, NULL);
12358 set_optab_libfunc (sub_optab, HFmode, NULL);
12360 /* Comparisons. */
12361 set_optab_libfunc (eq_optab, HFmode, NULL);
12362 set_optab_libfunc (ne_optab, HFmode, NULL);
12363 set_optab_libfunc (lt_optab, HFmode, NULL);
12364 set_optab_libfunc (le_optab, HFmode, NULL);
12365 set_optab_libfunc (ge_optab, HFmode, NULL);
12366 set_optab_libfunc (gt_optab, HFmode, NULL);
12367 set_optab_libfunc (unord_optab, HFmode, NULL);
12370 /* Target hook for c_mode_for_suffix. */
12371 static machine_mode
12372 aarch64_c_mode_for_suffix (char suffix)
12374 if (suffix == 'q')
12375 return TFmode;
12377 return VOIDmode;
12380 /* We can only represent floating point constants which will fit in
12381 "quarter-precision" values. These values are characterised by
12382 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given the format:
12385 (-1)^s * (n/16) * 2^r
12387 Where:
12388 's' is the sign bit.
12389 'n' is an integer in the range 16 <= n <= 31.
12390 'r' is an integer in the range -3 <= r <= 4. */
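/* For example, 0.5 = (16/16) * 2^-1 and 2.5 = (20/16) * 2^1 are representable
   (as are 1.0, 0.125 and 31.0), whereas 0.0, 32.0 and 1.0/64.0 are not.  */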
12392 /* Return true iff X can be represented by a quarter-precision
12393 floating point immediate operand. Note that we cannot represent 0.0. */
12394 bool
12395 aarch64_float_const_representable_p (rtx x)
12397 /* This represents our current view of how many bits
12398 make up the mantissa. */
12399 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12400 int exponent;
12401 unsigned HOST_WIDE_INT mantissa, mask;
12402 REAL_VALUE_TYPE r, m;
12403 bool fail;
12405 if (!CONST_DOUBLE_P (x))
12406 return false;
12408 /* We don't support HFmode constants yet. */
12409 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12410 return false;
12412 r = *CONST_DOUBLE_REAL_VALUE (x);
12414 /* We cannot represent infinities, NaNs or +/-zero. We won't
12415 know if we have +zero until we analyse the mantissa, but we
12416 can reject the other invalid values. */
12417 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12418 || REAL_VALUE_MINUS_ZERO (r))
12419 return false;
12421 /* Extract exponent. */
12422 r = real_value_abs (&r);
12423 exponent = REAL_EXP (&r);
12425 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12426 highest (sign) bit, with a fixed binary point at bit point_pos.
12427 m1 holds the low part of the mantissa, m2 the high part.
12428 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12429 bits for the mantissa, this can fail (low bits will be lost). */
12430 real_ldexp (&m, &r, point_pos - exponent);
12431 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12433 /* If the low part of the mantissa has bits set we cannot represent
12434 the value. */
12435 if (w.elt (0) != 0)
12436 return false;
12437 /* We have rejected the lower HOST_WIDE_INT, so update our
12438 understanding of how many bits lie in the mantissa and
12439 look only at the high HOST_WIDE_INT. */
12440 mantissa = w.elt (1);
12441 point_pos -= HOST_BITS_PER_WIDE_INT;
12443 /* We can only represent values with a mantissa of the form 1.xxxx. */
12444 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12445 if ((mantissa & mask) != 0)
12446 return false;
12448 /* Having filtered unrepresentable values, we may now remove all
12449 but the highest 5 bits. */
12450 mantissa >>= point_pos - 5;
12452 /* We cannot represent the value 0.0, so reject it. This is handled
12453 elsewhere. */
12454 if (mantissa == 0)
12455 return false;
12457 /* Then, as bit 4 is always set, we can mask it off, leaving
12458 the mantissa in the range [0, 15]. */
12459 mantissa &= ~(1 << 4);
12460 gcc_assert (mantissa <= 15);
12462 /* GCC internally does not use IEEE754-like encoding (where normalized
12463 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12464 Our mantissa values are shifted 4 places to the left relative to
12465 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12466 by 5 places to correct for GCC's representation. */
12467 exponent = 5 - exponent;
12469 return (exponent >= 0 && exponent <= 7);
12472 char*
12473 aarch64_output_simd_mov_immediate (rtx const_vector,
12474 machine_mode mode,
12475 unsigned width)
12477 bool is_valid;
12478 static char templ[40];
12479 const char *mnemonic;
12480 const char *shift_op;
12481 unsigned int lane_count = 0;
12482 char element_char;
12484 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12486 /* This will return true to show const_vector is legal for use as either
12487 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12488 also update INFO to show how the immediate should be generated. */
12489 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12490 gcc_assert (is_valid);
12492 element_char = sizetochar (info.element_width);
12493 lane_count = width / info.element_width;
12495 mode = GET_MODE_INNER (mode);
12496 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12498 gcc_assert (info.shift == 0 && ! info.mvn);
12499 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12500 move immediate path. */
12501 if (aarch64_float_const_zero_rtx_p (info.value))
12502 info.value = GEN_INT (0);
12503 else
12505 const unsigned int buf_size = 20;
12506 char float_buf[buf_size] = {'\0'};
12507 real_to_decimal_for_mode (float_buf,
12508 CONST_DOUBLE_REAL_VALUE (info.value),
12509 buf_size, buf_size, 1, mode);
12511 if (lane_count == 1)
12512 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12513 else
12514 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12515 lane_count, element_char, float_buf);
12516 return templ;
12520 mnemonic = info.mvn ? "mvni" : "movi";
12521 shift_op = info.msl ? "msl" : "lsl";
12523 gcc_assert (CONST_INT_P (info.value));
12524 if (lane_count == 1)
12525 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12526 mnemonic, UINTVAL (info.value));
12527 else if (info.shift)
12528 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12529 ", %s %d", mnemonic, lane_count, element_char,
12530 UINTVAL (info.value), shift_op, info.shift);
12531 else
12532 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12533 mnemonic, lane_count, element_char, UINTVAL (info.value));
12534 return templ;
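/* Typical assembly produced from these templates looks like
	movi	v0.4s, 0xab, lsl 8
	mvni	v0.8h, 0x7
	fmov	v0.2d, 1.0e+0
   though the exact operand formatting depends on the immediate that was
   matched; the above is only an illustration.  */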
12537 char*
12538 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12539 machine_mode mode)
12541 machine_mode vmode;
12543 gcc_assert (!VECTOR_MODE_P (mode));
12544 vmode = aarch64_simd_container_mode (mode, 64);
12545 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12546 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12549 /* Split operands into moves from op[1] + op[2] into op[0]. */
12551 void
12552 aarch64_split_combinev16qi (rtx operands[3])
12554 unsigned int dest = REGNO (operands[0]);
12555 unsigned int src1 = REGNO (operands[1]);
12556 unsigned int src2 = REGNO (operands[2]);
12557 machine_mode halfmode = GET_MODE (operands[1]);
12558 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12559 rtx destlo, desthi;
12561 gcc_assert (halfmode == V16QImode);
12563 if (src1 == dest && src2 == dest + halfregs)
12565 /* No-op move. Can't split to nothing; emit something. */
12566 emit_note (NOTE_INSN_DELETED);
12567 return;
12570 /* Preserve register attributes for variable tracking. */
12571 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12572 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12573 GET_MODE_SIZE (halfmode));
12575 /* Special case of reversed high/low parts. */
12576 if (reg_overlap_mentioned_p (operands[2], destlo)
12577 && reg_overlap_mentioned_p (operands[1], desthi))
12579 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12580 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12581 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12583 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12585 /* Try to avoid unnecessary moves if part of the result
12586 is in the right place already. */
12587 if (src1 != dest)
12588 emit_move_insn (destlo, operands[1]);
12589 if (src2 != dest + halfregs)
12590 emit_move_insn (desthi, operands[2]);
12592 else
12594 if (src2 != dest + halfregs)
12595 emit_move_insn (desthi, operands[2]);
12596 if (src1 != dest)
12597 emit_move_insn (destlo, operands[1]);
12601 /* vec_perm support. */
12603 #define MAX_VECT_LEN 16
12605 struct expand_vec_perm_d
12607 rtx target, op0, op1;
12608 unsigned char perm[MAX_VECT_LEN];
12609 machine_mode vmode;
12610 unsigned char nelt;
12611 bool one_vector_p;
12612 bool testing_p;
12615 /* Generate a variable permutation. */
12617 static void
12618 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12620 machine_mode vmode = GET_MODE (target);
12621 bool one_vector_p = rtx_equal_p (op0, op1);
12623 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12624 gcc_checking_assert (GET_MODE (op0) == vmode);
12625 gcc_checking_assert (GET_MODE (op1) == vmode);
12626 gcc_checking_assert (GET_MODE (sel) == vmode);
12627 gcc_checking_assert (TARGET_SIMD);
12629 if (one_vector_p)
12631 if (vmode == V8QImode)
12633 /* Expand the argument to a V16QI mode by duplicating it. */
12634 rtx pair = gen_reg_rtx (V16QImode);
12635 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12636 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12638 else
12640 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12643 else
12645 rtx pair;
12647 if (vmode == V8QImode)
12649 pair = gen_reg_rtx (V16QImode);
12650 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12651 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12653 else
12655 pair = gen_reg_rtx (OImode);
12656 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12657 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12662 void
12663 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12665 machine_mode vmode = GET_MODE (target);
12666 unsigned int nelt = GET_MODE_NUNITS (vmode);
12667 bool one_vector_p = rtx_equal_p (op0, op1);
12668 rtx mask;
12670 /* The TBL instruction does not use a modulo index, so we must take care
12671 of that ourselves. */
12672 mask = aarch64_simd_gen_const_vector_dup (vmode,
12673 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12674 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12676 /* For big-endian, we also need to reverse the index within the vector
12677 (but not which vector). */
12678 if (BYTES_BIG_ENDIAN)
12680 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12681 if (!one_vector_p)
12682 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12683 sel = expand_simple_binop (vmode, XOR, sel, mask,
12684 NULL, 0, OPTAB_LIB_WIDEN);
12686 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12689 /* Recognize patterns suitable for the TRN instructions. */
12690 static bool
12691 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12693 unsigned int i, odd, mask, nelt = d->nelt;
12694 rtx out, in0, in1, x;
12695 rtx (*gen) (rtx, rtx, rtx);
12696 machine_mode vmode = d->vmode;
12698 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12699 return false;
12701 /* Note that these are little-endian tests.
12702 We correct for big-endian later. */
12703 if (d->perm[0] == 0)
12704 odd = 0;
12705 else if (d->perm[0] == 1)
12706 odd = 1;
12707 else
12708 return false;
12709 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12711 for (i = 0; i < nelt; i += 2)
12713 if (d->perm[i] != i + odd)
12714 return false;
12715 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12716 return false;
12719 /* Success! */
12720 if (d->testing_p)
12721 return true;
12723 in0 = d->op0;
12724 in1 = d->op1;
12725 if (BYTES_BIG_ENDIAN)
12727 x = in0, in0 = in1, in1 = x;
12728 odd = !odd;
12730 out = d->target;
12732 if (odd)
12734 switch (vmode)
12736 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12737 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12738 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12739 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12740 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12741 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12742 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12743 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12744 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12745 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12746 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12747 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12748 default:
12749 return false;
12752 else
12754 switch (vmode)
12756 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12757 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12758 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12759 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12760 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12761 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12762 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12763 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12764 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12765 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12766 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12767 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12768 default:
12769 return false;
12773 emit_insn (gen (out, in0, in1));
12774 return true;
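/* As a concrete example, for V4SImode with two input vectors the permutation
   {0, 4, 2, 6} selects TRN1 and {1, 5, 3, 7} selects TRN2 (before any
   big-endian adjustment).  */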
12777 /* Recognize patterns suitable for the UZP instructions. */
12778 static bool
12779 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12781 unsigned int i, odd, mask, nelt = d->nelt;
12782 rtx out, in0, in1, x;
12783 rtx (*gen) (rtx, rtx, rtx);
12784 machine_mode vmode = d->vmode;
12786 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12787 return false;
12789 /* Note that these are little-endian tests.
12790 We correct for big-endian later. */
12791 if (d->perm[0] == 0)
12792 odd = 0;
12793 else if (d->perm[0] == 1)
12794 odd = 1;
12795 else
12796 return false;
12797 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12799 for (i = 0; i < nelt; i++)
12801 unsigned elt = (i * 2 + odd) & mask;
12802 if (d->perm[i] != elt)
12803 return false;
12806 /* Success! */
12807 if (d->testing_p)
12808 return true;
12810 in0 = d->op0;
12811 in1 = d->op1;
12812 if (BYTES_BIG_ENDIAN)
12814 x = in0, in0 = in1, in1 = x;
12815 odd = !odd;
12817 out = d->target;
12819 if (odd)
12821 switch (vmode)
12823 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12824 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12825 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12826 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12827 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12828 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12829 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12830 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12831 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12832 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12833 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12834 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12835 default:
12836 return false;
12839 else
12841 switch (vmode)
12843 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12844 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12845 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12846 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12847 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12848 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12849 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12850 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12851 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12852 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12853 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12854 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12855 default:
12856 return false;
12860 emit_insn (gen (out, in0, in1));
12861 return true;
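/* For V4SImode with two input vectors, the permutation {0, 2, 4, 6} selects
   UZP1 and {1, 3, 5, 7} selects UZP2 (before any big-endian adjustment).  */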
12864 /* Recognize patterns suitable for the ZIP instructions. */
12865 static bool
12866 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12868 unsigned int i, high, mask, nelt = d->nelt;
12869 rtx out, in0, in1, x;
12870 rtx (*gen) (rtx, rtx, rtx);
12871 machine_mode vmode = d->vmode;
12873 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12874 return false;
12876 /* Note that these are little-endian tests.
12877 We correct for big-endian later. */
12878 high = nelt / 2;
12879 if (d->perm[0] == high)
12880 /* Do Nothing. */
12882 else if (d->perm[0] == 0)
12883 high = 0;
12884 else
12885 return false;
12886 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12888 for (i = 0; i < nelt / 2; i++)
12890 unsigned elt = (i + high) & mask;
12891 if (d->perm[i * 2] != elt)
12892 return false;
12893 elt = (elt + nelt) & mask;
12894 if (d->perm[i * 2 + 1] != elt)
12895 return false;
12898 /* Success! */
12899 if (d->testing_p)
12900 return true;
12902 in0 = d->op0;
12903 in1 = d->op1;
12904 if (BYTES_BIG_ENDIAN)
12906 x = in0, in0 = in1, in1 = x;
12907 high = !high;
12909 out = d->target;
12911 if (high)
12913 switch (vmode)
12915 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12916 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12917 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12918 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12919 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12920 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12921 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12922 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12923 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12924 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12925 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12926 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12927 default:
12928 return false;
12931 else
12933 switch (vmode)
12935 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12936 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12937 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12938 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12939 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12940 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12941 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12942 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12943 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12944 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12945 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12946 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12947 default:
12948 return false;
12952 emit_insn (gen (out, in0, in1));
12953 return true;
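/* For V4SImode with two input vectors, the permutation {0, 4, 1, 5} selects
   ZIP1 and {2, 6, 3, 7} selects ZIP2 (before any big-endian adjustment).  */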
12956 /* Recognize patterns for the EXT insn. */
12958 static bool
12959 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12961 unsigned int i, nelt = d->nelt;
12962 rtx (*gen) (rtx, rtx, rtx, rtx);
12963 rtx offset;
12965 unsigned int location = d->perm[0]; /* Always < nelt. */
12967 /* Check if the extracted indices are increasing by one. */
12968 for (i = 1; i < nelt; i++)
12970 unsigned int required = location + i;
12971 if (d->one_vector_p)
12973 /* We'll pass the same vector in twice, so allow indices to wrap. */
12974 required &= (nelt - 1);
12976 if (d->perm[i] != required)
12977 return false;
12980 switch (d->vmode)
12982 case V16QImode: gen = gen_aarch64_extv16qi; break;
12983 case V8QImode: gen = gen_aarch64_extv8qi; break;
12984 case V4HImode: gen = gen_aarch64_extv4hi; break;
12985 case V8HImode: gen = gen_aarch64_extv8hi; break;
12986 case V2SImode: gen = gen_aarch64_extv2si; break;
12987 case V4SImode: gen = gen_aarch64_extv4si; break;
12988 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12989 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12990 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12991 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12992 case V2DImode: gen = gen_aarch64_extv2di; break;
12993 case V2DFmode: gen = gen_aarch64_extv2df; break;
12994 default:
12995 return false;
12998 /* Success! */
12999 if (d->testing_p)
13000 return true;
13002 /* The case where (location == 0) is a no-op for both big- and little-endian,
13003 and is removed by the mid-end at optimization levels -O1 and higher. */
13005 if (BYTES_BIG_ENDIAN && (location != 0))
13007 /* After setup, we want the high elements of the first vector (stored
13008 at the LSB end of the register), and the low elements of the second
13009 vector (stored at the MSB end of the register). So swap. */
13010 std::swap (d->op0, d->op1);
13011 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13012 location = nelt - location;
13015 offset = GEN_INT (location);
13016 emit_insn (gen (d->target, d->op0, d->op1, offset));
13017 return true;
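/* For example, on little-endian the V4SImode permutation {1, 2, 3, 4}
   (indices running consecutively from the first vector into the second) is
   matched here and emitted as an EXT of the two inputs with element
   offset 1.  */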
13020 /* Recognize patterns for the REV insns. */
13022 static bool
13023 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13025 unsigned int i, j, diff, nelt = d->nelt;
13026 rtx (*gen) (rtx, rtx);
13028 if (!d->one_vector_p)
13029 return false;
13031 diff = d->perm[0];
13032 switch (diff)
13034 case 7:
13035 switch (d->vmode)
13037 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13038 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13039 default:
13040 return false;
13042 break;
13043 case 3:
13044 switch (d->vmode)
13046 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13047 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13048 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13049 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13050 default:
13051 return false;
13053 break;
13054 case 1:
13055 switch (d->vmode)
13057 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13058 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13059 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13060 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13061 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13062 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13063 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13064 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13065 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13066 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13067 default:
13068 return false;
13070 break;
13071 default:
13072 return false;
13075 for (i = 0; i < nelt ; i += diff + 1)
13076 for (j = 0; j <= diff; j += 1)
13078 /* This is guaranteed to be true as the value of diff
13079 is 7, 3, 1 and we should have enough elements in the
13080 queue to generate this. Getting a vector mask with a
13081 value of diff other than these values implies that
13082 something is wrong by the time we get here. */
13083 gcc_assert (i + j < nelt);
13084 if (d->perm[i + j] != i + diff - j)
13085 return false;
13088 /* Success! */
13089 if (d->testing_p)
13090 return true;
13092 emit_insn (gen (d->target, d->op0));
13093 return true;
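/* The value of diff selects the REV variant; e.g. for V8HImode the
   permutation {3, 2, 1, 0, 7, 6, 5, 4} has diff == 3 and maps to REV64,
   while {1, 0, 3, 2, 5, 4, 7, 6} has diff == 1 and maps to REV32.  */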
13096 static bool
13097 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13099 rtx (*gen) (rtx, rtx, rtx);
13100 rtx out = d->target;
13101 rtx in0;
13102 machine_mode vmode = d->vmode;
13103 unsigned int i, elt, nelt = d->nelt;
13104 rtx lane;
13106 elt = d->perm[0];
13107 for (i = 1; i < nelt; i++)
13109 if (elt != d->perm[i])
13110 return false;
13113 /* The generic preparation in aarch64_expand_vec_perm_const_1
13114 swaps the operand order and the permute indices if it finds
13115 d->perm[0] to be in the second operand. Thus, we can always
13116 use d->op0 and need not do any extra arithmetic to get the
13117 correct lane number. */
13118 in0 = d->op0;
13119 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13121 switch (vmode)
13123 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13124 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13125 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13126 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13127 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13128 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13129 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13130 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13131 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13132 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13133 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13134 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13135 default:
13136 return false;
13139 emit_insn (gen (out, in0, lane));
13140 return true;
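/* For instance, a V4SImode permutation in which every index is 2, i.e.
   {2, 2, 2, 2}, is emitted as something like "dup v0.4s, v1.s[2]" (register
   names illustrative; the pattern itself corrects the lane for
   big-endian).  */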
13143 static bool
13144 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13146 rtx rperm[MAX_VECT_LEN], sel;
13147 machine_mode vmode = d->vmode;
13148 unsigned int i, nelt = d->nelt;
13150 if (d->testing_p)
13151 return true;
13153 /* Generic code will try constant permutation twice. Once with the
13154 original mode and again with the elements lowered to QImode.
13155 So wait and don't do the selector expansion ourselves. */
13156 if (vmode != V8QImode && vmode != V16QImode)
13157 return false;
13159 for (i = 0; i < nelt; ++i)
13161 int nunits = GET_MODE_NUNITS (vmode);
13163 /* If big-endian and two vectors we end up with a weird mixed-endian
13164 mode on NEON. Reverse the index within each word but not the word
13165 itself. */
13166 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13167 : d->perm[i]);
13169 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13170 sel = force_reg (vmode, sel);
13172 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13173 return true;
13176 static bool
13177 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13179 /* The pattern matching functions above are written to look for a small
13180 number to begin the sequence (0, 1, N/2). If we begin with an index
13181 from the second operand, we can swap the operands. */
13182 if (d->perm[0] >= d->nelt)
13184 unsigned i, nelt = d->nelt;
13186 gcc_assert (nelt == (nelt & -nelt));
13187 for (i = 0; i < nelt; ++i)
13188 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13190 std::swap (d->op0, d->op1);
13193 if (TARGET_SIMD)
13195 if (aarch64_evpc_rev (d))
13196 return true;
13197 else if (aarch64_evpc_ext (d))
13198 return true;
13199 else if (aarch64_evpc_dup (d))
13200 return true;
13201 else if (aarch64_evpc_zip (d))
13202 return true;
13203 else if (aarch64_evpc_uzp (d))
13204 return true;
13205 else if (aarch64_evpc_trn (d))
13206 return true;
13207 return aarch64_evpc_tbl (d);
13209 return false;
13212 /* Expand a vec_perm_const pattern. */
13214 bool
13215 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13217 struct expand_vec_perm_d d;
13218 int i, nelt, which;
13220 d.target = target;
13221 d.op0 = op0;
13222 d.op1 = op1;
13224 d.vmode = GET_MODE (target);
13225 gcc_assert (VECTOR_MODE_P (d.vmode));
13226 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13227 d.testing_p = false;
13229 for (i = which = 0; i < nelt; ++i)
13231 rtx e = XVECEXP (sel, 0, i);
13232 int ei = INTVAL (e) & (2 * nelt - 1);
13233 which |= (ei < nelt ? 1 : 2);
13234 d.perm[i] = ei;
13237 switch (which)
13239 default:
13240 gcc_unreachable ();
13242 case 3:
13243 d.one_vector_p = false;
13244 if (!rtx_equal_p (op0, op1))
13245 break;
13247 /* The elements of PERM do not suggest that only the first operand
13248 is used, but both operands are identical. Allow easier matching
13249 of the permutation by folding the permutation into the single
13250 input vector. */
13251 /* Fall Through. */
13252 case 2:
13253 for (i = 0; i < nelt; ++i)
13254 d.perm[i] &= nelt - 1;
13255 d.op0 = op1;
13256 d.one_vector_p = true;
13257 break;
13259 case 1:
13260 d.op1 = op0;
13261 d.one_vector_p = true;
13262 break;
13265 return aarch64_expand_vec_perm_const_1 (&d);
13268 static bool
13269 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13270 const unsigned char *sel)
13272 struct expand_vec_perm_d d;
13273 unsigned int i, nelt, which;
13274 bool ret;
13276 d.vmode = vmode;
13277 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13278 d.testing_p = true;
13279 memcpy (d.perm, sel, nelt);
13281 /* Calculate whether all elements are in one vector. */
13282 for (i = which = 0; i < nelt; ++i)
13284 unsigned char e = d.perm[i];
13285 gcc_assert (e < 2 * nelt);
13286 which |= (e < nelt ? 1 : 2);
13289 /* If all elements are from the second vector, reindex as if from the
13290 first vector. */
13291 if (which == 2)
13292 for (i = 0; i < nelt; ++i)
13293 d.perm[i] -= nelt;
13295 /* Check whether the mask can be applied to a single vector. */
13296 d.one_vector_p = (which != 3);
13298 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13299 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13300 if (!d.one_vector_p)
13301 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13303 start_sequence ();
13304 ret = aarch64_expand_vec_perm_const_1 (&d);
13305 end_sequence ();
13307 return ret;
13311 rtx aarch64_reverse_mask (enum machine_mode mode)
13313 /* We have to reverse each vector because we don't have
13314 a permuted load that can reverse-load according to ABI rules. */
13315 rtx mask;
13316 rtvec v = rtvec_alloc (16);
13317 int i, j;
13318 int nunits = GET_MODE_NUNITS (mode);
13319 int usize = GET_MODE_UNIT_SIZE (mode);
13321 gcc_assert (BYTES_BIG_ENDIAN);
13322 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13324 for (i = 0; i < nunits; i++)
13325 for (j = 0; j < usize; j++)
13326 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13327 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13328 return force_reg (V16QImode, mask);
13331 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13332 However due to issues with register allocation it is preferable to avoid
13333 tying integer scalar and FP scalar modes. Executing integer operations
13334 in general registers is better than treating them as scalar vector
13335 operations. This reduces latency and avoids redundant int<->FP moves.
13336 So tie modes if they are either the same class, or vector modes with
13337 other vector modes, vector structs or any scalar mode.
13340 bool
13341 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13343 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13344 return true;
13346 /* We specifically want to allow elements of "structure" modes to
13347 be tieable to the structure. This more general condition allows
13348 other rarer situations too. */
13349 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13350 return true;
13352 /* Also allow any scalar modes with vectors. */
13353 if (aarch64_vector_mode_supported_p (mode1)
13354 || aarch64_vector_mode_supported_p (mode2))
13355 return true;
13357 return false;
13360 /* Return a new RTX holding the result of moving POINTER forward by
13361 AMOUNT bytes. */
13363 static rtx
13364 aarch64_move_pointer (rtx pointer, int amount)
13366 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13368 return adjust_automodify_address (pointer, GET_MODE (pointer),
13369 next, amount);
13372 /* Return a new RTX holding the result of moving POINTER forward by the
13373 size of the mode it points to. */
13375 static rtx
13376 aarch64_progress_pointer (rtx pointer)
13378 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13380 return aarch64_move_pointer (pointer, amount);
13383 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13384 MODE bytes. */
13386 static void
13387 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13388 machine_mode mode)
13390 rtx reg = gen_reg_rtx (mode);
13392 /* "Cast" the pointers to the correct mode. */
13393 *src = adjust_address (*src, mode, 0);
13394 *dst = adjust_address (*dst, mode, 0);
13395 /* Emit the memcpy. */
13396 emit_move_insn (reg, *src);
13397 emit_move_insn (*dst, reg);
13398 /* Move the pointers forward. */
13399 *src = aarch64_progress_pointer (*src);
13400 *dst = aarch64_progress_pointer (*dst);
13403 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13404 we succeed, otherwise return false. */
13406 bool
13407 aarch64_expand_movmem (rtx *operands)
13409 unsigned int n;
13410 rtx dst = operands[0];
13411 rtx src = operands[1];
13412 rtx base;
13413 bool speed_p = !optimize_function_for_size_p (cfun);
13415 /* When optimizing for size, give a better estimate of the length of a
13416 memcpy call, but use the default otherwise. */
13417 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13419 /* We can't do anything smart if the amount to copy is not constant. */
13420 if (!CONST_INT_P (operands[2]))
13421 return false;
13423 n = UINTVAL (operands[2]);
13425 /* Try to keep the number of instructions low. For cases below 16 bytes we
13426 need to make at most two moves. For cases above 16 bytes it will be one
13427 move for each 16 byte chunk, then at most two additional moves. */
13428 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13429 return false;
13431 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13432 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13434 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13435 src = adjust_automodify_address (src, VOIDmode, base, 0);
13437 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13438 1-byte chunk. */
13439 if (n < 4)
13441 if (n >= 2)
13443 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13444 n -= 2;
13447 if (n == 1)
13448 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13450 return true;
13453 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13454 4-byte chunk, partially overlapping with the previously copied chunk. */
13455 if (n < 8)
13457 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13458 n -= 4;
13459 if (n > 0)
13461 int move = n - 4;
13463 src = aarch64_move_pointer (src, move);
13464 dst = aarch64_move_pointer (dst, move);
13465 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13467 return true;
13470 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13471 them, then (if applicable) an 8-byte chunk. */
13472 while (n >= 8)
13474 if (n / 16)
13476 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13477 n -= 16;
13479 else
13481 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13482 n -= 8;
13486 /* Finish the final bytes of the copy. We can always do this in one
13487 instruction. We either copy the exact amount we need, or partially
13488 overlap with the previous chunk we copied and copy 8 bytes. */
13489 if (n == 0)
13490 return true;
13491 else if (n == 1)
13492 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13493 else if (n == 2)
13494 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13495 else if (n == 4)
13496 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13497 else
13499 if (n == 3)
13501 src = aarch64_move_pointer (src, -1);
13502 dst = aarch64_move_pointer (dst, -1);
13503 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13505 else
13507 int move = n - 8;
13509 src = aarch64_move_pointer (src, move);
13510 dst = aarch64_move_pointer (dst, move);
13511 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13515 return true;
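/* As a worked example, a constant 15-byte copy takes the DImode branch of
   the loop once (8 bytes, leaving n == 7) and then falls into the final else
   branch, which moves both pointers back by one byte (move == -1) and copies
   a second, overlapping 8-byte chunk, so the whole copy is two 8-byte loads
   and two 8-byte stores.  */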
13518 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13519 SImode stores. Handle the case when the constant has identical
13520 bottom and top halves. This is beneficial when the two stores can be
13521 merged into an STP and we avoid synthesising potentially expensive
13522 immediates twice. Return true if such a split is possible. */
13524 bool
13525 aarch64_split_dimode_const_store (rtx dst, rtx src)
13527 rtx lo = gen_lowpart (SImode, src);
13528 rtx hi = gen_highpart_mode (SImode, DImode, src);
13530 bool size_p = optimize_function_for_size_p (cfun);
13532 if (!rtx_equal_p (lo, hi))
13533 return false;
13535 unsigned int orig_cost
13536 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13537 unsigned int lo_cost
13538 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13540 /* We want to transform:
13541 MOV x1, 49370
13542 MOVK x1, 0x140, lsl 16
13543 MOVK x1, 0xc0da, lsl 32
13544 MOVK x1, 0x140, lsl 48
13545 STR x1, [x0]
13546 into:
13547 MOV w1, 49370
13548 MOVK w1, 0x140, lsl 16
13549 STP w1, w1, [x0]
13550 So we want to perform this only when we save two instructions
13551 or more. When optimizing for size, however, accept any code size
13552 savings we can. */
13553 if (size_p && orig_cost <= lo_cost)
13554 return false;
13556 if (!size_p
13557 && (orig_cost <= lo_cost + 1))
13558 return false;
13560 rtx mem_lo = adjust_address (dst, SImode, 0);
13561 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13562 return false;
13564 rtx tmp_reg = gen_reg_rtx (SImode);
13565 aarch64_expand_mov_immediate (tmp_reg, lo);
13566 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13567 /* Don't emit an explicit store pair as this may not always be profitable.
13568 Let the sched-fusion logic decide whether to merge them. */
13569 emit_move_insn (mem_lo, tmp_reg);
13570 emit_move_insn (mem_hi, tmp_reg);
13572 return true;
13575 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13577 static unsigned HOST_WIDE_INT
13578 aarch64_asan_shadow_offset (void)
13580 return (HOST_WIDE_INT_1 << 36);
13583 static bool
13584 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13585 unsigned int align,
13586 enum by_pieces_operation op,
13587 bool speed_p)
13589 /* STORE_BY_PIECES can be used when copying a constant string, but
13590 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13591 For now we always fail this and let the move_by_pieces code copy
13592 the string from read-only memory. */
13593 if (op == STORE_BY_PIECES)
13594 return false;
13596 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13599 static rtx
13600 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13601 int code, tree treeop0, tree treeop1)
13603 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13604 rtx op0, op1;
13605 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13606 insn_code icode;
13607 struct expand_operand ops[4];
13609 start_sequence ();
13610 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13612 op_mode = GET_MODE (op0);
13613 if (op_mode == VOIDmode)
13614 op_mode = GET_MODE (op1);
13616 switch (op_mode)
13618 case QImode:
13619 case HImode:
13620 case SImode:
13621 cmp_mode = SImode;
13622 icode = CODE_FOR_cmpsi;
13623 break;
13625 case DImode:
13626 cmp_mode = DImode;
13627 icode = CODE_FOR_cmpdi;
13628 break;
13630 case SFmode:
13631 cmp_mode = SFmode;
13632 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13633 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13634 break;
13636 case DFmode:
13637 cmp_mode = DFmode;
13638 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13639 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13640 break;
13642 default:
13643 end_sequence ();
13644 return NULL_RTX;
13647 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13648 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13649 if (!op0 || !op1)
13651 end_sequence ();
13652 return NULL_RTX;
13654 *prep_seq = get_insns ();
13655 end_sequence ();
13657 create_fixed_operand (&ops[0], op0);
13658 create_fixed_operand (&ops[1], op1);
13660 start_sequence ();
13661 if (!maybe_expand_insn (icode, 2, ops))
13663 end_sequence ();
13664 return NULL_RTX;
13666 *gen_seq = get_insns ();
13667 end_sequence ();
13669 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13670 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13673 static rtx
13674 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13675 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13677 rtx op0, op1, target;
13678 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13679 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13680 insn_code icode;
13681 struct expand_operand ops[6];
13682 int aarch64_cond;
13684 push_to_sequence (*prep_seq);
13685 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13687 op_mode = GET_MODE (op0);
13688 if (op_mode == VOIDmode)
13689 op_mode = GET_MODE (op1);
13691 switch (op_mode)
13693 case QImode:
13694 case HImode:
13695 case SImode:
13696 cmp_mode = SImode;
13697 icode = CODE_FOR_ccmpsi;
13698 break;
13700 case DImode:
13701 cmp_mode = DImode;
13702 icode = CODE_FOR_ccmpdi;
13703 break;
13705 case SFmode:
13706 cmp_mode = SFmode;
13707 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13708 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13709 break;
13711 case DFmode:
13712 cmp_mode = DFmode;
13713 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13714 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13715 break;
13717 default:
13718 end_sequence ();
13719 return NULL_RTX;
13722 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13723 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13724 if (!op0 || !op1)
13726 end_sequence ();
13727 return NULL_RTX;
13729 *prep_seq = get_insns ();
13730 end_sequence ();
13732 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13733 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13735 if (bit_code != AND)
13737 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13738 GET_MODE (XEXP (prev, 0))),
13739 VOIDmode, XEXP (prev, 0), const0_rtx);
13740 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13743 create_fixed_operand (&ops[0], XEXP (prev, 0));
13744 create_fixed_operand (&ops[1], target);
13745 create_fixed_operand (&ops[2], op0);
13746 create_fixed_operand (&ops[3], op1);
13747 create_fixed_operand (&ops[4], prev);
13748 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13750 push_to_sequence (*gen_seq);
13751 if (!maybe_expand_insn (icode, 6, ops))
13753 end_sequence ();
13754 return NULL_RTX;
13757 *gen_seq = get_insns ();
13758 end_sequence ();
13760 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
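/* Together these two hooks let a condition such as "a == 0 && b > 5" be
   expanded branchlessly, roughly as (register names and the exact #nzcv
   immediate are illustrative):
	cmp	w_a, 0
	ccmp	w_b, 5, 4, eq
	cset	w_res, gt
   where the #nzcv value encodes the flag setting to use when the first test
   has already failed.  */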
13763 #undef TARGET_GEN_CCMP_FIRST
13764 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13766 #undef TARGET_GEN_CCMP_NEXT
13767 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13769 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13770 instruction fusion of some sort. */
13772 static bool
13773 aarch64_macro_fusion_p (void)
13775 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13779 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13780 should be kept together during scheduling. */
13782 static bool
13783 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13785 rtx set_dest;
13786 rtx prev_set = single_set (prev);
13787 rtx curr_set = single_set (curr);
13788 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13789 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13791 if (!aarch64_macro_fusion_p ())
13792 return false;
13794 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13796 /* We are trying to match:
13797 prev (mov) == (set (reg r0) (const_int imm16))
13798 curr (movk) == (set (zero_extract (reg r0)
13799 (const_int 16)
13800 (const_int 16))
13801 (const_int imm16_1)) */
13803 set_dest = SET_DEST (curr_set);
13805 if (GET_CODE (set_dest) == ZERO_EXTRACT
13806 && CONST_INT_P (SET_SRC (curr_set))
13807 && CONST_INT_P (SET_SRC (prev_set))
13808 && CONST_INT_P (XEXP (set_dest, 2))
13809 && INTVAL (XEXP (set_dest, 2)) == 16
13810 && REG_P (XEXP (set_dest, 0))
13811 && REG_P (SET_DEST (prev_set))
13812 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13814 return true;
13818 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13821 /* We're trying to match:
13822 prev (adrp) == (set (reg r1)
13823 (high (symbol_ref ("SYM"))))
13824 curr (add) == (set (reg r0)
13825 (lo_sum (reg r1)
13826 (symbol_ref ("SYM"))))
13827 Note that r0 need not necessarily be the same as r1, especially
13828 during pre-regalloc scheduling. */
13830 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13831 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13833 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13834 && REG_P (XEXP (SET_SRC (curr_set), 0))
13835 && REGNO (XEXP (SET_SRC (curr_set), 0))
13836 == REGNO (SET_DEST (prev_set))
13837 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13838 XEXP (SET_SRC (curr_set), 1)))
13839 return true;
13843 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13846 /* We're trying to match:
13847 prev (movk) == (set (zero_extract (reg r0)
13848 (const_int 16)
13849 (const_int 32))
13850 (const_int imm16_1))
13851 curr (movk) == (set (zero_extract (reg r0)
13852 (const_int 16)
13853 (const_int 48))
13854 (const_int imm16_2)) */
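/* For example (illustrative; the register and immediates are assumed):
     movk x0, 0xdead, lsl 32
     movk x0, 0xbeef, lsl 48  */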
13856 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13857 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13858 && REG_P (XEXP (SET_DEST (prev_set), 0))
13859 && REG_P (XEXP (SET_DEST (curr_set), 0))
13860 && REGNO (XEXP (SET_DEST (prev_set), 0))
13861 == REGNO (XEXP (SET_DEST (curr_set), 0))
13862 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13863 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13864 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13865 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13866 && CONST_INT_P (SET_SRC (prev_set))
13867 && CONST_INT_P (SET_SRC (curr_set)))
13868 return true;
13871 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13873 /* We're trying to match:
13874 prev (adrp) == (set (reg r0)
13875 (high (symbol_ref ("SYM"))))
13876 curr (ldr) == (set (reg r1)
13877 (mem (lo_sum (reg r0)
13878 (symbol_ref ("SYM")))))
13880 curr (ldr) == (set (reg r1)
13881 (zero_extend (mem
13882 (lo_sum (reg r0)
13883 (symbol_ref ("SYM")))))) */
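/* For example (illustrative; register numbers are assumed):
     adrp x0, SYM
     ldr  x1, [x0, :lo12:SYM]  */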
13884 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13885 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13887 rtx curr_src = SET_SRC (curr_set);
13889 if (GET_CODE (curr_src) == ZERO_EXTEND)
13890 curr_src = XEXP (curr_src, 0);
13892 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13893 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13894 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13895 == REGNO (SET_DEST (prev_set))
13896 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13897 XEXP (SET_SRC (prev_set), 0)))
13898 return true;
13902 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13903 && aarch_crypto_can_dual_issue (prev, curr))
13904 return true;
13906 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13907 && any_condjump_p (curr))
13909 enum attr_type prev_type = get_attr_type (prev);
13911 /* FIXME: this misses some instructions that ThunderX considers simple
13912 arithmetic; simple shifts are also missed here. */
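/* For example (illustrative; registers and label are assumed), a fusible
   pair here is a flag-setting ALU instruction followed by a conditional
   branch:
     cmp  w0, w1
     b.ne .Lfoo  */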
13913 if (prev_type == TYPE_ALUS_SREG
13914 || prev_type == TYPE_ALUS_IMM
13915 || prev_type == TYPE_LOGICS_REG
13916 || prev_type == TYPE_LOGICS_IMM)
13917 return true;
13920 return false;
13923 /* Return true iff the instruction fusion described by OP is enabled. */
13925 bool
13926 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13928 return (aarch64_tune_params.fusible_ops & op) != 0;
13931 /* If MEM is in the form [base+offset], extract the two parts of the
13932 address into BASE and OFFSET and return true; otherwise clear BASE
13933 and OFFSET and return false. */
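/* For example (illustrative): an address of the form (plus (reg x1)
   (const_int 16)) gives BASE == x1 and OFFSET == 16, while a bare
   (reg x1) gives BASE == x1 and OFFSET == 0.  */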
13935 bool
13936 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13938 rtx addr;
13940 gcc_assert (MEM_P (mem));
13942 addr = XEXP (mem, 0);
13944 if (REG_P (addr))
13946 *base = addr;
13947 *offset = const0_rtx;
13948 return true;
13951 if (GET_CODE (addr) == PLUS
13952 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13954 *base = XEXP (addr, 0);
13955 *offset = XEXP (addr, 1);
13956 return true;
13959 *base = NULL_RTX;
13960 *offset = NULL_RTX;
13962 return false;
13965 /* Types for scheduling fusion. */
13966 enum sched_fusion_type
13968 SCHED_FUSION_NONE = 0,
13969 SCHED_FUSION_LD_SIGN_EXTEND,
13970 SCHED_FUSION_LD_ZERO_EXTEND,
13971 SCHED_FUSION_LD,
13972 SCHED_FUSION_ST,
13973 SCHED_FUSION_NUM
13976 /* If INSN is a load or store whose address is in the form [base+offset],
13977 extract the two parts into BASE and OFFSET. Return the scheduling
13978 fusion type of INSN. */
13980 static enum sched_fusion_type
13981 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13983 rtx x, dest, src;
13984 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13986 gcc_assert (INSN_P (insn));
13987 x = PATTERN (insn);
13988 if (GET_CODE (x) != SET)
13989 return SCHED_FUSION_NONE;
13991 src = SET_SRC (x);
13992 dest = SET_DEST (x);
13994 machine_mode dest_mode = GET_MODE (dest);
13996 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13997 return SCHED_FUSION_NONE;
13999 if (GET_CODE (src) == SIGN_EXTEND)
14001 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14002 src = XEXP (src, 0);
14003 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14004 return SCHED_FUSION_NONE;
14006 else if (GET_CODE (src) == ZERO_EXTEND)
14008 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14009 src = XEXP (src, 0);
14010 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14011 return SCHED_FUSION_NONE;
14014 if (GET_CODE (src) == MEM && REG_P (dest))
14015 extract_base_offset_in_addr (src, base, offset);
14016 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14018 fusion = SCHED_FUSION_ST;
14019 extract_base_offset_in_addr (dest, base, offset);
14021 else
14022 return SCHED_FUSION_NONE;
14024 if (*base == NULL_RTX || *offset == NULL_RTX)
14025 fusion = SCHED_FUSION_NONE;
14027 return fusion;
14030 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14032 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14033 and PRI are only calculated for these instructions. For other instructions,
14034 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion
14035 of other instruction types can be added by returning different priorities.
14037 It's important that irrelevant instructions get the largest FUSION_PRI. */
14039 static void
14040 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14041 int *fusion_pri, int *pri)
14043 int tmp, off_val;
14044 rtx base, offset;
14045 enum sched_fusion_type fusion;
14047 gcc_assert (INSN_P (insn));
14049 tmp = max_pri - 1;
14050 fusion = fusion_load_store (insn, &base, &offset);
14051 if (fusion == SCHED_FUSION_NONE)
14053 *pri = tmp;
14054 *fusion_pri = tmp;
14055 return;
14058 /* Set FUSION_PRI according to fusion type and base register. */
14059 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
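/* Note (illustrative): FUSION_PRI depends only on the fusion type and the
   base register, so e.g. two SCHED_FUSION_LD loads from [x1, 8] and
   [x1, 16] receive the same FUSION_PRI and the scheduler tries to keep
   them together; PRI below then orders them by offset.  */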
14061 /* Calculate PRI. */
14062 tmp /= 2;
14065 /* The INSN with the smaller offset goes first. */
14065 off_val = (int)(INTVAL (offset));
14066 if (off_val >= 0)
14067 tmp -= (off_val & 0xfffff);
14068 else
14069 tmp += ((- off_val) & 0xfffff);
14071 *pri = tmp;
14072 return;
14075 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14076 Adjust priority of sha1h instructions so they are scheduled before
14077 other SHA1 instructions. */
14079 static int
14080 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14082 rtx x = PATTERN (insn);
14084 if (GET_CODE (x) == SET)
14086 x = SET_SRC (x);
14088 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14089 return priority + 10;
14092 return priority;
14095 /* Given OPERANDS of consecutive load/store instructions, check if we
14096 can merge them into an ldp/stp. LOAD is true if they are load
14097 instructions. MODE is the mode of the memory operands. */
14099 bool
14100 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14101 enum machine_mode mode)
14103 HOST_WIDE_INT offval_1, offval_2, msize;
14104 enum reg_class rclass_1, rclass_2;
14105 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14107 if (load)
14109 mem_1 = operands[1];
14110 mem_2 = operands[3];
14111 reg_1 = operands[0];
14112 reg_2 = operands[2];
14113 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14114 if (REGNO (reg_1) == REGNO (reg_2))
14115 return false;
14117 else
14119 mem_1 = operands[0];
14120 mem_2 = operands[2];
14121 reg_1 = operands[1];
14122 reg_2 = operands[3];
14125 /* The mems cannot be volatile. */
14126 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14127 return false;
14129 /* If we have SImode and a slow unaligned ldp,
14130 check that the alignment is at least 8 bytes. */
14131 if (mode == SImode
14132 && (aarch64_tune_params.extra_tuning_flags
14133 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14134 && !optimize_size
14135 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14136 return false;
14138 /* Check if the addresses are in the form of [base+offset]. */
14139 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14140 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14141 return false;
14142 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14143 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14144 return false;
14146 /* Check if the bases are the same. */
14147 if (!rtx_equal_p (base_1, base_2))
14148 return false;
14150 offval_1 = INTVAL (offset_1);
14151 offval_2 = INTVAL (offset_2);
14152 msize = GET_MODE_SIZE (mode);
14153 /* Check if the offsets are consecutive. */
14154 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14155 return false;
14157 /* Check if the addresses are clobbered by the load. */
14158 if (load)
14160 if (reg_mentioned_p (reg_1, mem_1))
14161 return false;
14163 /* In increasing order, the last load can clobber the address. */
14164 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14165 return false;
14168 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14169 rclass_1 = FP_REGS;
14170 else
14171 rclass_1 = GENERAL_REGS;
14173 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14174 rclass_2 = FP_REGS;
14175 else
14176 rclass_2 = GENERAL_REGS;
14178 /* Check if the registers are of the same class. */
14179 if (rclass_1 != rclass_2)
14180 return false;
14182 return true;
14185 /* Given OPERANDS of consecutive load/store instructions, check if we
14186 can merge them into an ldp/stp by adjusting the offset. LOAD is true
14187 if they are load instructions. MODE is the mode of the memory operands.
14189 For example, given the following consecutive stores:
14191 str w1, [xb, 0x100]
14192 str w1, [xb, 0x104]
14193 str w1, [xb, 0x108]
14194 str w1, [xb, 0x10c]
14196 Though the offsets are out of the range supported by stp, we can
14197 still pair them after adjusting the offset, like:
14199 add scratch, xb, 0x100
14200 stp w1, w1, [scratch]
14201 stp w1, w1, [scratch, 0x8]
14203 The peephole patterns detecting this opportunity should guarantee that
14204 the scratch register is available. */
14206 bool
14207 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14208 enum machine_mode mode)
14210 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14211 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14212 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14213 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14215 if (load)
14217 reg_1 = operands[0];
14218 mem_1 = operands[1];
14219 reg_2 = operands[2];
14220 mem_2 = operands[3];
14221 reg_3 = operands[4];
14222 mem_3 = operands[5];
14223 reg_4 = operands[6];
14224 mem_4 = operands[7];
14225 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14226 && REG_P (reg_3) && REG_P (reg_4));
14227 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14228 return false;
14230 else
14232 mem_1 = operands[0];
14233 reg_1 = operands[1];
14234 mem_2 = operands[2];
14235 reg_2 = operands[3];
14236 mem_3 = operands[4];
14237 reg_3 = operands[5];
14238 mem_4 = operands[6];
14239 reg_4 = operands[7];
14241 /* Skip if the memory operand is by itself valid for ldp/stp. */
14242 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14243 return false;
14245 /* The mems cannot be volatile. */
14246 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14247 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14248 return false;
14250 /* Check if the addresses are in the form of [base+offset]. */
14251 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14252 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14253 return false;
14254 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14255 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14256 return false;
14257 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14258 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14259 return false;
14260 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14261 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14262 return false;
14264 /* Check if the bases are the same. */
14265 if (!rtx_equal_p (base_1, base_2)
14266 || !rtx_equal_p (base_2, base_3)
14267 || !rtx_equal_p (base_3, base_4))
14268 return false;
14270 offval_1 = INTVAL (offset_1);
14271 offval_2 = INTVAL (offset_2);
14272 offval_3 = INTVAL (offset_3);
14273 offval_4 = INTVAL (offset_4);
14274 msize = GET_MODE_SIZE (mode);
14275 /* Check if the offsets are consecutive. */
14276 if ((offval_1 != (offval_2 + msize)
14277 || offval_1 != (offval_3 + msize * 2)
14278 || offval_1 != (offval_4 + msize * 3))
14279 && (offval_4 != (offval_3 + msize)
14280 || offval_4 != (offval_2 + msize * 2)
14281 || offval_4 != (offval_1 + msize * 3)))
14282 return false;
14284 /* Check if the addresses are clobbered by the load. */
14285 if (load)
14287 if (reg_mentioned_p (reg_1, mem_1)
14288 || reg_mentioned_p (reg_2, mem_2)
14289 || reg_mentioned_p (reg_3, mem_3))
14290 return false;
14292 /* In increasing order, the last load can clobber the address. */
14293 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14294 return false;
14297 /* If we have SImode and a slow unaligned ldp,
14298 check that the alignment is at least 8 bytes. */
14299 if (mode == SImode
14300 && (aarch64_tune_params.extra_tuning_flags
14301 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14302 && !optimize_size
14303 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14304 return false;
14306 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14307 rclass_1 = FP_REGS;
14308 else
14309 rclass_1 = GENERAL_REGS;
14311 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14312 rclass_2 = FP_REGS;
14313 else
14314 rclass_2 = GENERAL_REGS;
14316 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14317 rclass_3 = FP_REGS;
14318 else
14319 rclass_3 = GENERAL_REGS;
14321 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14322 rclass_4 = FP_REGS;
14323 else
14324 rclass_4 = GENERAL_REGS;
14326 /* Check if the registers are of the same class. */
14327 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14328 return false;
14330 return true;
14333 /* Given OPERANDS of consecutive load/store instructions, this function
14334 pairs them into ldp/stp after adjusting the offset. It relies on the
14335 fact that the addresses of the load/store instructions are in increasing
14336 order. MODE is the mode of the memory operands. CODE is the rtl
14337 operator that should be applied to all memory operands; it is
14338 SIGN_EXTEND, ZERO_EXTEND or UNKNOWN. */
14340 bool
14341 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14342 enum machine_mode mode, RTX_CODE code)
14344 rtx base, offset, t1, t2;
14345 rtx mem_1, mem_2, mem_3, mem_4;
14346 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14348 if (load)
14350 mem_1 = operands[1];
14351 mem_2 = operands[3];
14352 mem_3 = operands[5];
14353 mem_4 = operands[7];
14355 else
14357 mem_1 = operands[0];
14358 mem_2 = operands[2];
14359 mem_3 = operands[4];
14360 mem_4 = operands[6];
14361 gcc_assert (code == UNKNOWN);
14364 extract_base_offset_in_addr (mem_1, &base, &offset);
14365 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14367 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14368 msize = GET_MODE_SIZE (mode);
14369 stp_off_limit = msize * 0x40;
14370 off_val = INTVAL (offset);
14371 abs_off = (off_val < 0) ? -off_val : off_val;
14372 new_off = abs_off % stp_off_limit;
14373 adj_off = abs_off - new_off;
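/* Worked example (illustrative): for SImode, msize is 4 and stp_off_limit
   is 0x100, so an original offset of 0x104 splits into adj_off == 0x100
   and new_off == 0x4, matching the str/stp example in the function
   comment above.  */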
14375 /* Further adjust to make sure all offsets are OK. */
14376 if ((new_off + msize * 2) >= stp_off_limit)
14378 adj_off += stp_off_limit;
14379 new_off -= stp_off_limit;
14382 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14383 if (adj_off >= 0x1000)
14384 return false;
14386 if (off_val < 0)
14388 adj_off = -adj_off;
14389 new_off = -new_off;
14392 /* Create new memory references. */
14393 mem_1 = change_address (mem_1, VOIDmode,
14394 plus_constant (DImode, operands[8], new_off));
14396 /* Check if the adjusted address is OK for ldp/stp. */
14397 if (!aarch64_mem_pair_operand (mem_1, mode))
14398 return false;
14400 msize = GET_MODE_SIZE (mode);
14401 mem_2 = change_address (mem_2, VOIDmode,
14402 plus_constant (DImode,
14403 operands[8],
14404 new_off + msize));
14405 mem_3 = change_address (mem_3, VOIDmode,
14406 plus_constant (DImode,
14407 operands[8],
14408 new_off + msize * 2));
14409 mem_4 = change_address (mem_4, VOIDmode,
14410 plus_constant (DImode,
14411 operands[8],
14412 new_off + msize * 3));
14414 if (code == ZERO_EXTEND)
14416 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14417 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14418 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14419 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14421 else if (code == SIGN_EXTEND)
14423 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14424 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14425 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14426 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14429 if (load)
14431 operands[1] = mem_1;
14432 operands[3] = mem_2;
14433 operands[5] = mem_3;
14434 operands[7] = mem_4;
14436 else
14438 operands[0] = mem_1;
14439 operands[2] = mem_2;
14440 operands[4] = mem_3;
14441 operands[6] = mem_4;
14444 /* Emit adjusting instruction. */
14445 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14446 /* Emit ldp/stp instructions. */
14447 t1 = gen_rtx_SET (operands[0], operands[1]);
14448 t2 = gen_rtx_SET (operands[2], operands[3]);
14449 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14450 t1 = gen_rtx_SET (operands[4], operands[5]);
14451 t2 = gen_rtx_SET (operands[6], operands[7]);
14452 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14453 return true;
14456 /* Return true if a pseudo register should be created and used to hold
14457 the GOT address for PIC code. */
14459 bool
14460 aarch64_use_pseudo_pic_reg (void)
14462 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14465 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14467 static int
14468 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14470 switch (XINT (x, 1))
14472 case UNSPEC_GOTSMALLPIC:
14473 case UNSPEC_GOTSMALLPIC28K:
14474 case UNSPEC_GOTTINYPIC:
14475 return 0;
14476 default:
14477 break;
14480 return default_unspec_may_trap_p (x, flags);
14484 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
14485 return the log2 of that value. Otherwise return -1. */
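/* For example (illustrative): 1.0 gives 0 and 4.0 gives 2, while -2.0,
   0.75 and 3.0 all give -1.  */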
14488 int aarch64_fpconst_pow_of_2 (rtx x)
14490 const REAL_VALUE_TYPE *r;
14492 if (!CONST_DOUBLE_P (x))
14493 return -1;
14495 r = CONST_DOUBLE_REAL_VALUE (x);
14497 if (REAL_VALUE_NEGATIVE (*r)
14498 || REAL_VALUE_ISNAN (*r)
14499 || REAL_VALUE_ISINF (*r)
14500 || !real_isinteger (r, DFmode))
14501 return -1;
14503 return exact_log2 (real_to_integer (r));
14506 /* If X is a vector of equal CONST_DOUBLE values and that value is
14507 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14510 int aarch64_vec_fpconst_pow_of_2 (rtx x)
14512 if (GET_CODE (x) != CONST_VECTOR)
14513 return -1;
14515 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14516 return -1;
14518 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14519 if (firstval <= 0)
14520 return -1;
14522 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14523 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14524 return -1;
14526 return firstval;
14529 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14530 to float.
14532 __fp16 always promotes through this hook.
14533 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14534 through the generic excess precision logic rather than here. */
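/* For example (illustrative): given __fp16 a, b; the sum a + b is
   evaluated on float operands, because this hook promotes __fp16 to
   float.  */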
14536 static tree
14537 aarch64_promoted_type (const_tree t)
14539 if (SCALAR_FLOAT_TYPE_P (t)
14540 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14541 return float_type_node;
14543 return NULL_TREE;
14546 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14548 static bool
14549 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14550 optimization_type opt_type)
14552 switch (op)
14554 case rsqrt_optab:
14555 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14557 default:
14558 return true;
14562 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
14563 if MODE is HFmode, and punt to the generic implementation otherwise. */
14565 static bool
14566 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14568 return (mode == HFmode
14569 ? true
14570 : default_libgcc_floating_mode_supported_p (mode));
14573 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14574 if MODE is HFmode, and punt to the generic implementation otherwise. */
14576 static bool
14577 aarch64_scalar_mode_supported_p (machine_mode mode)
14579 return (mode == HFmode
14580 ? true
14581 : default_scalar_mode_supported_p (mode));
14584 /* Set the value of FLT_EVAL_METHOD.
14585 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14587 0: evaluate all operations and constants, whose semantic type has at
14588 most the range and precision of type float, to the range and
14589 precision of float; evaluate all other operations and constants to
14590 the range and precision of the semantic type;
14592 N, where _FloatN is a supported interchange floating type
14593 evaluate all operations and constants, whose semantic type has at
14594 most the range and precision of _FloatN type, to the range and
14595 precision of the _FloatN type; evaluate all other operations and
14596 constants to the range and precision of the semantic type;
14598 If we have the ARMv8.2-A extensions then we support _Float16 in native
14599 precision, so we should set this to 16. Otherwise, we support the type,
14600 but want to evaluate expressions in float precision, so set this to
14601 0. */
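/* Illustrative example (assuming -march=armv8.2-a+fp16, which enables
   TARGET_FP_F16INST):
     _Float16 a, b, c;
     c = a + b;
   is evaluated directly in _Float16 when FLT_EVAL_METHOD is 16; without
   the FP16 instructions the addition is instead evaluated in float
   (FLT_EVAL_METHOD 0) and the result converted back to _Float16.  */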
14603 static enum flt_eval_method
14604 aarch64_excess_precision (enum excess_precision_type type)
14606 switch (type)
14608 case EXCESS_PRECISION_TYPE_FAST:
14609 case EXCESS_PRECISION_TYPE_STANDARD:
14610 /* We can calculate either in 16-bit range and precision or
14611 32-bit range and precision. Make that decision based on whether
14612 we have native support for the ARMv8.2-A 16-bit floating-point
14613 instructions or not. */
14614 return (TARGET_FP_F16INST
14615 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14616 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14617 case EXCESS_PRECISION_TYPE_IMPLICIT:
14618 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14619 default:
14620 gcc_unreachable ();
14622 return FLT_EVAL_METHOD_UNPREDICTABLE;
14625 /* Target-specific selftests. */
14627 #if CHECKING_P
14629 namespace selftest {
14631 /* Selftest for the RTL loader.
14632 Verify that the RTL loader copes with a dump from
14633 print_rtx_function. This is essentially just a test that class
14634 function_reader can handle a real dump, but it also verifies
14635 that lookup_reg_by_dump_name correctly handles hard regs.
14636 The presence of hard reg names in the dump means that the test is
14637 target-specific, hence it is in this file. */
14639 static void
14640 aarch64_test_loading_full_dump ()
14642 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14644 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14646 rtx_insn *insn_1 = get_insn_by_uid (1);
14647 ASSERT_EQ (NOTE, GET_CODE (insn_1));
14649 rtx_insn *insn_15 = get_insn_by_uid (15);
14650 ASSERT_EQ (INSN, GET_CODE (insn_15));
14651 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14653 /* Verify crtl->return_rtx. */
14654 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14655 ASSERT_EQ (0, REGNO (crtl->return_rtx));
14656 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14659 /* Run all target-specific selftests. */
14661 static void
14662 aarch64_run_selftests (void)
14664 aarch64_test_loading_full_dump ();
14667 } // namespace selftest
14669 #endif /* #if CHECKING_P */
14671 #undef TARGET_ADDRESS_COST
14672 #define TARGET_ADDRESS_COST aarch64_address_cost
14674 /* This hook determines whether unnamed bitfields affect the alignment
14675 of the containing structure. The hook returns true if the structure
14676 should inherit the alignment requirements of an unnamed bitfield's
14677 type. */
14678 #undef TARGET_ALIGN_ANON_BITFIELD
14679 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14681 #undef TARGET_ASM_ALIGNED_DI_OP
14682 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14684 #undef TARGET_ASM_ALIGNED_HI_OP
14685 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14687 #undef TARGET_ASM_ALIGNED_SI_OP
14688 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14690 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14691 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14692 hook_bool_const_tree_hwi_hwi_const_tree_true
14694 #undef TARGET_ASM_FILE_START
14695 #define TARGET_ASM_FILE_START aarch64_start_file
14697 #undef TARGET_ASM_OUTPUT_MI_THUNK
14698 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14700 #undef TARGET_ASM_SELECT_RTX_SECTION
14701 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14703 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14704 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14706 #undef TARGET_BUILD_BUILTIN_VA_LIST
14707 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14709 #undef TARGET_CALLEE_COPIES
14710 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14712 #undef TARGET_CAN_ELIMINATE
14713 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14715 #undef TARGET_CAN_INLINE_P
14716 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14718 #undef TARGET_CANNOT_FORCE_CONST_MEM
14719 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14721 #undef TARGET_CASE_VALUES_THRESHOLD
14722 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14724 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14725 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14727 /* Only the least significant bit is used for initialization guard
14728 variables. */
14729 #undef TARGET_CXX_GUARD_MASK_BIT
14730 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14732 #undef TARGET_C_MODE_FOR_SUFFIX
14733 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14735 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14736 #undef TARGET_DEFAULT_TARGET_FLAGS
14737 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14738 #endif
14740 #undef TARGET_CLASS_MAX_NREGS
14741 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14743 #undef TARGET_BUILTIN_DECL
14744 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14746 #undef TARGET_BUILTIN_RECIPROCAL
14747 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14749 #undef TARGET_C_EXCESS_PRECISION
14750 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14752 #undef TARGET_EXPAND_BUILTIN
14753 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14755 #undef TARGET_EXPAND_BUILTIN_VA_START
14756 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14758 #undef TARGET_FOLD_BUILTIN
14759 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14761 #undef TARGET_FUNCTION_ARG
14762 #define TARGET_FUNCTION_ARG aarch64_function_arg
14764 #undef TARGET_FUNCTION_ARG_ADVANCE
14765 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14767 #undef TARGET_FUNCTION_ARG_BOUNDARY
14768 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14770 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14771 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14773 #undef TARGET_FUNCTION_VALUE
14774 #define TARGET_FUNCTION_VALUE aarch64_function_value
14776 #undef TARGET_FUNCTION_VALUE_REGNO_P
14777 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14779 #undef TARGET_FRAME_POINTER_REQUIRED
14780 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14782 #undef TARGET_GIMPLE_FOLD_BUILTIN
14783 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14785 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14786 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14788 #undef TARGET_INIT_BUILTINS
14789 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14791 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14792 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14793 aarch64_ira_change_pseudo_allocno_class
14795 #undef TARGET_LEGITIMATE_ADDRESS_P
14796 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14798 #undef TARGET_LEGITIMATE_CONSTANT_P
14799 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14801 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14802 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14803 aarch64_legitimize_address_displacement
14805 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14806 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14808 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14809 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14810 aarch64_libgcc_floating_mode_supported_p
14812 #undef TARGET_MANGLE_TYPE
14813 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14815 #undef TARGET_MEMORY_MOVE_COST
14816 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14818 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14819 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14821 #undef TARGET_MUST_PASS_IN_STACK
14822 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14824 /* This target hook should return true if accesses to volatile bitfields
14825 should use the narrowest mode possible. It should return false if these
14826 accesses should use the bitfield container type. */
14827 #undef TARGET_NARROW_VOLATILE_BITFIELD
14828 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14830 #undef TARGET_OPTION_OVERRIDE
14831 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14833 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14834 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14835 aarch64_override_options_after_change
14837 #undef TARGET_OPTION_SAVE
14838 #define TARGET_OPTION_SAVE aarch64_option_save
14840 #undef TARGET_OPTION_RESTORE
14841 #define TARGET_OPTION_RESTORE aarch64_option_restore
14843 #undef TARGET_OPTION_PRINT
14844 #define TARGET_OPTION_PRINT aarch64_option_print
14846 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14847 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14849 #undef TARGET_SET_CURRENT_FUNCTION
14850 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14852 #undef TARGET_PASS_BY_REFERENCE
14853 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14855 #undef TARGET_PREFERRED_RELOAD_CLASS
14856 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14858 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14859 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14861 #undef TARGET_PROMOTED_TYPE
14862 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14864 #undef TARGET_SECONDARY_RELOAD
14865 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14867 #undef TARGET_SHIFT_TRUNCATION_MASK
14868 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14870 #undef TARGET_SETUP_INCOMING_VARARGS
14871 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14873 #undef TARGET_STRUCT_VALUE_RTX
14874 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14876 #undef TARGET_REGISTER_MOVE_COST
14877 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14879 #undef TARGET_RETURN_IN_MEMORY
14880 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14882 #undef TARGET_RETURN_IN_MSB
14883 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14885 #undef TARGET_RTX_COSTS
14886 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14888 #undef TARGET_SCALAR_MODE_SUPPORTED_P
14889 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
14891 #undef TARGET_SCHED_ISSUE_RATE
14892 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14894 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14895 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14896 aarch64_sched_first_cycle_multipass_dfa_lookahead
14898 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14899 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14900 aarch64_first_cycle_multipass_dfa_lookahead_guard
14902 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
14903 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
14904 aarch64_get_separate_components
14906 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
14907 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
14908 aarch64_components_for_bb
14910 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
14911 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
14912 aarch64_disqualify_components
14914 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
14915 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
14916 aarch64_emit_prologue_components
14918 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
14919 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
14920 aarch64_emit_epilogue_components
14922 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
14923 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
14924 aarch64_set_handled_components
14926 #undef TARGET_TRAMPOLINE_INIT
14927 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14929 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14930 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14932 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14933 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14935 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
14936 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
14937 aarch64_builtin_support_vector_misalignment
14939 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14940 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14942 #undef TARGET_VECTORIZE_ADD_STMT_COST
14943 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14945 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14946 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14947 aarch64_builtin_vectorization_cost
14949 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14950 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14952 #undef TARGET_VECTORIZE_BUILTINS
14953 #define TARGET_VECTORIZE_BUILTINS
14955 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14956 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14957 aarch64_builtin_vectorized_function
14959 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14960 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14961 aarch64_autovectorize_vector_sizes
14963 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14964 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14965 aarch64_atomic_assign_expand_fenv
14967 /* Section anchor support. */
14969 #undef TARGET_MIN_ANCHOR_OFFSET
14970 #define TARGET_MIN_ANCHOR_OFFSET -256
14972 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14973 byte offset; we can do much more for larger data types, but have no way
14974 to determine the size of the access. We assume accesses are aligned. */
14975 #undef TARGET_MAX_ANCHOR_OFFSET
14976 #define TARGET_MAX_ANCHOR_OFFSET 4095
14978 #undef TARGET_VECTOR_ALIGNMENT
14979 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14981 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14982 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14983 aarch64_simd_vector_alignment_reachable
14985 /* vec_perm support. */
14987 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14988 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14989 aarch64_vectorize_vec_perm_const_ok
14991 #undef TARGET_INIT_LIBFUNCS
14992 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14994 #undef TARGET_FIXED_CONDITION_CODE_REGS
14995 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14997 #undef TARGET_FLAGS_REGNUM
14998 #define TARGET_FLAGS_REGNUM CC_REGNUM
15000 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15001 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15003 #undef TARGET_ASAN_SHADOW_OFFSET
15004 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15006 #undef TARGET_LEGITIMIZE_ADDRESS
15007 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15009 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15010 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15011 aarch64_use_by_pieces_infrastructure_p
15013 #undef TARGET_CAN_USE_DOLOOP_P
15014 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15016 #undef TARGET_SCHED_ADJUST_PRIORITY
15017 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15019 #undef TARGET_SCHED_MACRO_FUSION_P
15020 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15022 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15023 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15025 #undef TARGET_SCHED_FUSION_PRIORITY
15026 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15028 #undef TARGET_UNSPEC_MAY_TRAP_P
15029 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15031 #undef TARGET_USE_PSEUDO_PIC_REG
15032 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15034 #undef TARGET_PRINT_OPERAND
15035 #define TARGET_PRINT_OPERAND aarch64_print_operand
15037 #undef TARGET_PRINT_OPERAND_ADDRESS
15038 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15040 #undef TARGET_OPTAB_SUPPORTED_P
15041 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15043 #undef TARGET_OMIT_STRUCT_RETURN_REG
15044 #define TARGET_OMIT_STRUCT_RETURN_REG true
15046 /* The architecture reserves bits 0 and 1, so use bit 2 (i.e. the value 4) for descriptors. */
15047 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15048 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15050 #if CHECKING_P
15051 #undef TARGET_RUN_TARGET_SELFTESTS
15052 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15053 #endif /* #if CHECKING_P */
15055 struct gcc_target targetm = TARGET_INITIALIZER;
15057 #include "gt-aarch64.h"